// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

1162306a36Sopenharmony_ci#include <linux/linkage.h>
1262306a36Sopenharmony_ci#include <linux/cfi_types.h>
1362306a36Sopenharmony_ci#include <asm/assembler.h>
1462306a36Sopenharmony_ci
/*
 * Context structure.
 *
 * Byte offsets of the eight 32-bit state words h0..h7, relative to the
 * state pointer passed in RSTATE.
 */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28

/*
 * Stack structure.
 *
 * The scratch area for message words is STACK_W_SIZE = 192 bytes, i.e.
 * three 64-byte slots.  IW_W_ADDR() and XW_W_ADDR() compute the offsets
 * into it; they are used relative to sp.
 */

#define STACK_W_SIZE        (32 * 2 * 3)

#define STACK_W             (0)
#define STACK_SIZE          (STACK_W + STACK_W_SIZE)

/*
 * Register macros.
 *
 * x19-x26, x28 and x29 are pushed by the function prologue before they are
 * used through the aliases below.
 */

/* Function arguments: state pointer, input pointer, block count. */
#define RSTATE x0
#define RDATA  x1
#define RNBLKS x2
/* Base address of the round constant table read by KL(). */
#define RKPTR  x28
/*
 * Copy of sp taken after the register saves, before sp is lowered and
 * aligned for the scratch area.
 */
#define RFRAME x29

/* The eight 32-bit working variables of the compression function. */
#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10

/* Scalar temporaries used inside the round macro. */
#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17

/* Round constant pair loaded by KL(); consumed (overwritten) by R(). */
#define k_even w19
#define k_odd w20

/* Store addresses used by the load and schedule macros. */
#define addr0 x21
#define addr1 x22

/* Previous state words, reloaded for the final XOR into the state. */
#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26

/* Message schedule vectors; three message words are kept per vector. */
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5

/* NEON temporaries. */
#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20

/* Helper macros. */

/*
 * No-op taking any arguments.  Passed to R() as K_LOAD or IOP when the
 * corresponding slot should emit nothing.
 */
#define _(...) /*_*/

/* Set all bits of vector register x to zero. */
#define clear_vec(x) \
	movi	x.8h, #0;

/*
 * o = a rotated left by n bits (32-bit registers), expressed as a rotate
 * right by 32 - n.  n must be in 1..31 so that the ROR immediate stays in
 * its valid range 0..31.
 */
#define rolw(o, a, n) \
	ror	o, a, #(32 - (n));

/*
 * Round function macros.
 *
 * Each boolean function is split into three steps (_1, _2, _3) so that R()
 * can interleave them with other instructions.  x, y, z are the inputs, o
 * receives the result once step _3 has run, t is a scratch register.
 *
 *   FF1, GG1:  o = x ^ y ^ z
 *   GG2:       o = (z & ~x) ^ (y & x)
 *              (the two terms never share a set bit, so this equals
 *               (x & y) | (~x & z))
 *   FF2:       o = ((x ^ y) & z) ^ (x & y)
 *              (the bitwise majority of x, y, z)
 */

#define GG1_1(x, y, z, o, t) \
	eor	o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor	o, o, z;
#define GG1_3(x, y, z, o, t)

/* FF1 is the same function as GG1, finished one step later. */
#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)

#define GG2_1(x, y, z, o, t) \
	bic	o, z, x;
#define GG2_2(x, y, z, o, t) \
	and	t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor	o, o, t;

#define FF2_1(x, y, z, o, t) \
	eor	o, x, y;
#define FF2_2(x, y, z, o, t) \
	and	t, x, y; \
	and	o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor	o, o, t;

/*
 * R - one compression round, with eight slots (IOP) in which the caller can
 * interleave unrelated instructions.
 *
 *   i          1 or 2: selects the FF<i>_* / GG<i>_* boolean functions
 *   a..h       registers holding the working variables for this round
 *   k          register holding the round constant (k_even or k_odd)
 *   K_LOAD     invoked as K_LOAD(round) at the start of the round: KL to
 *              load a constant pair, _ to emit nothing
 *   round      round index; selects the constant and the stack slot
 *   widx       index of the message word within its stack slot
 *   wtype      IW or XW: selects <wtype>_W1_ADDR / <wtype>_W1W2_ADDR, the
 *              sp-relative offsets the two message words are loaded from
 *   IOP        invoked as IOP(n, iop_param) for n = 1..8; _ to emit nothing
 *
 * With w1 the word at the W1 address, w1w2 the word at the W1W2 address and
 * ss1 = rol(rol(a, 12) + e + k, 7), the round leaves:
 *
 *   d = FF(a, b, c) + d + (rol(a, 12) ^ ss1) + w1w2
 *   h = P0(GG(e, f, g) + h + ss1 + w1),  P0(x) = x ^ rol(x, 9) ^ rol(x, 17)
 *   b = rol(b, 9)
 *   f = rol(f, 19)
 *
 * a, c, e and g are left unchanged.  k is overwritten and t0-t6 may be
 * clobbered.
 */
#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round);                                                        \
	ldr	t5, [sp, #(wtype##_W1_ADDR(round, widx))];                    \
	rolw(t0, a, 12);                              /* rol(a, 12) => t0 */  \
      IOP(1, iop_param);                                                      \
	FF##i##_1(a, b, c, t1, t2);                                           \
	ldr	t6, [sp, #(wtype##_W1W2_ADDR(round, widx))];                  \
	add	k, k, e;                                                      \
      IOP(2, iop_param);                                                      \
	GG##i##_1(e, f, g, t3, t4);                                           \
	FF##i##_2(a, b, c, t1, t2);                                           \
      IOP(3, iop_param);                                                      \
	add	k, k, t0;                                                     \
	add	h, h, t5;                                                     \
	add	d, d, t6;                     /* w1w2 + d => d */             \
      IOP(4, iop_param);                                                      \
	rolw(k, k, 7);                        /* rol(t0 + e + k, 7) => k */   \
	GG##i##_2(e, f, g, t3, t4);                                           \
	add	h, h, k;                      /* h + w1 + k => h */           \
      IOP(5, iop_param);                                                      \
	FF##i##_3(a, b, c, t1, t2);                                           \
	eor	t0, t0, k;                    /* k ^ t0 => t0 */              \
	GG##i##_3(e, f, g, t3, t4);                                           \
	add	d, d, t1;                     /* FF(a,b,c) + d => d */        \
      IOP(6, iop_param);                                                      \
	add	t3, t3, h;                    /* GG(e,f,g) + h => t3 */       \
	rolw(b, b, 9);                        /* rol(b, 9) => b */            \
	eor	h, t3, t3, ror #(32-9);                                       \
      IOP(7, iop_param);                                                      \
	add	d, d, t0;                     /* t0 + d => d */               \
	rolw(f, f, 19);                       /* rol(f, 19) => f */           \
      IOP(8, iop_param);                                                      \
	eor	h, h, t3, ror #(32-17);       /* P0(t3) => h */

/*
 * R1 / R2 - the round macro R() with the boolean function set fixed to 1 or
 * 2.  All other arguments are passed through unchanged.  The callers in this
 * file use R1 for rounds 0-15 and R2 from round 16 on.
 */
#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

/*
 * KL - load the two consecutive 32-bit words at word index 'round' and
 * 'round' + 1 of the table at RKPTR into k_even and k_odd.
 */
#define KL(round) \
	ldp	k_even, k_odd, [RKPTR, #(4*(round))];

/*
 * Input expansion macros.
 *
 * All of the macros below evaluate to a byte offset that is used relative
 * to sp.  'widx' selects a 32-bit word, 'offs' a 16-byte half-row within
 * the 64-byte slot.
 */

/*
 * Byte-swapped input address.  Four rounds share one 64-byte slot: round
 * indices 0-3 use slot 0, 4-7 slot 1 and 8-11 slot 2.
 */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))

/*
 * Expanded input address.  Three rounds share one slot, and successive
 * groups of three rounds alternate between slot 0 and slot 1, starting
 * with slot 0 for round indices 12-14.
 */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))

/* Rounds 1-12, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)

/* Rounds 13-64, expanded input block addresses. */
#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)

/*
 * Input block loading.
 * Interleaving within round function needed for in-order CPUs.
 *
 * LOAD_W_VEC_<g>_<n>(), g = 1..3, n = 1..8, are 24 steps which together:
 *   - load 64 bytes from RDATA (post-incremented) into W0..W3,
 *   - byte-reverse every 32-bit word into XTMP0..XTMP3,
 *   - store XTMP0, XTMP1 and XTMP2 at sp + IW_W1_ADDR(0, 0), (4, 0) and
 *     (8, 0), each followed by its XOR with the next vector (XTMP4, XTMP5,
 *     XTMP6),
 *   - set up W0..W5 with the lane contents given in the per-line comments.
 *
 * Clobbers addr0, addr1, W0-W5 and XTMP0-XTMP6.
 */
#define LOAD_W_VEC_1_1() \
	add	addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add	addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1	{W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1	{W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1	{W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1	{W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32	XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32	XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32	XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32	XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor	XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor	XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1	{XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1	{XTMP4.16b}, [addr0]; \
	add	addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor	XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext	W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov	W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1	{XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1	{XTMP5.16b}, [addr1]; \
	ext	W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext	W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext	W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1	{XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1	{XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext	W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */

/*
 * Dispatchers with the IOP(n, iop_param) call shape expected by R(): step
 * 'iop_num' of group 1, 2 or 3.  Any further arguments are ignored.
 */
#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()

/*
 * Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs.
 *
 * SCHED_W_<g>_<n>(round, w0, w1, w2, w3, w4, w5), g = 1..3, n = 1..8, are
 * the 24 steps that expand new message words.  w0..w5 name the six
 * schedule vectors in their current rotation; w0 is overwritten with the
 * result:
 *
 *   x  = (words taken from w0 and w1) ^ w3 ^ rol(w5, 15)
 *   w0 = x ^ rol(x, 15) ^ rol(x, 23)
 *          ^ rol(words taken from w1 and w2, 7) ^ w4
 *
 * The last steps store 32 bytes at sp + XW_W1_ADDR(round, 0): XTMP2
 * (assembled from w4 and w5) followed by XTMP3 = XTMP2 ^ w0.
 *
 * Clobbers addr0 and XTMP0-XTMP5.
 */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */            \
	/* Load (w[i - 13]) => XTMP5 */            \
	ext	XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */                       \
	/* w3 ^ XTMP0 => XTMP0 */                  \
	eor	XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */                       \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */         \
	/* rol(XTMP5, 7) => XTMP1 */               \
	add	addr0, sp, #XW_W1_ADDR((round), 0); \
	shl	XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor	XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == w4 */                       \
	/* w4 ^ XTMP1 => XTMP1 */                  \
	eor	XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => w0 */              \
	shl	XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 3]) => XTMP2 */             \
	ext	XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */                     \
	eor	XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1	{XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)

/*
 * Schedule dispatchers with the IOP(n, iop_param) call shape expected by
 * R().  SCHED_W_<order>_<g>(iop_num, round) expands to step 'iop_num' of
 * group <g> (1..3), with the six schedule vectors passed in the rotation
 * spelled out by <order>.  The first vector named in <order> is the one
 * that receives the new words.
 */
#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)


	/*
	 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
	 *
	 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
	 *                         int blocks)
	 */
35362306a36Sopenharmony_ci	.text
35462306a36Sopenharmony_ci.align 3
35562306a36Sopenharmony_ciSYM_TYPED_FUNC_START(sm3_neon_transform)
35662306a36Sopenharmony_ci	ldp		ra, rb, [RSTATE, #0]
35762306a36Sopenharmony_ci	ldp		rc, rd, [RSTATE, #8]
35862306a36Sopenharmony_ci	ldp		re, rf, [RSTATE, #16]
35962306a36Sopenharmony_ci	ldp		rg, rh, [RSTATE, #24]
36062306a36Sopenharmony_ci
36162306a36Sopenharmony_ci	stp		x28, x29, [sp, #-16]!
36262306a36Sopenharmony_ci	stp		x19, x20, [sp, #-16]!
36362306a36Sopenharmony_ci	stp		x21, x22, [sp, #-16]!
36462306a36Sopenharmony_ci	stp		x23, x24, [sp, #-16]!
36562306a36Sopenharmony_ci	stp		x25, x26, [sp, #-16]!
36662306a36Sopenharmony_ci	mov		RFRAME, sp
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci	sub		addr0, sp, #STACK_SIZE
36962306a36Sopenharmony_ci	adr_l		RKPTR, .LKtable
37062306a36Sopenharmony_ci	and		sp, addr0, #(~63)
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	/* Preload first block. */
37362306a36Sopenharmony_ci	LOAD_W_VEC_1(1, 0)
37462306a36Sopenharmony_ci	LOAD_W_VEC_1(2, 0)
37562306a36Sopenharmony_ci	LOAD_W_VEC_1(3, 0)
37662306a36Sopenharmony_ci	LOAD_W_VEC_1(4, 0)
37762306a36Sopenharmony_ci	LOAD_W_VEC_1(5, 0)
37862306a36Sopenharmony_ci	LOAD_W_VEC_1(6, 0)
37962306a36Sopenharmony_ci	LOAD_W_VEC_1(7, 0)
38062306a36Sopenharmony_ci	LOAD_W_VEC_1(8, 0)
38162306a36Sopenharmony_ci	LOAD_W_VEC_2(1, 0)
38262306a36Sopenharmony_ci	LOAD_W_VEC_2(2, 0)
38362306a36Sopenharmony_ci	LOAD_W_VEC_2(3, 0)
38462306a36Sopenharmony_ci	LOAD_W_VEC_2(4, 0)
38562306a36Sopenharmony_ci	LOAD_W_VEC_2(5, 0)
38662306a36Sopenharmony_ci	LOAD_W_VEC_2(6, 0)
38762306a36Sopenharmony_ci	LOAD_W_VEC_2(7, 0)
38862306a36Sopenharmony_ci	LOAD_W_VEC_2(8, 0)
38962306a36Sopenharmony_ci	LOAD_W_VEC_3(1, 0)
39062306a36Sopenharmony_ci	LOAD_W_VEC_3(2, 0)
39162306a36Sopenharmony_ci	LOAD_W_VEC_3(3, 0)
39262306a36Sopenharmony_ci	LOAD_W_VEC_3(4, 0)
39362306a36Sopenharmony_ci	LOAD_W_VEC_3(5, 0)
39462306a36Sopenharmony_ci	LOAD_W_VEC_3(6, 0)
39562306a36Sopenharmony_ci	LOAD_W_VEC_3(7, 0)
39662306a36Sopenharmony_ci	LOAD_W_VEC_3(8, 0)
39762306a36Sopenharmony_ci
39862306a36Sopenharmony_ci.balign 16
39962306a36Sopenharmony_ci.Loop:
40062306a36Sopenharmony_ci	/* Transform 0-3 */
40162306a36Sopenharmony_ci	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
40262306a36Sopenharmony_ci	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
40362306a36Sopenharmony_ci	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
40462306a36Sopenharmony_ci	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)
40562306a36Sopenharmony_ci
40662306a36Sopenharmony_ci	/* Transform 4-7 + Precalc 12-14 */
40762306a36Sopenharmony_ci	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
40862306a36Sopenharmony_ci	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
40962306a36Sopenharmony_ci	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
41062306a36Sopenharmony_ci	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)
41162306a36Sopenharmony_ci
41262306a36Sopenharmony_ci	/* Transform 8-11 + Precalc 12-17 */
41362306a36Sopenharmony_ci	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
41462306a36Sopenharmony_ci	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
41562306a36Sopenharmony_ci	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
41662306a36Sopenharmony_ci	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)
41762306a36Sopenharmony_ci
41862306a36Sopenharmony_ci	/* Transform 12-14 + Precalc 18-20 */
41962306a36Sopenharmony_ci	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
42062306a36Sopenharmony_ci	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
42162306a36Sopenharmony_ci	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)
42262306a36Sopenharmony_ci
42362306a36Sopenharmony_ci	/* Transform 15-17 + Precalc 21-23 */
42462306a36Sopenharmony_ci	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
42562306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
42662306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)
42762306a36Sopenharmony_ci
42862306a36Sopenharmony_ci	/* Transform 18-20 + Precalc 24-26 */
42962306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
43062306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
43162306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)
43262306a36Sopenharmony_ci
43362306a36Sopenharmony_ci	/* Transform 21-23 + Precalc 27-29 */
43462306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
43562306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
43662306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)
43762306a36Sopenharmony_ci
43862306a36Sopenharmony_ci	/* Transform 24-26 + Precalc 30-32 */
43962306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
44062306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
44162306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci	/* Transform 27-29 + Precalc 33-35 */
44462306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
44562306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
44662306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)
44762306a36Sopenharmony_ci
44862306a36Sopenharmony_ci	/* Transform 30-32 + Precalc 36-38 */
44962306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
45062306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
45162306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)
45262306a36Sopenharmony_ci
45362306a36Sopenharmony_ci	/* Transform 33-35 + Precalc 39-41 */
45462306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
45562306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
45662306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)
45762306a36Sopenharmony_ci
45862306a36Sopenharmony_ci	/* Transform 36-38 + Precalc 42-44 */
45962306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
46062306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
46162306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)
46262306a36Sopenharmony_ci
46362306a36Sopenharmony_ci	/* Transform 39-41 + Precalc 45-47 */
46462306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
46562306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
46662306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)
46762306a36Sopenharmony_ci
46862306a36Sopenharmony_ci	/* Transform 42-44 + Precalc 48-50 */
46962306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
47062306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
47162306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci	/* Transform 45-47 + Precalc 51-53 */
47462306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
47562306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
47662306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)
47762306a36Sopenharmony_ci
47862306a36Sopenharmony_ci	/* Transform 48-50 + Precalc 54-56 */
47962306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
48062306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
48162306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_ci	/* Transform 51-53 + Precalc 57-59 */
48462306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
48562306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
48662306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)
48762306a36Sopenharmony_ci
48862306a36Sopenharmony_ci	/* Transform 54-56 + Precalc 60-62 */
48962306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
49062306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
49162306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)
49262306a36Sopenharmony_ci
49362306a36Sopenharmony_ci	/* Transform 57-59 + Precalc 63 */
49462306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
49562306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
49662306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)
49762306a36Sopenharmony_ci
49862306a36Sopenharmony_ci	/* Transform 60 */
49962306a36Sopenharmony_ci	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
50062306a36Sopenharmony_ci	subs		RNBLKS, RNBLKS, #1
50162306a36Sopenharmony_ci	b.eq		.Lend
50262306a36Sopenharmony_ci
50362306a36Sopenharmony_ci	/* Transform 61-63 + Preload next block */
50462306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
50562306a36Sopenharmony_ci	ldp		s0, s1, [RSTATE, #0]
50662306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
50762306a36Sopenharmony_ci	ldp		s2, s3, [RSTATE, #8]
50862306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)
50962306a36Sopenharmony_ci
51062306a36Sopenharmony_ci	/* Update the chaining variables. */
51162306a36Sopenharmony_ci	eor		ra, ra, s0
51262306a36Sopenharmony_ci	eor		rb, rb, s1
51362306a36Sopenharmony_ci	ldp		s0, s1, [RSTATE, #16]
51462306a36Sopenharmony_ci	eor		rc, rc, s2
51562306a36Sopenharmony_ci	ldp		k_even, k_odd, [RSTATE, #24]
51662306a36Sopenharmony_ci	eor		rd, rd, s3
51762306a36Sopenharmony_ci	eor		re, re, s0
51862306a36Sopenharmony_ci	stp		ra, rb, [RSTATE, #0]
51962306a36Sopenharmony_ci	eor		rf, rf, s1
52062306a36Sopenharmony_ci	stp		rc, rd, [RSTATE, #8]
52162306a36Sopenharmony_ci	eor		rg, rg, k_even
52262306a36Sopenharmony_ci	stp		re, rf, [RSTATE, #16]
52362306a36Sopenharmony_ci	eor		rh, rh, k_odd
52462306a36Sopenharmony_ci	stp		rg, rh, [RSTATE, #24]
52562306a36Sopenharmony_ci	b		.Loop
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_ci.Lend:
52862306a36Sopenharmony_ci	/* Transform 61-63 */
52962306a36Sopenharmony_ci	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
53062306a36Sopenharmony_ci	ldp		s0, s1, [RSTATE, #0]
53162306a36Sopenharmony_ci	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
53262306a36Sopenharmony_ci	ldp		s2, s3, [RSTATE, #8]
53362306a36Sopenharmony_ci	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)
53462306a36Sopenharmony_ci
53562306a36Sopenharmony_ci	/* Update the chaining variables. */
53662306a36Sopenharmony_ci	eor		ra, ra, s0
53762306a36Sopenharmony_ci	clear_vec(W0)
53862306a36Sopenharmony_ci	eor		rb, rb, s1
53962306a36Sopenharmony_ci	clear_vec(W1)
54062306a36Sopenharmony_ci	ldp		s0, s1, [RSTATE, #16]
54162306a36Sopenharmony_ci	clear_vec(W2)
54262306a36Sopenharmony_ci	eor		rc, rc, s2
54362306a36Sopenharmony_ci	clear_vec(W3)
54462306a36Sopenharmony_ci	ldp		k_even, k_odd, [RSTATE, #24]
54562306a36Sopenharmony_ci	clear_vec(W4)
54662306a36Sopenharmony_ci	eor		rd, rd, s3
54762306a36Sopenharmony_ci	clear_vec(W5)
54862306a36Sopenharmony_ci	eor		re, re, s0
54962306a36Sopenharmony_ci	clear_vec(XTMP0)
55062306a36Sopenharmony_ci	stp		ra, rb, [RSTATE, #0]
55162306a36Sopenharmony_ci	clear_vec(XTMP1)
55262306a36Sopenharmony_ci	eor		rf, rf, s1
55362306a36Sopenharmony_ci	clear_vec(XTMP2)
55462306a36Sopenharmony_ci	stp		rc, rd, [RSTATE, #8]
55562306a36Sopenharmony_ci	clear_vec(XTMP3)
55662306a36Sopenharmony_ci	eor		rg, rg, k_even
55762306a36Sopenharmony_ci	clear_vec(XTMP4)
55862306a36Sopenharmony_ci	stp		re, rf, [RSTATE, #16]
55962306a36Sopenharmony_ci	clear_vec(XTMP5)
56062306a36Sopenharmony_ci	eor		rh, rh, k_odd
56162306a36Sopenharmony_ci	clear_vec(XTMP6)
56262306a36Sopenharmony_ci	stp		rg, rh, [RSTATE, #24]
56362306a36Sopenharmony_ci
56462306a36Sopenharmony_ci	/* Clear message expansion area */
56562306a36Sopenharmony_ci	add		addr0, sp, #STACK_W
56662306a36Sopenharmony_ci	st1		{W0.16b-W3.16b}, [addr0], #64
56762306a36Sopenharmony_ci	st1		{W0.16b-W3.16b}, [addr0], #64
56862306a36Sopenharmony_ci	st1		{W0.16b-W3.16b}, [addr0]
56962306a36Sopenharmony_ci
57062306a36Sopenharmony_ci	mov		sp, RFRAME
57162306a36Sopenharmony_ci
57262306a36Sopenharmony_ci	ldp		x25, x26, [sp], #16
57362306a36Sopenharmony_ci	ldp		x23, x24, [sp], #16
57462306a36Sopenharmony_ci	ldp		x21, x22, [sp], #16
57562306a36Sopenharmony_ci	ldp		x19, x20, [sp], #16
57662306a36Sopenharmony_ci	ldp		x28, x29, [sp], #16
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_ci	ret
57962306a36Sopenharmony_ciSYM_FUNC_END(sm3_neon_transform)
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci	.section	".rodata", "a"
58362306a36Sopenharmony_ci
/*
 * .LKtable - precomputed SM3 round constants, one 32-bit word per round.
 *
 * Layout: 64 words, emitted as 16 rows of four .long values, so row r
 * holds the constants for rounds 4*r .. 4*r+3.
 *
 * Values (checkable directly against the data below):
 *   - words  0..15: 0x79cc4519 rotated left by the word index (0..15 bits)
 *   - words 16..63: 0x7a879d8a rotated left by (word index mod 32) bits
 * Because the rotation amount wraps at 32, words 48..63 repeat words
 * 16..31 and the last four rows are identical to rows 4..7.
 * This corresponds to the T_j <<< (j mod 32) term of the SM3 round
 * function, stored pre-rotated.
 *
 * Placement: the "a" flag makes the section allocatable and, with no
 * "w" flag, not writable. With GNU as on AArch64, ".align 4" is a
 * power-of-two request, i.e. the table starts on a 16-byte boundary.
 *
 * NOTE(review): the code that reads this table (presumably through RKPTR
 * in the round macros) is outside this excerpt - confirm that it depends
 * on nothing beyond this word order before reordering or resizing.
 */
58462306a36Sopenharmony_ci	.align 4
58562306a36Sopenharmony_ci.LKtable:
58662306a36Sopenharmony_ci	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
58762306a36Sopenharmony_ci	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
58862306a36Sopenharmony_ci	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
58962306a36Sopenharmony_ci	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
59062306a36Sopenharmony_ci	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
59162306a36Sopenharmony_ci	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
59262306a36Sopenharmony_ci	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
59362306a36Sopenharmony_ci	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
59462306a36Sopenharmony_ci	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
59562306a36Sopenharmony_ci	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
59662306a36Sopenharmony_ci	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
59762306a36Sopenharmony_ci	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
59862306a36Sopenharmony_ci	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
59962306a36Sopenharmony_ci	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
60062306a36Sopenharmony_ci	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
60162306a36Sopenharmony_ci	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
602