// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28

/* Stack structure */

#define STACK_W_SIZE        (32 * 2 * 3)

#define STACK_W             (0)
#define STACK_SIZE          (STACK_W + STACK_W_SIZE)

/* Register macros */

#define RSTATE x0
#define RDATA  x1
#define RNBLKS x2
#define RKPTR  x28
#define RFRAME x29

#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10

#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17

#define k_even w19
#define k_odd w20

#define addr0 x21
#define addr1 x22

#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26

#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5

#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20

/* Helper macros. */

#define _(...) /*_*/

#define clear_vec(x) \
	movi	x.8h, #0;

#define rolw(o, a, n) \
	ror	o, a, #(32 - n);
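
/*
 * '_' expands to nothing and is passed where a round needs no K_LOAD or no
 * interleaved operation.  clear_vec() zeroes a vector register (used to wipe
 * message material before returning), and rolw(o, a, n) emits a left-rotate
 * by n as a right-rotate by (32 - n).
 */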

/* Round function macros. */

#define GG1_1(x, y, z, o, t) \
	eor	o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor	o, o, z;
#define GG1_3(x, y, z, o, t)

#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)

#define GG2_1(x, y, z, o, t) \
	bic	o, z, x;
#define GG2_2(x, y, z, o, t) \
	and	t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor	o, o, t;

#define FF2_1(x, y, z, o, t) \
	eor	o, x, y;
#define FF2_2(x, y, z, o, t) \
	and	t, x, y; \
	and	o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor	o, o, t;
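
/*
 * The macros above build the SM3 boolean functions in up to three steps so
 * that their instructions can be interleaved with the rest of the round:
 *   FF1(x,y,z) = GG1(x,y,z) = x ^ y ^ z              (rounds 0-15)
 *   FF2(x,y,z) = (x & y) | (x & z) | (y & z)         (rounds 16-63)
 *   GG2(x,y,z) = (x & y) | (~x & z)                  (rounds 16-63)
 * GG2 is computed as (z & ~x) ^ (y & x), where the two terms are disjoint so
 * the XOR acts as an OR; FF2 is computed as ((x ^ y) & z) ^ (x & y), a
 * standard rewrite of the majority function.
 */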

#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round);                                                        \
	ldr	t5, [sp, #(wtype##_W1_ADDR(round, widx))];                    \
	rolw(t0, a, 12);                              /* rol(a, 12) => t0 */  \
      IOP(1, iop_param);                                                      \
	FF##i##_1(a, b, c, t1, t2);                                           \
	ldr	t6, [sp, #(wtype##_W1W2_ADDR(round, widx))];                  \
	add	k, k, e;                                                      \
      IOP(2, iop_param);                                                      \
	GG##i##_1(e, f, g, t3, t4);                                           \
	FF##i##_2(a, b, c, t1, t2);                                           \
      IOP(3, iop_param);                                                      \
	add	k, k, t0;                                                     \
	add	h, h, t5;                                                     \
	add	d, d, t6;                     /* w1w2 + d => d */             \
      IOP(4, iop_param);                                                      \
	rolw(k, k, 7);                        /* rol(t0 + e + t, 7) => k */   \
	GG##i##_2(e, f, g, t3, t4);                                           \
	add	h, h, k;                      /* h + w1 + k => h */           \
      IOP(5, iop_param);                                                      \
	FF##i##_3(a, b, c, t1, t2);                                           \
	eor	t0, t0, k;                    /* k ^ t0 => t0 */              \
	GG##i##_3(e, f, g, t3, t4);                                           \
	add	d, d, t1;                     /* FF(a,b,c) + d => d */        \
      IOP(6, iop_param);                                                      \
	add	t3, t3, h;                    /* GG(e,f,g) + h => t3 */       \
	rolw(b, b, 9);                        /* rol(b, 9) => b */            \
	eor	h, t3, t3, ror #(32-9);                                       \
      IOP(7, iop_param);                                                      \
	add	d, d, t0;                     /* t0 + d => d */               \
	rolw(f, f, 19);                       /* rol(f, 19) => f */           \
      IOP(8, iop_param);                                                      \
	eor	h, h, t3, ror #(32-17);       /* P0(t3) => h */
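
/*
 * One SM3 round, with the IOP() slots used to interleave message scheduling
 * or block loading on in-order cores:
 *   t0 = a <<< 12
 *   k  = SS1 = (t0 + e + T_j) <<< 7         (T_j is loaded by K_LOAD/KL)
 *   t0 = SS2 = SS1 ^ (a <<< 12)
 *   d  = TT1 = FF_j(a, b, c) + d + (W ^ W') + SS2
 *   t3 = TT2 = GG_j(e, f, g) + h + W + SS1
 *   h  = P0(TT2) = TT2 ^ (TT2 <<< 9) ^ (TT2 <<< 17)
 *   b  = b <<< 9,  f = f <<< 19
 * The rotation of the state words between rounds is done by renaming
 * registers at the R1()/R2() call sites rather than with moves.
 */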

#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define KL(round) \
	ldp	k_even, k_odd, [RKPTR, #(4*(round))];

/* Input expansion macros. */

/* Byte-swapped input address. */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))

/* Expanded input address. */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))

/* Rounds 1-12, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)

/* Rounds 13-64, expanded input block addresses. */
#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)
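
/*
 * The STACK_W area is three 64-byte slots.  The first twelve rounds read
 * byte-swapped message words from offset 32 and W[i] ^ W[i+4] from offset 48
 * of slot ((round) / 4).  Later rounds read expanded words from offset 0 and
 * their W ^ W' values from offset 16, alternating between the first two
 * slots once per group of three rounds (the ((((round) / 3) - 4) % 2) term).
 */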

/* Input block loading.
 * Interleaving within round function needed for in-order CPUs. */
#define LOAD_W_VEC_1_1() \
	add	addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add	addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1	{W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1	{W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1	{W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1	{W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32	XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32	XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32	XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32	XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor	XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor	XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1	{XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1	{XTMP4.16b}, [addr0]; \
	add	addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor	XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext	W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov	W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1	{XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1	{XTMP5.16b}, [addr1]; \
	ext	W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext	W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext	W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1	{XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1	{XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext	W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */

#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()
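
/*
 * Taken together, the LOAD_W_VEC_* steps fetch one 64-byte block, byte-swap
 * it with rev32, store the words and W[i] ^ W[i+4] values needed by the
 * first twelve rounds into the IW_* stack slots, and leave the message
 * words distributed across W0-W5 for the SCHED_W_* message expansion.
 */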

/* Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs. */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */            \
	/* Load (w[i - 13]) => XTMP5 */            \
	ext	XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */                       \
	/* W3 ^ XTMP0 => XTMP0 */                  \
	eor	XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */                       \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */         \
	/* rol(XTMP5, 7) => XTMP1 */               \
	add	addr0, sp, #XW_W1_ADDR((round), 0); \
	shl	XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor	XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == W4 */                       \
	/* W4 ^ XTMP1 => XTMP1 */                  \
	eor	XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => W0 */              \
	shl	XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 3]) => XTMP2 */             \
	ext	XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */                     \
	eor	XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1	{XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
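
/*
 * Each SCHED_W_* group computes three new message words with the SM3
 * expansion
 *   W[i] = P1(W[i-16] ^ W[i-9] ^ (W[i-3] <<< 15)) ^ (W[i-13] <<< 7) ^ W[i-6],
 *   P1(x) = x ^ (x <<< 15) ^ (x <<< 23),
 * building the rotates from shl + sri pairs.  The words for the three rounds
 * being prepared, together with their W ^ W[i+4] companions (which need the
 * freshly expanded words), are stored to the stack slot selected by
 * XW_W1_ADDR() in SCHED_W_3_7().
 */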

#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)


	/*
	 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
	 *
	 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
	 *                         int blocks)
	 */
	.text
.align 3
SYM_TYPED_FUNC_START(sm3_neon_transform)
	ldp		ra, rb, [RSTATE, #0]
	ldp		rc, rd, [RSTATE, #8]
	ldp		re, rf, [RSTATE, #16]
	ldp		rg, rh, [RSTATE, #24]

	stp		x28, x29, [sp, #-16]!
	stp		x19, x20, [sp, #-16]!
	stp		x21, x22, [sp, #-16]!
	stp		x23, x24, [sp, #-16]!
	stp		x25, x26, [sp, #-16]!
	mov		RFRAME, sp

	sub		addr0, sp, #STACK_SIZE
	adr_l		RKPTR, .LKtable
	and		sp, addr0, #(~63)
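	/* The callee-saved registers used in this function (x19-x26, x28,
	 * x29) were spilled above; RFRAME (x29) keeps the post-spill stack
	 * pointer so it can be restored directly, while sp itself is rounded
	 * down to a 64-byte boundary below the STACK_SIZE scratch area that
	 * holds the expanded message words. */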

	/* Preload first block. */
	LOAD_W_VEC_1(1, 0)
	LOAD_W_VEC_1(2, 0)
	LOAD_W_VEC_1(3, 0)
	LOAD_W_VEC_1(4, 0)
	LOAD_W_VEC_1(5, 0)
	LOAD_W_VEC_1(6, 0)
	LOAD_W_VEC_1(7, 0)
	LOAD_W_VEC_1(8, 0)
	LOAD_W_VEC_2(1, 0)
	LOAD_W_VEC_2(2, 0)
	LOAD_W_VEC_2(3, 0)
	LOAD_W_VEC_2(4, 0)
	LOAD_W_VEC_2(5, 0)
	LOAD_W_VEC_2(6, 0)
	LOAD_W_VEC_2(7, 0)
	LOAD_W_VEC_2(8, 0)
	LOAD_W_VEC_3(1, 0)
	LOAD_W_VEC_3(2, 0)
	LOAD_W_VEC_3(3, 0)
	LOAD_W_VEC_3(4, 0)
	LOAD_W_VEC_3(5, 0)
	LOAD_W_VEC_3(6, 0)
	LOAD_W_VEC_3(7, 0)
	LOAD_W_VEC_3(8, 0)

.balign 16
.Loop:
	/* Transform 0-3 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)

	/* Transform 4-7 + Precalc 12-14 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)

	/* Transform 8-11 + Precalc 12-17 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)

	/* Transform 12-14 + Precalc 18-20 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)

	/* Transform 15-17 + Precalc 21-23 */
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)

	/* Transform 18-20 + Precalc 24-26 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)

	/* Transform 21-23 + Precalc 27-29 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)

	/* Transform 24-26 + Precalc 30-32 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)

	/* Transform 27-29 + Precalc 33-35 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)

	/* Transform 30-32 + Precalc 36-38 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)

	/* Transform 33-35 + Precalc 39-41 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)

	/* Transform 36-38 + Precalc 42-44 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)

	/* Transform 39-41 + Precalc 45-47 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)

	/* Transform 42-44 + Precalc 48-50 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)

	/* Transform 45-47 + Precalc 51-53 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)

	/* Transform 48-50 + Precalc 54-56 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)

	/* Transform 51-53 + Precalc 57-59 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)

	/* Transform 54-56 + Precalc 60-62 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)

	/* Transform 57-59 + Precalc 63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)

	/* Transform 60 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
	subs		RNBLKS, RNBLKS, #1
	b.eq		.Lend

	/* Transform 61-63 + Preload next block */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
	ldp		s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
	ldp		s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)

	/* Update the chaining variables. */
	eor		ra, ra, s0
	eor		rb, rb, s1
	ldp		s0, s1, [RSTATE, #16]
	eor		rc, rc, s2
	ldp		k_even, k_odd, [RSTATE, #24]
	eor		rd, rd, s3
	eor		re, re, s0
	stp		ra, rb, [RSTATE, #0]
	eor		rf, rf, s1
	stp		rc, rd, [RSTATE, #8]
	eor		rg, rg, k_even
	stp		re, rf, [RSTATE, #16]
	eor		rh, rh, k_odd
	stp		rg, rh, [RSTATE, #24]
	b		.Loop

.Lend:
	/* Transform 61-63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
	ldp		s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
	ldp		s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)

	/* Update the chaining variables. */
	eor		ra, ra, s0
	clear_vec(W0)
	eor		rb, rb, s1
	clear_vec(W1)
	ldp		s0, s1, [RSTATE, #16]
	clear_vec(W2)
	eor		rc, rc, s2
	clear_vec(W3)
	ldp		k_even, k_odd, [RSTATE, #24]
	clear_vec(W4)
	eor		rd, rd, s3
	clear_vec(W5)
	eor		re, re, s0
	clear_vec(XTMP0)
	stp		ra, rb, [RSTATE, #0]
	clear_vec(XTMP1)
	eor		rf, rf, s1
	clear_vec(XTMP2)
	stp		rc, rd, [RSTATE, #8]
	clear_vec(XTMP3)
	eor		rg, rg, k_even
	clear_vec(XTMP4)
	stp		re, rf, [RSTATE, #16]
	clear_vec(XTMP5)
	eor		rh, rh, k_odd
	clear_vec(XTMP6)
	stp		rg, rh, [RSTATE, #24]

	/* Clear message expansion area */
	add		addr0, sp, #STACK_W
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0]

	mov		sp, RFRAME

	ldp		x25, x26, [sp], #16
	ldp		x23, x24, [sp], #16
	ldp		x21, x22, [sp], #16
	ldp		x19, x20, [sp], #16
	ldp		x28, x29, [sp], #16

	ret
SYM_FUNC_END(sm3_neon_transform)


	.section	".rodata", "a"

	.align 4
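/*
 * Round constants T_j <<< (j % 32), with T_j = 0x79cc4519 for rounds 0-15
 * and 0x7a879d8a for rounds 16-63, precomputed so each round can simply
 * load its rotated constant with KL() instead of rotating at run time.
 */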
.LKtable:
	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5