/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
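
/*
 * Data layout: the eight NEON registers v0-v7 together hold eight AES
 * blocks (128 bytes) in bit-sliced form: after the 'bitslice' transform
 * below, each register holds one bit plane, i.e. the same single bit
 * position of all 128 state bytes. Every round operation is then
 * expressed as bitwise logic and byte permutations on whole registers,
 * with no table lookups or data-dependent branches, which is what makes
 * this implementation constant-time.
 */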

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	.text

	rounds		.req	x11
	bskey		.req	x12

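	/*
	 * Input/output basis changes for the S-box: these linear layers
	 * (from the Kaesper/Schwabe construction, which builds on
	 * Canright's compact S-box) convert between the AES polynomial
	 * basis of GF(2^8) and the tower-field basis of GF(((2^2)^2)^2)
	 * in which the inversion below is computed. The inv_* variants
	 * are the corresponding layers for the inverse S-box.
	 */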
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b2, \b2, \b1
	eor		\b5, \b5, \b6
	eor		\b3, \b3, \b0
	eor		\b6, \b6, \b2
	eor		\b5, \b5, \b0
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b4, \b4, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b1, \b1, \b5
	.endm

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	eor		\b4, \b4, \b6
	eor		\b2, \b2, \b0
	eor		\b6, \b6, \b1
	eor		\b1, \b1, \b5
	eor		\b5, \b5, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b2, \b2, \b5
	eor		\b4, \b4, \b7
	.endm

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor		\b1, \b1, \b7
	eor		\b4, \b4, \b7
	eor		\b7, \b7, \b5
	eor		\b1, \b1, \b3
	eor		\b2, \b2, \b5
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b1
	eor		\b2, \b2, \b0
	eor		\b5, \b5, \b3
	eor		\b4, \b4, \b6
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	.endm

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor		\b1, \b1, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b4, \b4, \b5
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b5, \b5, \b0
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b2
	eor		\b2, \b2, \b1
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b0
	eor		\b5, \b5, \b6
	.endm

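	/*
	 * Multiply two bit-sliced GF(2^2) elements (\x1,\x0) and
	 * (\y1,\y0), leaving the product in \x1,\x0 (\y0 and \y1 are
	 * preserved). Sharing the factor \t0 = (\y0 ^ \y1) & \x0 gets
	 * the job done with three ANDs instead of the naive four,
	 * Karatsuba style; ANDs are the expensive gates to minimise in
	 * a bit-sliced design.
	 */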
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	eor		\t0, \y0, \y1
	and		\t0, \t0, \x0
	eor		\x0, \x0, \x1
	and		\t1, \x1, \y0
	and		\x0, \x0, \y1
	eor		\x1, \t1, \t0
	eor		\x0, \x0, \t1
	.endm

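	/*
	 * As above, but two GF(2^2) multiplications fused into one
	 * sequence: (\x1,\x0) times (\y1,\y0) and (\x3,\x2) times
	 * (\y3,\y2). The first product comes out in a slightly different
	 * arrangement than mul_gf4 produces (hence the distinct macro),
	 * and interleaving the two independent dataflows gives the NEON
	 * pipeline more instruction-level parallelism.
	 */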
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor		\t0, \y0, \y1
	eor		\t1, \y2, \y3
	and		\t0, \t0, \x0
	and		\t1, \t1, \x2
	eor		\x0, \x0, \x1
	eor		\x2, \x2, \x3
	and		\x1, \x1, \y0
	and		\x3, \x3, \y2
	and		\x0, \x0, \y1
	and		\x2, \x2, \y3
	eor		\x1, \x1, \x0
	eor		\x2, \x2, \x3
	eor		\x0, \x0, \t0
	eor		\x3, \x3, \t1
	.endm

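	/*
	 * Two multiplications in GF(2^4), each decomposed Karatsuba-style
	 * into three of the GF(2^2) multiplications above: one mul_gf4
	 * plus one fused mul_gf4_n_gf4 per GF(2^4) product. The two
	 * multiplicands live in (\x0..\x3) and (\x4..\x7); \y0..\y3 hold
	 * the shared second operand and are restored by the time the
	 * macro exits.
	 */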
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	eor		\t0, \x0, \x2
	eor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor		\x0, \x0, \t0
	eor		\x2, \x2, \t0
	eor		\x1, \x1, \t1
	eor		\x3, \x3, \t1
	eor		\t0, \x4, \x6
	eor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	eor		\x4, \x4, \t0
	eor		\x6, \x6, \t0
	eor		\x5, \x5, \t1
	eor		\x7, \x7, \t1
	.endm

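	/*
	 * Inversion in GF(2^8) via the tower field GF(((2^2)^2)^2),
	 * following Canright's compact S-box as adapted for bit-slicing
	 * by Kaesper and Schwabe: roughly, the AND/OR/BSL/NOT network
	 * below computes the GF(2^4) 'norm' of the input and its GF(2^4)
	 * inverse in one fused, depth-optimised circuit, after which the
	 * two GF(2^4) multiplications of mul_gf16_2 recover the GF(2^8)
	 * inverse. Only bitwise instructions are used, so the whole
	 * S-box evaluation is constant-time.
	 */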
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	eor		\t3, \x4, \x6
	eor		\t0, \x5, \x7
	eor		\t1, \x1, \x3
	eor		\s1, \x7, \x6
	eor		\s0, \x0, \x2
	eor		\s3, \t3, \t0
	orr		\t2, \t0, \t1
	and		\s2, \t3, \s0
	orr		\t3, \t3, \s0
	eor		\s0, \s0, \t1
	and		\t0, \t0, \t1
	eor		\t1, \x3, \x2
	and		\s3, \s3, \s0
	and		\s1, \s1, \t1
	eor		\t1, \x4, \x5
	eor		\s0, \x1, \x0
	eor		\t3, \t3, \s1
	eor		\t2, \t2, \s1
	and		\s1, \t1, \s0
	orr		\t1, \t1, \s0
	eor		\t3, \t3, \s3
	eor		\t0, \t0, \s1
	eor		\t2, \t2, \s2
	eor		\t1, \t1, \s3
	eor		\t0, \t0, \s2
	and		\s0, \x7, \x3
	eor		\t1, \t1, \s2
	and		\s1, \x6, \x2
	and		\s2, \x5, \x1
	orr		\s3, \x4, \x0
	eor		\t3, \t3, \s0
	eor		\t1, \t1, \s2
	eor		\s0, \t0, \s3
	eor		\t2, \t2, \s1
	and		\s2, \t3, \t1
	eor		\s1, \t2, \s2
	eor		\s3, \s0, \s2
	bsl		\s1, \t1, \s0
	not		\t0, \s0
	bsl		\s0, \s1, \s3
	bsl		\t0, \s1, \s3
	bsl		\s3, \t3, \t2
	eor		\t3, \t3, \t2
	and		\s2, \s0, \s3
	eor		\t1, \t1, \t0
	eor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

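	/*
	 * The full AES S-box on eight bit-sliced registers: change into
	 * the tower-field basis, invert in GF(2^8), change back. Note
	 * that the affine constant 0x63 is not applied here; it is
	 * folded into the bit-sliced key schedule by aesbs_convert_key
	 * below (see the NOTs and the final XOR with 0x63 there), which
	 * is why out_bs_ch needs no constant XOR of its own.
	 */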
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

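	/*
	 * Fetch the next bit-sliced round key: eight 16-byte bit planes,
	 * i.e. 128 bytes per round. Encryption walks the converted
	 * schedule forwards (post-increment), decryption walks the same
	 * schedule backwards (pre-decrement), so one converted key
	 * serves both directions.
	 */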
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	.endm

	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	.endm

	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x0\().16b, \x0\().16b, v16.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x7\().16b, \x7\().16b, v23.16b
	.endm

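	/*
	 * ShiftRows is a pure byte permutation, so in the bit-sliced
	 * domain it is the same permutation applied to every bit plane:
	 * one tbl per register with a precomputed mask (SR/ISR, or the
	 * SRM0/ISRM0 variants that the callers load for the final round,
	 * which merge ShiftRows with the un-bitslicing byte reorder).
	 */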
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

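	/*
	 * MixColumns on bit-sliced state. Thanks to the M0 byte
	 * transpose applied on entry to bit-sliced form, rotating each
	 * column down by one row is a plain four-byte vector rotation
	 * (ext #12) and by two rows an eight-byte one (ext #8), applied
	 * identically to all bit planes. Doubling in GF(2^8) costs no
	 * multiplies at all: 2*a takes plane k-1 in place of plane k,
	 * with planes 0, 1, 3 and 4 corrected by plane 7 (the 0x1b
	 * reduction - hence the EORs with \x7 above the .ifb). \inv
	 * selects the alternative epilogue that inv_mix_cols needs.
	 */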
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor		\x2\().16b, \x2\().16b, \t2\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor		\x3\().16b, \x3\().16b, \t3\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor		\x4\().16b, \x4\().16b, \t4\().16b
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor		\x5\().16b, \x5\().16b, \t5\().16b
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor		\x6\().16b, \x6\().16b, \t6\().16b
	eor		\t1\().16b, \t1\().16b, \x0\().16b
	eor		\x7\().16b, \x7\().16b, \t7\().16b
	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x1\().16b
	eor		\t0\().16b, \t0\().16b, \x7\().16b
	eor		\t1\().16b, \t1\().16b, \x7\().16b
	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t5\().16b, \t5\().16b, \x4\().16b
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	eor		\t6\().16b, \t6\().16b, \x5\().16b
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x3\().16b
	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x6\().16b
	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x2\().16b
	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x7\().16b
	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x7\().16b
	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor		\x7\().16b, \t1\().16b, \t5\().16b
	.ifb		\inv
	eor		\x2\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t3\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor		\t3\().16b, \t3\().16b, \x4\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x2\().16b, \x3\().16b, \t6\().16b
	eor		\x3\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x6\().16b, \t2\().16b
	mov		\x6\().16b, \t3\().16b
	.endif
	.endm

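	/*
	 * InvMixColumns, using the standard factorisation: the inverse
	 * MixColumns matrix equals the forward one times {04}x^2 + {05},
	 * so the EXT/EOR preamble below applies that correction (rotate
	 * by two rows, shift the bit planes by two with 0x1b corrections
	 * from planes 6 and 7) and then simply reuses mix_cols with \inv
	 * set.
	 */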
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t0\().16b, \t0\().16b, \x0\().16b
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t6\().16b, \t6\().16b, \x6\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x7\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t1\().16b, \t1\().16b, \x1\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x2\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x3\().16b
	eor		\t4\().16b, \t4\().16b, \x4\().16b
	eor		\t5\().16b, \t5\().16b, \x5\().16b
	eor		\x0\().16b, \x0\().16b, \t6\().16b
	eor		\x1\().16b, \x1\().16b, \t6\().16b
	eor		\x2\().16b, \x2\().16b, \t0\().16b
	eor		\x4\().16b, \x4\().16b, \t2\().16b
	eor		\x3\().16b, \x3\().16b, \t1\().16b
	eor		\x1\().16b, \x1\().16b, \t7\().16b
	eor		\x2\().16b, \x2\().16b, \t7\().16b
	eor		\x4\().16b, \x4\().16b, \t6\().16b
	eor		\x5\().16b, \x5\().16b, \t3\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t7\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

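	/*
	 * The classic SWAPMOVE bit-permutation step, two instances at a
	 * time: exchange the bit groups selected by \mask between \a0
	 * and \b0 shifted right by \n (and likewise \a1/\b1). Each swap
	 * costs three EORs, one AND and two shifts; iterated with masks
	 * 0x55, 0x33 and 0x0f it transposes an 8x8 bit matrix, which is
	 * exactly what the bitslice macro below does.
	 */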
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr		\t0\().2d, \b0\().2d, #\n
	ushr		\t1\().2d, \b1\().2d, #\n
	eor		\t0\().16b, \t0\().16b, \a0\().16b
	eor		\t1\().16b, \t1\().16b, \a1\().16b
	and		\t0\().16b, \t0\().16b, \mask\().16b
	and		\t1\().16b, \t1\().16b, \mask\().16b
	eor		\a0\().16b, \a0\().16b, \t0\().16b
	shl		\t0\().2d, \t0\().2d, #\n
	eor		\a1\().16b, \a1\().16b, \t1\().16b
	shl		\t1\().2d, \t1\().2d, #\n
	eor		\b0\().16b, \b0\().16b, \t0\().16b
	eor		\b1\().16b, \b1\().16b, \t1\().16b
	.endm

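	/*
	 * Transpose the 8x8 bit matrix formed by the byte lanes of the
	 * eight input registers: three rounds of swapmove_2x exchange
	 * bit groups of width 1, 2 and 4. Afterwards each register holds
	 * a single bit plane of all 128 state bytes, in the same plane
	 * order that aesbs_convert_key stores the round keys. The
	 * transform is its own inverse, so the same macro converts back
	 * (as .Lenc_done and .Ldec_done rely on).
	 */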
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi		\t0\().16b, #0x55
	movi		\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi		\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm


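	/*
	 * Permutation vectors for tbl. M0 transposes the 4x4 byte matrix
	 * of each block on the way into bit-sliced form (so that the row
	 * rotations MixColumns needs become simple vector rotations); SR
	 * and ISR apply ShiftRows and its inverse; M0SR/M0ISR and
	 * SRM0/ISRM0 are those permutations composed with M0 for the
	 * first and last rounds respectively, so a single tbl applies
	 * both steps at once.
	 */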
	.align		6
M0:	.octa		0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
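	/*
	 * Convert an expanded AES key schedule into bit-sliced form:
	 * round 0 is stored as-is (it is applied before bitslicing),
	 * while rounds 1..rounds-1 are byte-swizzled with M0 and
	 * exploded into eight bit planes with cmtst. The NOTs on planes
	 * 0, 1, 5 and 6 and the final XOR with 0x63 fold the S-box
	 * affine constant (0x63 has exactly bits 0, 1, 5 and 6 set) into
	 * the schedule, so the per-round S-box evaluation needs no
	 * constant XOR.
	 */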
SYM_FUNC_START(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b,  #0x01			// bit masks
	movi		v9.16b,  #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	sub		x2, x2, #1
	str		q7, [x0], #16		// save round 0 key

.Lkey_loop:
	tbl		v7.16b, {v17.16b}, v16.16b
	ld1		{v17.4s}, [x1], #16		// load next round key

	cmtst		v0.16b, v7.16b, v8.16b
	cmtst		v1.16b, v7.16b, v9.16b
	cmtst		v2.16b, v7.16b, v10.16b
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b
	not		v1.16b, v1.16b
	not		v5.16b, v5.16b
	not		v6.16b, v6.16b

	subs		x2, x2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63			// S-box affine constant
	eor		v17.16b, v17.16b, v7.16b
	str		q17, [x0]
	ret
SYM_FUNC_END(aesbs_convert_key)

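	/*
	 * Encrypt eight blocks passed in v0-v7, with the bit-sliced key
	 * schedule at bskey and the round count in rounds (both are
	 * clobbered). The outputs come back in the permuted order v0,
	 * v1, v4, v6, v3, v7, v2, v5: the final un-bitslicing leaves the
	 * planes permuted, and the callers simply account for that in
	 * the order they consume the registers.
	 */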
	.align		4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
	ldr		q9, [bskey], #16		// round 0 key
	ldr		q8, M0SR
	ldr		q24, SR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Lenc_sbox

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Lenc_done

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	b.ne		.Lenc_loop
	ldr		q24, SRM0
	b		.Lenc_loop

.Lenc_done:
	ldr		q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_encrypt8)

	.align		4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
	lsl		x9, rounds, #7
	add		bskey, bskey, x9

	ldr		q9, [bskey, #-112]!		// round 0 key
	ldr		q8, M0ISR
	ldr		q24, ISR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Ldec_sbox

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Ldec_done

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	b.ne		.Ldec_loop
	ldr		q24, ISRM0
	b		.Ldec_loop
.Ldec_done:
	ldr		q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
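	/*
	 * The block count need not be a multiple of eight: x5 gets a
	 * single sentinel bit at position 'blocks', so the tbnz ladders
	 * below stop loading (and later storing) after exactly that many
	 * blocks. When eight or more blocks remain, x5 is zeroed and the
	 * full eight-block path is taken.
	 */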
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4

99:	mov		x5, #1
	lsl		x5, x5, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl
	csel		x5, x5, xzr, mi

	ld1		{v0.16b}, [x20], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x20], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x20], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x20], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x20], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x20], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x20], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x20], #16

0:	mov		bskey, x21
	mov		rounds, x22
	bl		\do8

	st1		{\o0\().16b}, [x19], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x19], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	b		99b

1:	frame_pop
	ret
	.endm

	.align		4
SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

	.align		4
SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
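	/*
	 * CBC decryption of up to eight blocks at a time: the ciphertext
	 * blocks are stashed in v25-v31 before aesbs_decrypt8 runs,
	 * since each one doubles as the IV of the block that follows it,
	 * and v24 carries the IV for the next iteration (or the final
	 * one to store back).
	 */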
	.align		4
SYM_FUNC_START(aesbs_cbc_decrypt)
	frame_push	6

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4
	mov		x24, x5

99:	mov		x6, #1
	lsl		x6, x6, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl
	csel		x6, x6, xzr, mi

	ld1		{v0.16b}, [x20], #16
	mov		v25.16b, v0.16b
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x20], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x20], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x20], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x20], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x20], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x20], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x20]

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x24]		// load IV

	eor		v1.16b, v1.16b, v25.16b
	eor		v6.16b, v6.16b, v26.16b
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	st1		{v0.16b}, [x19], #16
	mov		v24.16b, v25.16b
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x19], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x19], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x19], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x19], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x19], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x19], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x20], #16
	st1		{v5.16b}, [x19], #16
1:	st1		{v24.16b}, [x24]		// store IV

	cbz		x23, 2f
	b		99b

2:	frame_pop
	ret
SYM_FUNC_END(aesbs_cbc_decrypt)

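	/*
	 * Advance the XTS tweak: multiply \in by x in GF(2^128) modulo
	 * x^128 + x^7 + x^2 + x + 1. The add doubles both 64-bit lanes,
	 * sshr #63 turns each lane's lost top bit into an all-ones mask,
	 * and \const = { 0x1, 0x87 } (set up by __xts_crypt8) selects
	 * the cross-lane carry for the low-to-high case and the 0x87
	 * reduction for the wrap-around, with ext #8 routing each to the
	 * opposite lane before the final eor.
	 */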
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
SYM_FUNC_START_LOCAL(__xts_crypt8)
	movi		v18.2s, #0x1
	movi		v19.2s, #0x87
	uzp1		v18.4s, v18.4s, v19.4s

	ld1		{v0.16b-v3.16b}, [x1], #64
	ld1		{v4.16b-v7.16b}, [x1], #64

	next_tweak	v26, v25, v18, v19
	next_tweak	v27, v26, v18, v19
	next_tweak	v28, v27, v18, v19
	next_tweak	v29, v28, v18, v19
	next_tweak	v30, v29, v18, v19
	next_tweak	v31, v30, v18, v19
	next_tweak	v16, v31, v18, v19
	next_tweak	v17, v16, v18, v19

	eor		v0.16b, v0.16b, v25.16b
	eor		v1.16b, v1.16b, v26.16b
	eor		v2.16b, v2.16b, v27.16b
	eor		v3.16b, v3.16b, v28.16b
	eor		v4.16b, v4.16b, v29.16b
	eor		v5.16b, v5.16b, v30.16b
	eor		v6.16b, v6.16b, v31.16b
	eor		v7.16b, v7.16b, v16.16b

	stp		q16, q17, [x6]

	mov		bskey, x2
	mov		rounds, x3
	br		x16
SYM_FUNC_END(__xts_crypt8)

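	/*
	 * Common XTS glue: __xts_crypt8 loads eight blocks, derives the
	 * per-block tweaks from v25, XORs them in and tail-calls the
	 * cipher through x16 (the br above), so control returns here
	 * straight from aesbs_encrypt8/aesbs_decrypt8. The eighth and
	 * ninth tweaks are parked on the stack ([x6]) across the call:
	 * the ldp below recovers the eighth (for the last output block)
	 * into v24 and the ninth (the next iteration's starting tweak)
	 * into v25.
	 */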
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	0, 32
	add		x6, sp, #.Lframe_local_offset

	ld1		{v25.16b}, [x5]

0:	adr		x16, \do8
	bl		__xts_crypt8

	eor		v16.16b, \o0\().16b, v25.16b
	eor		v17.16b, \o1\().16b, v26.16b
	eor		v18.16b, \o2\().16b, v27.16b
	eor		v19.16b, \o3\().16b, v28.16b

	ldp		q24, q25, [x6]

	eor		v20.16b, \o4\().16b, v29.16b
	eor		v21.16b, \o5\().16b, v30.16b
	eor		v22.16b, \o6\().16b, v31.16b
	eor		v23.16b, \o7\().16b, v24.16b

	st1		{v16.16b-v19.16b}, [x0], #64
	st1		{v20.16b-v23.16b}, [x0], #64

	subs		x4, x4, #8
	b.gt		0b

	st1		{v25.16b}, [x5]
	frame_pop
	ret
	.endm

SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

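	/*
	 * Materialise the next counter block: the CTR block is a 128-bit
	 * big-endian integer, so aesbs_ctr_encrypt keeps it as native
	 * integers in x7 (high 64 bits) and x8 (low 64 bits) where a
	 * plain adds/adc can bump it, and rev64 byte-swaps each half
	 * back to big-endian order once both have been inserted into
	 * their lanes.
	 */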
	.macro		next_ctr, v
	mov		\v\().d[1], x8
	adds		x8, x8, #1
	mov		\v\().d[0], x7
	adc		x7, x7, xzr
	rev64		\v\().16b, \v\().16b
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
SYM_FUNC_START(aesbs_ctr_encrypt)
	frame_push	0
	ldp		x7, x8, [x5]
	ld1		{v0.16b}, [x5]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1
	adc		x7, x7, xzr

0:	next_ctr	v1
	next_ctr	v2
	next_ctr	v3
	next_ctr	v4
	next_ctr	v5
	next_ctr	v6
	next_ctr	v7

	mov		bskey, x2
	mov		rounds, x3
	bl		aesbs_encrypt8

	ld1		{ v8.16b-v11.16b}, [x1], #64
	ld1		{v12.16b-v15.16b}, [x1], #64

	eor		v8.16b, v0.16b, v8.16b
	eor		v9.16b, v1.16b, v9.16b
	eor		v10.16b, v4.16b, v10.16b
	eor		v11.16b, v6.16b, v11.16b
	eor		v12.16b, v3.16b, v12.16b
	eor		v13.16b, v7.16b, v13.16b
	eor		v14.16b, v2.16b, v14.16b
	eor		v15.16b, v5.16b, v15.16b

	st1		{ v8.16b-v11.16b}, [x0], #64
	st1		{v12.16b-v15.16b}, [x0], #64

	next_ctr	v0
	subs		x4, x4, #8
	b.gt		0b

	st1		{v0.16b}, [x5]
	frame_pop
	ret
SYM_FUNC_END(aesbs_ctr_encrypt)