/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi


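/*
 * Helpers for building the 8x8 GF(2) bit-matrices used by the GFNI
 * instructions: BV8() packs eight bits into one matrix row and BM8X8()
 * packs eight rows into a 64-bit matrix operand.
 */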
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )

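/*
 * Increment a 128-bit little-endian counter in x.  minus_one must hold
 * { low: -1, high: 0 }: subtracting it adds one to the low qword, and the
 * compare detects a wrap so the carry can be propagated into the high qword.
 */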
#define inc_le128(x, minus_one, tmp)			\
	vpcmpeqq minus_one, x, tmp;			\
	vpsubq minus_one, x, x;				\
	vpslldq $8, tmp, tmp;				\
	vpsubq tmp, x, x;

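/*
 * Apply an 8-bit affine transform as two 4-bit table lookups: the low
 * nibble of each byte indexes lo_t, the high nibble indexes hi_t, and the
 * two results are XORed.  mask4bit must hold 0x0f in every byte.
 */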
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;

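/* Transpose a 4x4 matrix of 32-bit words held in x0..x3 (t1, t2: temporaries). */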
#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu (0 * 16)(rio), x0;			\
	vmovdqu (1 * 16)(rio), x1;			\
	vmovdqu (2 * 16)(rio), x2;			\
	vmovdqu (3 * 16)(rio), x3;			\
	vmovdqu (4 * 16)(rio), x4;			\
	vmovdqu (5 * 16)(rio), x5;			\
	vmovdqu (6 * 16)(rio), x6;			\
	vmovdqu (7 * 16)(rio), x7;			\
	vmovdqu (8 * 16)(rio), y0;			\
	vmovdqu (9 * 16)(rio), y1;			\
	vmovdqu (10 * 16)(rio), y2;			\
	vmovdqu (11 * 16)(rio), y3;			\
	vmovdqu (12 * 16)(rio), y4;			\
	vmovdqu (13 * 16)(rio), y5;			\
	vmovdqu (14 * 16)(rio), y6;			\
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, 0 * 16(mem_ab);			\
	vmovdqu x1, 1 * 16(mem_ab);			\
	vmovdqu x2, 2 * 16(mem_ab);			\
	vmovdqu x3, 3 * 16(mem_ab);			\
	vmovdqu x4, 4 * 16(mem_ab);			\
	vmovdqu x5, 5 * 16(mem_ab);			\
	vmovdqu x6, 6 * 16(mem_ab);			\
	vmovdqu x7, 7 * 16(mem_ab);			\
	vmovdqu y0, 0 * 16(mem_cd);			\
	vmovdqu y1, 1 * 16(mem_cd);			\
	vmovdqu y2, 2 * 16(mem_cd);			\
	vmovdqu y3, 3 * 16(mem_cd);			\
	vmovdqu y4, 4 * 16(mem_cd);			\
	vmovdqu y5, 5 * 16(mem_cd);			\
	vmovdqu y6, 6 * 16(mem_cd);			\
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 16(mem);			\
	vmovdqu x1, 1 * 16(mem);			\
	vmovdqu x2, 2 * 16(mem);			\
	vmovdqu x3, 3 * 16(mem);			\
	vmovdqu x4, 4 * 16(mem);			\
	vmovdqu x5, 5 * 16(mem);			\
	vmovdqu x6, 6 * 16(mem);			\
	vmovdqu x7, 7 * 16(mem);			\
	vmovdqu y0, 8 * 16(mem);			\
	vmovdqu y1, 9 * 16(mem);			\
	vmovdqu y2, 10 * 16(mem);			\
	vmovdqu y3, 11 * 16(mem);			\
	vmovdqu y4, 12 * 16(mem);			\
	vmovdqu y5, 13 * 16(mem);			\
	vmovdqu y6, 14 * 16(mem);			\
	vmovdqu y7, 15 * 16(mem);			\

#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

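/*
 * AddRoundKey for eight byte-sliced registers: each 32-bit word of the
 * round key at rk + round * 16 + idx is split into its four bytes, each
 * byte is replicated across a register and XORed into the matching byte
 * slice.  t1 must be all-zero so that vpshufb performs the replication.
 */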
#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, t1, t2, rk,			\
		      idx, round)			\
	/* AddRoundKey */				\
	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x0, x0;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x1, x1;				\
	vpsrld $8, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x2, x2;				\
	vpshufb t1, t0, t2;				\
	vpxor t2, x3, x3;				\
	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x4, x4;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x5, x5;				\
	vpsrld $8, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x6, x6;				\
	vpshufb t1, t0, t2;				\
	vpxor t2, x7, x7;

#ifdef CONFIG_AS_GFNI
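/*
 * GFNI substitution layer: each of ARIA's S-boxes (S1, S2 and their
 * inverses) is computed with one vgf2p8affineqb/vgf2p8affineinvqb using
 * the bit-matrices and constants defined in .rodata below.
 */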
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */

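/*
 * AES-NI substitution layer: AESENCLAST/AESDECLAST supply the AES S-box
 * and its inverse (the ShiftRows step they perform is undone with vpshufb
 * and the .L*shift_row masks), and filter_8bit() applies the extra affine
 * transforms that turn them into ARIA's S2 and X2 boxes.  t7 must be
 * all-zero (it is used as a null AES round key).
 */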
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vmovdqa .Linv_shift_row(%rip), t0;		\
	vmovdqa .Lshift_row(%rip), t1;			\
	vbroadcastss .L0f0f0f0f(%rip), t6;		\
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;

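/*
 * Diffusion within each 32-bit word of the byte-sliced state: every
 * output byte slice becomes the XOR of three of the four input byte
 * slices, i.e. the rotr32-by-8/16 combination described below.
 */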
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x0, x3, t0;				\
	vpxor x1, x0, t1;				\
	vpxor x2, x1, t2;				\
	vpxor x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t2, x0, x0;				\
	vpxor x1, t3, t3;				\
	vpxor t0, x2, x2;				\
	vpxor t1, x3, x1;				\
	vmovdqu t3, x3;

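/*
 * Word-level part of ARIA's diffusion layer: XOR the four 32-bit words
 * T0..T3 of the state into each other.  The byte permutation that
 * completes the layer is applied by the callers, see the aria_diff_byte()
 * comments in aria_fe()/aria_fo().
 */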
#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxor y4, y0, y0;				\
	vpxor y5, y1, y1;				\
	vpxor y6, y2, y2;				\
	vpxor y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxor x4, x0, x0;				\
	vpxor x5, x1, x1;				\
	vpxor x6, x2, x2;				\
	vpxor x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxor x4, y4, y4;				\
	vpxor x5, y5, y5;				\
	vpxor x6, y6, y6;				\
	vpxor x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxor x0, y0, y0;				\
	vpxor x1, y1, y1;				\
	vpxor x2, y2, y2;				\
	vpxor x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;

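/*
 * ARIA even-round function for 16 byte-sliced blocks: AddRoundKey,
 * type-2 substitution layer (note the rotated register order passed to
 * aria_sbox_8way) and the diffusion layer.  The two halves of the state
 * are processed 8 registers at a time, with mem_tmp as spill space.
 */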
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

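/*
 * ARIA odd-round function: same structure as aria_fe() but with the
 * type-1 substitution layer (natural register order) and the matching
 * byte permutation in the diffusion step.
 */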
#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

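/*
 * ARIA final round: AddRoundKey, substitution layer, then a second
 * AddRoundKey with the last round key.  No diffusion layer is applied.
 */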
#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, last_round);	\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, last_round);	\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);

#ifdef CONFIG_AS_GFNI
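/*
 * GFNI variants of the round functions: identical in structure to
 * aria_fe()/aria_fo()/aria_ff() above, but using aria_sbox_8way_gfni()
 * for the substitution layer.
 */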
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,			\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, last_round);	\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, last_round);	\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);

#endif /* CONFIG_AS_GFNI */

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

#ifdef CONFIG_AS_GFNI
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	*      %r9: rk
	*      %rsi: dst
	*      %rdx: src
	*      %xmm0..%xmm15: 16 byte-sliced blocks
	*/

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)

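/*
 * Generate 16 CTR blocks from the big-endian IV at (%r8): blocks 0..7 are
 * stored to the keystream buffer and reloaded into %xmm0..%xmm7, blocks
 * 8..15 are left in %xmm8..%xmm15, and IV + 16 is written back to (%r8).
 */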
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	*      %rdi: ctx
	*      %rsi: dst
	*      %rdx: src
	*      %rcx: keystream
	*      %r8: iv (big endian, 128bit)
	*/

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	*      %rdi: ctx
	*      %rsi: dst
	*      %rdx: src
	*      %rcx: keystream
	*      %r8: iv (big endian, 128bit)
	*/
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	*      %r9: rk
	*      %rsi: dst
	*      %rdx: src
	*      %xmm0..%xmm15: 16 byte-sliced blocks
	*/

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	*      %rdi: ctx
	*      %rsi: dst
	*      %rdx: src
	*      %rcx: keystream
	*      %r8: iv (big endian, 128bit)
	*/
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
#endif /* CONFIG_AS_GFNI */