1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * ARIA Cipher 32-way parallel algorithm (AVX2)
4  *
5  * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6  *
7  */
8 
9 #include <linux/linkage.h>
10 #include <asm/frame.h>
11 #include <asm/asm-offsets.h>
12 #include <linux/cfi_types.h>
13 
/* register macros */
/* ARIA context pointer: first integer argument per the SysV AMD64 ABI */
#define CTX %rdi

/*
 * Map each ymmN name to its xmmN low half.  Macro bodies use "reg##_x"
 * token pasting to obtain the 128-bit view of a 256-bit register,
 * because the AES-NI instructions used below only accept xmm operands.
 */
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15
33 
/*
 * BV8 - pack eight bits (a0 = LSB .. a7 = MSB) into one byte value.
 * Used to spell out the rows of the GFNI affine-transform matrices.
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )

/*
 * BM8X8 - pack eight row bytes (l0 = first row) into the 64-bit
 * bit-matrix operand consumed by vgf2p8affineqb/vgf2p8affineinvqb;
 * row l0 lands in the most significant byte of the quadword.
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )
53 
/*
 * inc_le128 - increment a 128-bit little-endian counter in x with
 * qword arithmetic: detect a pending carry (low qword == all-ones)
 * before adding, then propagate it into the high qword.
 * NOTE(review): relies on minus_one holding -1 in the low qword only
 * (high qword zero), as prepared by the CTR caller — not visible in
 * this chunk; confirm against the CTR entry point.
 */
#define inc_le128(x, minus_one, tmp)			\
	vpcmpeqq minus_one, x, tmp;			\
	vpsubq minus_one, x, x;				\
	vpslldq $8, tmp, tmp;				\
	vpsubq tmp, x, x;

/*
 * filter_8bit - apply an 8-bit affine transform to every byte of x via
 * two 4-bit table lookups: lo_t indexed by the low nibble, hi_t by the
 * high nibble, results XORed together.  mask4bit must hold 0x0f in
 * every byte; tmp0 is clobbered.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	/* low nibbles already cleared, so the dword shift cannot	\
	 * leak bits across byte boundaries */		\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;
68 
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements held in
 * x0..x3 (one row per register, per 128-bit lane) using the standard
 * unpack-dword / unpack-qword sequence.  t1 and t2 are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;
81 
/*
 * byteslice_16x16b - transpose the 16x16 byte matrix spread over the
 * 16 input registers into byte-sliced form, so that byte position i of
 * every block ends up collected in register i.  The transpose needs
 * more than 16 registers, so two 32-byte memory slots (st0, st1) are
 * used as spill space; a0 additionally serves to hold the
 * .Lshufb_16x16b shuffle constant in the second half.
 */
#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
133 
/*
 * debyteslice_16x16b - inverse of byteslice_16x16b: gather the
 * byte-sliced state back into one-block-per-position layout.  Same
 * spill-slot technique; differs from the forward direction only in the
 * register ordering of the final two transpose passes.
 */
#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
185 
/*
 * inpack16_pre - load 16 consecutive 32-byte vectors (32 x 16-byte
 * blocks) from rio into the x0..x7/y0..y7 register set.
 * NOTE(review): this macro only loads; the key XOR happens later via
 * aria_ark_8way (the "pre-whitening" naming is inherited from the
 * camellia template this file follows).
 */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu (0 * 32)(rio), x0;			\
	vmovdqu (1 * 32)(rio), x1;			\
	vmovdqu (2 * 32)(rio), x2;			\
	vmovdqu (3 * 32)(rio), x3;			\
	vmovdqu (4 * 32)(rio), x4;			\
	vmovdqu (5 * 32)(rio), x5;			\
	vmovdqu (6 * 32)(rio), x6;			\
	vmovdqu (7 * 32)(rio), x7;			\
	vmovdqu (8 * 32)(rio), y0;			\
	vmovdqu (9 * 32)(rio), y1;			\
	vmovdqu (10 * 32)(rio), y2;			\
	vmovdqu (11 * 32)(rio), y3;			\
	vmovdqu (12 * 32)(rio), y4;			\
	vmovdqu (13 * 32)(rio), y5;			\
	vmovdqu (14 * 32)(rio), y6;			\
	vmovdqu (15 * 32)(rio), y7;
208 
/*
 * inpack16_post - byteslice the loaded blocks, then spill the whole
 * 16-register state to the two 8-slot scratch areas (mem_ab for the
 * x half, mem_cd for the y half) so the round macros can work on one
 * 8-register half at a time.
 */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, 0 * 32(mem_ab);			\
	vmovdqu x1, 1 * 32(mem_ab);			\
	vmovdqu x2, 2 * 32(mem_ab);			\
	vmovdqu x3, 3 * 32(mem_ab);			\
	vmovdqu x4, 4 * 32(mem_ab);			\
	vmovdqu x5, 5 * 32(mem_ab);			\
	vmovdqu x6, 6 * 32(mem_ab);			\
	vmovdqu x7, 7 * 32(mem_ab);			\
	vmovdqu y0, 0 * 32(mem_cd);			\
	vmovdqu y1, 1 * 32(mem_cd);			\
	vmovdqu y2, 2 * 32(mem_cd);			\
	vmovdqu y3, 3 * 32(mem_cd);			\
	vmovdqu y4, 4 * 32(mem_cd);			\
	vmovdqu y5, 5 * 32(mem_cd);			\
	vmovdqu y6, 6 * 32(mem_cd);			\
	vmovdqu y7, 7 * 32(mem_cd);
237 
/*
 * write_output - store the 16 ymm state registers to 16 consecutive
 * 32-byte slots starting at mem (512 bytes = 32 output blocks).
 * Fix: dropped the stray trailing line-continuation after the last
 * store, which silently merged the following source line into this
 * macro's expansion.
 */
#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 32(mem);			\
	vmovdqu x1, 1 * 32(mem);			\
	vmovdqu x2, 2 * 32(mem);			\
	vmovdqu x3, 3 * 32(mem);			\
	vmovdqu x4, 4 * 32(mem);			\
	vmovdqu x5, 5 * 32(mem);			\
	vmovdqu x6, 6 * 32(mem);			\
	vmovdqu x7, 7 * 32(mem);			\
	vmovdqu y0, 8 * 32(mem);			\
	vmovdqu y1, 9 * 32(mem);			\
	vmovdqu y2, 10 * 32(mem);			\
	vmovdqu y3, 11 * 32(mem);			\
	vmovdqu y4, 12 * 32(mem);			\
	vmovdqu y5, 13 * 32(mem);			\
	vmovdqu y6, 14 * 32(mem);			\
	vmovdqu y7, 15 * 32(mem);
259 
/*
 * aria_store_state_8way - spill 8 state registers to the scratch area
 * at 32-byte slots idx..idx+7 (idx is 0 for the x half, 8 for y).
 */
#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu x0, ((idx + 0) * 32)(mem_tmp);		\
	vmovdqu x1, ((idx + 1) * 32)(mem_tmp);		\
	vmovdqu x2, ((idx + 2) * 32)(mem_tmp);		\
	vmovdqu x3, ((idx + 3) * 32)(mem_tmp);		\
	vmovdqu x4, ((idx + 4) * 32)(mem_tmp);		\
	vmovdqu x5, ((idx + 5) * 32)(mem_tmp);		\
	vmovdqu x6, ((idx + 6) * 32)(mem_tmp);		\
	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

/*
 * aria_load_state_8way - reload 8 state registers from scratch slots
 * idx..idx+7; exact inverse of aria_store_state_8way.
 */
#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu ((idx + 0) * 32)(mem_tmp), x0;		\
	vmovdqu ((idx + 1) * 32)(mem_tmp), x1;		\
	vmovdqu ((idx + 2) * 32)(mem_tmp), x2;		\
	vmovdqu ((idx + 3) * 32)(mem_tmp), x3;		\
	vmovdqu ((idx + 4) * 32)(mem_tmp), x4;		\
	vmovdqu ((idx + 5) * 32)(mem_tmp), x5;		\
	vmovdqu ((idx + 6) * 32)(mem_tmp), x6;		\
	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
283 
/*
 * aria_ark_8way - ARIA AddRoundKey for 8 byte-sliced state rows: each
 * 16-byte round key byte is broadcast to all 32 lanes and XORed into
 * the register holding that byte position.  The per-word byte order is
 * reversed (offsets 3,2,1,0 / 7,6,5,4) to match the byte-sliced
 * register layout; t0 is scratch.
 */
#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, rk, idx, round)		\
	/* AddRoundKey */                               \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
	vpxor t0, x0, x0;				\
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
	vpxor t0, x1, x1;				\
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
	vpxor t0, x2, x2;				\
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
	vpxor t0, x3, x3;				\
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
	vpxor t0, x4, x4;				\
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
	vpxor t0, x5, x5;				\
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
	vpxor t0, x6, x6;				\
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
	vpxor t0, x7, x7;
304 
#ifdef CONFIG_AS_GFNI
/*
 * aria_sbox_8way_gfni - GFNI variant of the ARIA S-box layer.  Each
 * S-box is expressed as GF(2^8) affine transforms:
 *  - vgf2p8affineinvqb computes affine(inverse(byte)), so S1 is a
 *    single instruction with the AES affine matrix (t3);
 *  - S2 uses its own matrix (t0);
 *  - X1/X2 invert via an identity-matrix affineinv (t2), X2 with an
 *    extra forward affine (t4) first.
 * t0..t4 hold the broadcast bit-matrices; t5..t7 are unused here.
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
/*
 * aria_sbox_8way - AES-NI fallback for the ARIA S-box layer, applied
 * to 8 byte-sliced rows (two rows per S-box):
 *  - x0/x4: vaesenclast with an all-zero round key (t7) isolates AES
 *    SubBytes, giving S1 after the inverse-ShiftRows shuffle;
 *  - x1/x5: SubBytes followed by the inv_aff+S2 nibble filter;
 *  - x2/x6: vaesdeclast (InvSubBytes) for the inverse S-box;
 *  - x3/x7: the X2 forward-affine filter then InvSubBytes.
 * AES-NI only operates on xmm, so each ymm is handled as two 128-bit
 * halves via vextracti128/vinserti128 with t6 as the half buffer.
 * t7 must remain zero throughout (the null AES round key).
 * Fix: dropped the stray trailing line-continuation after the final
 * vinserti128, which merged the following source line into the macro.
 */
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vpxor t6, t6, t6;				\
	vbroadcasti128 .Linv_shift_row(%rip), t0;	\
	vbroadcasti128 .Lshift_row(%rip), t1;		\
	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
							\
	vextracti128 $1, x0, t6##_x;			\
	vaesenclast t7##_x, x0##_x, x0##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x0, x0;			\
							\
	vextracti128 $1, x4, t6##_x;			\
	vaesenclast t7##_x, x4##_x, x4##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x4, x4;			\
							\
	vextracti128 $1, x1, t6##_x;			\
	vaesenclast t7##_x, x1##_x, x1##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x1, x1;			\
							\
	vextracti128 $1, x5, t6##_x;			\
	vaesenclast t7##_x, x5##_x, x5##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x5, x5;			\
							\
	vextracti128 $1, x2, t6##_x;			\
	vaesdeclast t7##_x, x2##_x, x2##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x2, x2;			\
							\
	vextracti128 $1, x6, t6##_x;			\
	vaesdeclast t7##_x, x6##_x, x6##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x6, x6;			\
							\
	vpbroadcastd .L0f0f0f0f(%rip), t6;		\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
							\
	vpxor t6, t6, t6;				\
	vextracti128 $1, x3, t6##_x;			\
	vaesdeclast t7##_x, x3##_x, x3##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x3, x3;			\
							\
	vextracti128 $1, x7, t6##_x;			\
	vaesdeclast t7##_x, x7##_x, x7##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x7, x7;
404 
/*
 * aria_diff_m - byte-sliced MixColumns-style word diffusion over four
 * byte planes.  Because the state is byte-sliced, the 32-bit rotations
 * become permutations of which planes are combined; the pseudo-code
 * comments below show the equivalent word-level operation.
 * t0..t3 are scratch.
 */
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x0, x3, t0;				\
	vpxor x1, x0, t1;				\
	vpxor x2, x1, t2;				\
	vpxor x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t2, x0, x0;				\
	vpxor x1, t3, t3;				\
	vpxor t0, x2, x2;				\
	vpxor t1, x3, x1;				\
	vmovdqu t3, x3;
419 
/*
 * aria_diff_word - cross-word diffusion mixing the four 4-plane groups
 * (t0 = x0..x3, t1 = x4..x7, t2 = y0..y3, t3 = y4..y7) with a fixed
 * XOR butterfly; the pseudo-code comments name each step at the word
 * level.  All 16 registers are updated in place.
 */
#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxor y4, y0, y0;				\
	vpxor y5, y1, y1;				\
	vpxor y6, y2, y2;				\
	vpxor y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxor x4, x0, x0;				\
	vpxor x5, x1, x1;				\
	vpxor x6, x2, x2;				\
	vpxor x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxor x4, y4, y4;				\
	vpxor x5, y5, y5;				\
	vpxor x6, y6, y6;				\
	vpxor x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxor x0, y0, y0;				\
	vpxor x1, y1, y1;				\
	vpxor x2, y2, y2;				\
	vpxor x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;
459 
/*
 * aria_fe - one even ARIA round on the full 16-register state:
 * AddRoundKey + S-box layer (even-round S-box ordering: x2,x3,x0,x1,
 * x6,x7,x4,x5) + aria_diff_m, first on the resident 8 registers, then
 * on the other half reloaded from mem_tmp, finishing with the
 * cross-word/byte diffusion.  y0..y7 are scratch.
 */
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
513 
/*
 * aria_fo - one odd ARIA round; same two-halves structure as aria_fe
 * but with the odd-round S-box ordering (x0..x7 unpermuted) and the
 * odd-round byte-diffusion permutation at the end.  y0..y7 are
 * scratch.
 */
#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
567 
/*
 * aria_ff - final ARIA round: AddRoundKey + S-box layer + the
 * last-round key XOR (no diffusion layer), applied to both 8-register
 * halves via the mem_tmp spill slots.  On exit the full state is back
 * in registers (x half resident, y half reloaded from slot 8).
 */
#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
#ifdef CONFIG_AS_GFNI
/*
 * aria_fe_gfni - even ARIA round, identical structure to aria_fe but
 * using the GFNI S-box layer (aria_sbox_8way_gfni) instead of the
 * AES-NI one.
 */
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
659 
/*
 * aria_fo_gfni - odd ARIA round, identical structure to aria_fo but
 * using the GFNI S-box layer.
 */
#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
717 
/*
 * aria_ff_gfni - final ARIA round, identical structure to aria_ff but
 * using the GFNI S-box layer.
 */
#define aria_ff_gfni(x0, x1, x2, x3,			\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
#endif /* CONFIG_AS_GFNI */
756 
.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
/* Shuffle mask used by (de)byteslice_16x16b: gathers every 4th byte
 * within each 128-bit lane. */
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
764 
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/* Forward ShiftRows permutation (undoes AESDECLAST's InvShiftRows) */
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 *
 * Stored as two nibble-indexed vpshufb tables for filter_8bit():
 * lo = transform of the low nibble, hi = transform of the high nibble.
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
808 
#ifdef CONFIG_AS_GFNI
.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/*
 * 8x8 bit-matrices for vgf2p8affine(inv)qb, one per transform; the
 * matching tf_*_const is the affine constant byte passed as the
 * instruction immediate.
 */
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: vgf2p8affineinvqb with this computes a plain
 * GF(2^8) field inversion (no affine step). */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f
878 
.text

/*
 * Shared encrypt/decrypt core (AES-NI path): byteslices the loaded
 * state, runs the ARIA round schedule, and de-byteslices the result.
 * The round count (12/14/16, i.e. ARIA-128/192/256) is selected at
 * runtime from ARIA_CTX_rounds.  The destination buffer doubles as the
 * byteslice scratch area; the caller stores the final registers there
 * with write_output() using %rax.
 */
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 *      %r9: rk (pre-expanded enc or dec key schedule)
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;		/* mem_ab scratch = dst */
	leaq 8 * 32(%rax), %r8;		/* mem_cd scratch = dst + 256 */

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	/* ARIA-128: rounds 11/12 are the final pair */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	/* ARIA-192: rounds 13/14 are the final pair */
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
969 
SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 *
	 * Encrypts 32 blocks (16 ymm registers, two 128-bit blocks per
	 * register) from src to dst using the encryption round keys.
	 */

	FRAME_BEGIN

	/* %r9 = pointer to the encryption round keys inside the context,
	 * as expected by __aria_aesni_avx2_crypt_32way.
	 */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	/* Load the 32 input blocks from src (%rdx) into %ymm0..%ymm15. */
	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	/* The crypt routine leaves the output pointer in %rax
	 * (presumably dst passed via %rsi — same convention as the GFNI
	 * variant, which copies %rsi to %rax).
	 */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
994 
SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 *
	 * Decrypts 32 blocks from src to dst.  Identical to the encrypt
	 * entry point except that %r9 is pointed at the decryption round
	 * keys; the shared crypt routine is otherwise key-schedule
	 * agnostic.
	 */

	FRAME_BEGIN

	/* %r9 = pointer to the decryption round keys in the context. */
	leaq ARIA_CTX_dec_key(CTX), %r9;

	/* Load the 32 input blocks from src (%rdx) into %ymm0..%ymm15. */
	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	/* Store the 32 result blocks; %rax holds the output pointer set
	 * up by the crypt routine.
	 */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
1019 
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 *
	 * Writes 32 consecutive little-endian-incremented counter blocks
	 * (byteswapped back to big endian) into the keystream buffer,
	 * reloads the first 8 ymm registers' worth into %ymm0..%ymm7 for
	 * the caller, and stores the IV advanced by 32 back to (%r8).
	 */

	FRAME_BEGIN
	/* %r11 = low 64 bits of the counter, converted from big endian,
	 * used below to decide whether a 64-bit carry can occur within
	 * the next 32 increments.
	 */
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	/* %ymm3 = { +1 ; +0 } in little-endian counter form. */
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	/* Fast path: no 64-bit overflow possible, so each pair of
	 * counters can be advanced by subtracting -2 (%ymm5) from both
	 * 128-bit lanes at once; %ymm6 byteswaps back to big endian.
	 */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	/* Spill counters +0..+15 to the keystream buffer. */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	/* Store the updated IV (+32, big endian) for the next call. */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	/* Reload counters +0..+7 (pairs) into the caller's input regs;
	 * +16..+31 remain live in %ymm8..%ymm15.
	 */
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	/* Slow path: the low 64 bits may overflow within 32 increments,
	 * so use the full 128-bit inc_le128 (carry-propagating) twice
	 * per register pair instead of the vectorized subtract.
	 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	/* Spill counters +0..+15 to the keystream buffer. */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	/* Store the updated IV (+32, big endian) for the next call. */
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	/* Reload counters +0..+7 (pairs) into the caller's input regs;
	 * +16..+31 remain live in %ymm8..%ymm15.
	 */
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
1177 
SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 *
	 * CTR mode: generate 32 counter blocks, encrypt them with the
	 * encryption key schedule, XOR with src, and store to dst.
	 */
	FRAME_BEGIN

	/* Fill the keystream buffer with 32 counter blocks; %ymm0..%ymm15
	 * come back loaded and the IV at (%r8) is advanced by 32.
	 */
	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	/* Save caller's dst/src, then point dst and src at the keystream
	 * so the block cipher encrypts the counters in place.
	 */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	/* XOR the encrypted counters with the 32 plaintext/ciphertext
	 * blocks at the saved src (%r11).  Register order matches the
	 * de-bytesliced output layout of the crypt routine.
	 */
	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/* Store the result to the saved dst (%r10). */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
1221 
1222 #ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: 16 byte-sliced blocks
	 *
	 * Core ARIA transform using GFNI for the S-box layer.  Runs the
	 * round schedule selected by ARIA_CTX_rounds (12/14/16 for
	 * 128/192/256-bit keys per the ARIA spec) and leaves the
	 * de-bytesliced result registers ready for write_output via %rax.
	 */

	FRAME_BEGIN

	/* %rax / %r8: two scratch areas 8*32 bytes apart inside dst,
	 * used by the byteslice/debyteslice macros.
	 */
	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	/* Complete byteslicing of the 16 input registers. */
	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	/* Rounds 1..11: alternating odd (fo) and even (fe) round macros;
	 * the trailing integer argument is the round-key index.  The
	 * register argument order swaps each round to account for the
	 * previous round's output permutation.
	 */
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	/* 12 rounds => ARIA-128: final round (ff) with keys 11/12. */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	/* Rounds 11..12 shared by ARIA-192 and ARIA-256. */
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	/* 14 rounds => ARIA-192: final round with keys 13/14. */
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	/* 16 rounds => ARIA-256: rounds 13..14 plus final with 15/16. */
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	/* Undo the byteslicing so the caller can store linear blocks. */
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
1347 
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 *
	 * GFNI variant of aria_aesni_avx2_encrypt_32way: encrypts 32
	 * blocks from src to dst with the encryption round keys.
	 */

	FRAME_BEGIN

	/* %r9 = encryption round keys, consumed by the crypt routine. */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	/* Load the 32 input blocks from src (%rdx) into %ymm0..%ymm15. */
	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* %rax holds dst, copied from %rsi by the crypt routine. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
1372 
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 *
	 * GFNI variant of aria_aesni_avx2_decrypt_32way: identical to
	 * the encrypt entry point except %r9 selects the decryption
	 * round keys.
	 */

	FRAME_BEGIN

	/* %r9 = decryption round keys, consumed by the crypt routine. */
	leaq ARIA_CTX_dec_key(CTX), %r9;

	/* Load the 32 input blocks from src (%rdx) into %ymm0..%ymm15. */
	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* %rax holds dst, copied from %rsi by the crypt routine. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
1397 
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 *
	 * GFNI variant of aria_aesni_avx2_ctr_crypt_32way: generate 32
	 * counter blocks, encrypt them, XOR with src, store to dst.
	 */
	FRAME_BEGIN

	/* Fill the keystream buffer with 32 counter blocks; %ymm0..%ymm15
	 * come back loaded and the IV at (%r8) is advanced by 32.
	 */
	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	/* Save caller's dst/src, then point dst and src at the keystream
	 * so the block cipher encrypts the counters in place.
	 */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* XOR the encrypted counters with the 32 blocks at the saved
	 * src (%r11).  Register order matches the de-bytesliced output
	 * layout of the crypt routine.
	 */
	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/* Store the result to the saved dst (%r10). */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
1441 #endif /* CONFIG_AS_GFNI */
1442