/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 64-way parallel algorithm (AVX512)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi


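/*
 * Helpers for building the 8x8 bit matrices consumed by the GFNI affine
 * instructions below: BV8() packs eight bits into one byte (a0 is the
 * least-significant bit) and BM8X8() packs eight such row bytes into a
 * 64-bit matrix, with row l0 ending up in the most-significant byte.
 */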
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )

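/*
 * 128-bit little-endian counter addition per 128-bit lane: add lo_counter
 * to the low qwords, detect wrapped lanes with an unsigned less-than
 * compare, shift the carry bits from the low-qword lanes to the adjacent
 * high-qword lanes with kaddb, and add hi_counter1 only where a carry
 * occurred.
 */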
#define add_le128(out, in, lo_counter, hi_counter1)	\
	vpaddq lo_counter, in, out;			\
	vpcmpuq $1, lo_counter, out, %k1;		\
	kaddb %k1, %k1, %k1;				\
	vpaddq hi_counter1, out, out{%k1};

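/*
 * Evaluate an 8-bit function as two 4-bit table lookups: the low and high
 * nibbles of each byte index the 16-entry tables lo_t and hi_t via vpshufb
 * and the two results are XORed together.
 */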
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpandq x, mask4bit, tmp0;			\
	vpandnq x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxorq tmp0, x, x;

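/*
 * Transpose a 4x4 matrix of 32-bit words held in x0..x3 (independently in
 * each 128-bit lane); t1 and t2 are clobbered as temporaries.
 */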
#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;

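/*
 * Byte-slice sixteen 512-bit registers: the 16x16 byte matrix they form is
 * transposed so that each register ends up holding a single byte position
 * of all 64 blocks. st0/st1 are memory scratch slots; as noted below, the
 * byte order inside each vector is left unadjusted.
 */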
#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu64 d2, st0;				\
	vmovdqu64 d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 a0, st0;				\
	vmovdqu64 a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu64 st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu64 d3, st1;				\
	vmovdqu64 st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu64 d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 b0, st0;				\
	vmovdqu64 b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu64 st0, b0;				\
	vmovdqu64 st1, b1;				\
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu64 d2, st0;				\
	vmovdqu64 d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 a0, st0;				\
	vmovdqu64 a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu64 st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu64 d3, st1;				\
	vmovdqu64 st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu64 d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 b0, st0;				\
	vmovdqu64 b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu64 st0, b0;				\
	vmovdqu64 st1, b1;				\
	/* does not adjust output bytes inside vectors */

/* load blocks to registers */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu64 (0 * 64)(rio), x0;			\
	vmovdqu64 (1 * 64)(rio), x1;			\
	vmovdqu64 (2 * 64)(rio), x2;			\
	vmovdqu64 (3 * 64)(rio), x3;			\
	vmovdqu64 (4 * 64)(rio), x4;			\
	vmovdqu64 (5 * 64)(rio), x5;			\
	vmovdqu64 (6 * 64)(rio), x6;			\
	vmovdqu64 (7 * 64)(rio), x7;			\
	vmovdqu64 (8 * 64)(rio), y0;			\
	vmovdqu64 (9 * 64)(rio), y1;			\
	vmovdqu64 (10 * 64)(rio), y2;			\
	vmovdqu64 (11 * 64)(rio), y3;			\
	vmovdqu64 (12 * 64)(rio), y4;			\
	vmovdqu64 (13 * 64)(rio), y5;			\
	vmovdqu64 (14 * 64)(rio), y6;			\
	vmovdqu64 (15 * 64)(rio), y7;

/* byteslice blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu64 x0, 0 * 64(mem_ab);			\
	vmovdqu64 x1, 1 * 64(mem_ab);			\
	vmovdqu64 x2, 2 * 64(mem_ab);			\
	vmovdqu64 x3, 3 * 64(mem_ab);			\
	vmovdqu64 x4, 4 * 64(mem_ab);			\
	vmovdqu64 x5, 5 * 64(mem_ab);			\
	vmovdqu64 x6, 6 * 64(mem_ab);			\
	vmovdqu64 x7, 7 * 64(mem_ab);			\
	vmovdqu64 y0, 0 * 64(mem_cd);			\
	vmovdqu64 y1, 1 * 64(mem_cd);			\
	vmovdqu64 y2, 2 * 64(mem_cd);			\
	vmovdqu64 y3, 3 * 64(mem_cd);			\
	vmovdqu64 y4, 4 * 64(mem_cd);			\
	vmovdqu64 y5, 5 * 64(mem_cd);			\
	vmovdqu64 y6, 6 * 64(mem_cd);			\
	vmovdqu64 y7, 7 * 64(mem_cd);

#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu64 x0, 0 * 64(mem);			\
	vmovdqu64 x1, 1 * 64(mem);			\
	vmovdqu64 x2, 2 * 64(mem);			\
	vmovdqu64 x3, 3 * 64(mem);			\
	vmovdqu64 x4, 4 * 64(mem);			\
	vmovdqu64 x5, 5 * 64(mem);			\
	vmovdqu64 x6, 6 * 64(mem);			\
	vmovdqu64 x7, 7 * 64(mem);			\
	vmovdqu64 y0, 8 * 64(mem);			\
	vmovdqu64 y1, 9 * 64(mem);			\
	vmovdqu64 y2, 10 * 64(mem);			\
	vmovdqu64 y3, 11 * 64(mem);			\
	vmovdqu64 y4, 12 * 64(mem);			\
	vmovdqu64 y5, 13 * 64(mem);			\
	vmovdqu64 y6, 14 * 64(mem);			\
	vmovdqu64 y7, 15 * 64(mem);			\

#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp);	\
	vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp);	\
	vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp);	\
	vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp);	\
	vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp);	\
	vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp);	\
	vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp);	\
	vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0;	\
	vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1;	\
	vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2;	\
	vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3;	\
	vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4;	\
	vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5;	\
	vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6;	\
	vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;

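/*
 * AddRoundKey for the byte-sliced state: each of the 16 bytes of the round
 * key at rk + round * 16 is broadcast and XORed into the matching register.
 * The offsets are reversed within each 4-byte group (+3, +2, +1, +0, ...)
 * to match the byte order of the key words relative to the byte-sliced
 * layout.
 */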
#define aria_ark_16way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7,			\
		       t0, rk, round)			\
	/* AddRoundKey */                               \
	vpbroadcastb ((round * 16) + 3)(rk), t0;	\
	vpxorq t0, x0, x0;				\
	vpbroadcastb ((round * 16) + 2)(rk), t0;	\
	vpxorq t0, x1, x1;				\
	vpbroadcastb ((round * 16) + 1)(rk), t0;	\
	vpxorq t0, x2, x2;				\
	vpbroadcastb ((round * 16) + 0)(rk), t0;	\
	vpxorq t0, x3, x3;				\
	vpbroadcastb ((round * 16) + 7)(rk), t0;	\
	vpxorq t0, x4, x4;				\
	vpbroadcastb ((round * 16) + 6)(rk), t0;	\
	vpxorq t0, x5, x5;				\
	vpbroadcastb ((round * 16) + 5)(rk), t0;	\
	vpxorq t0, x6, x6;				\
	vpbroadcastb ((round * 16) + 4)(rk), t0;	\
	vpxorq t0, x7, x7;				\
	vpbroadcastb ((round * 16) + 11)(rk), t0;	\
	vpxorq t0, y0, y0;				\
	vpbroadcastb ((round * 16) + 10)(rk), t0;	\
	vpxorq t0, y1, y1;				\
	vpbroadcastb ((round * 16) + 9)(rk), t0;	\
	vpxorq t0, y2, y2;				\
	vpbroadcastb ((round * 16) + 8)(rk), t0;	\
	vpxorq t0, y3, y3;				\
	vpbroadcastb ((round * 16) + 15)(rk), t0;	\
	vpxorq t0, y4, y4;				\
	vpbroadcastb ((round * 16) + 14)(rk), t0;	\
	vpxorq t0, y5, y5;				\
	vpbroadcastb ((round * 16) + 13)(rk), t0;	\
	vpxorq t0, y6, y6;				\
	vpbroadcastb ((round * 16) + 12)(rk), t0;	\
	vpxorq t0, y7, y7;

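/*
 * ARIA substitution layer using GFNI: S1 is the AES S-box (GF(2^8)
 * inversion followed by the AES affine transform), S2 uses its own affine
 * matrix, and the inverse S-boxes X1/X2 are built as an affine transform
 * followed by a field inversion (vgf2p8affineinvqb with the identity
 * matrix). x0/x4 -> S1, x1/x5 -> S2, x2/x6 -> X1, x3/x7 -> X2.
 */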
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7;

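/*
 * Same substitution layer as aria_sbox_8way_gfni, applied to all sixteen
 * byte-sliced registers at once.
 */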
#define aria_sbox_16way_gfni(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     t0, t1, t2, t3,		\
			     t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5;	\
	vgf2p8affineqb $(tf_inv_const), t1, y2, y2;	\
	vgf2p8affineqb $(tf_inv_const), t1, y6, y6;	\
	vgf2p8affineinvqb $0, t2, y2, y2;		\
	vgf2p8affineinvqb $0, t2, y6, y6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4;	\
	vgf2p8affineqb $(tf_x2_const), t4, y3, y3;	\
	vgf2p8affineqb $(tf_x2_const), t4, y7, y7;	\
	vgf2p8affineinvqb $0, t2, y3, y3;		\
	vgf2p8affineinvqb $0, t2, y7, y7;


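/*
 * ARIA's diffusion layer is split into three steps: aria_diff_m mixes the
 * bytes within each 32-bit word (the rotr32 XOR network described in the
 * comments below), aria_diff_word mixes the four words across register
 * groups, and the remaining byte permutation ("aria_diff_byte") costs
 * nothing: it is realized purely by reordering the register arguments in
 * aria_fe_gfni()/aria_fo_gfni().
 */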
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxorq x0, x3, t0;				\
	vpxorq x1, x0, t1;				\
	vpxorq x2, x1, t2;				\
	vpxorq x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxorq t2, x0, x0;				\
	vpxorq x1, t3, t3;				\
	vpxorq t0, x2, x2;				\
	vpxorq t1, x3, x1;				\
	vmovdqu64 t3, x3;

#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxorq y0, x4, x4;				\
	vpxorq y1, x5, x5;				\
	vpxorq y2, x6, x6;				\
	vpxorq y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxorq y4, y0, y0;				\
	vpxorq y5, y1, y1;				\
	vpxorq y6, y2, y2;				\
	vpxorq y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxorq x4, x0, x0;				\
	vpxorq x5, x1, x1;				\
	vpxorq x6, x2, x2;				\
	vpxorq x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxorq x4, y4, y4;				\
	vpxorq x5, y5, y5;				\
	vpxorq x6, y6, y6;				\
	vpxorq x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxorq x0, y0, y0;				\
	vpxorq x1, y1, y1;				\
	vpxorq x2, y2, y2;				\
	vpxorq x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxorq y0, x4, x4;				\
	vpxorq y1, x5, x5;				\
	vpxorq y2, x6, x6;				\
	vpxorq y3, x7, x7;

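/*
 * Even/odd round functions: aria_fo_gfni applies the S-boxes in
 * (S1, S2, X1, X2) order while aria_fe_gfni uses (X1, X2, S1, S2), which is
 * achieved by rotating the register arguments passed to
 * aria_sbox_16way_gfni. Both then run the shared diffusion steps with a
 * round-specific byte permutation (see the aria_diff_byte comments).
 */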
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     z0, z1, z2, z3,			\
		     z4, z5, z6, z7,			\
		     mem_tmp, rk, round)		\
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7,	\
		       z0, rk, round);			\
							\
	aria_sbox_16way_gfni(x2, x3, x0, x1,		\
			     x6, x7, x4, x5,		\
			     y2, y3, y0, y1,		\
			     y6, y7, y4, y5,		\
			     z0, z1, z2, z3,		\
			     z4, z5, z6, z7);		\
							\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);	\
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);	\
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);	\
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);	\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\


#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     z0, z1, z2, z3,			\
		     z4, z5, z6, z7,			\
		     mem_tmp, rk, round)		\
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7,	\
		       z0, rk, round);			\
							\
	aria_sbox_16way_gfni(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     z0, z1, z2, z3,		\
			     z4, z5, z6, z7);		\
							\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);	\
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);	\
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);	\
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);	\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);

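/*
 * Final round: AddRoundKey, the substitution layer, and a second
 * AddRoundKey with the last round key; no diffusion layer is applied.
 */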
#define aria_ff_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     z0, z1, z2, z3,			\
		     z4, z5, z6, z7,			\
		     mem_tmp, rk, round, last_round)	\
	aria_ark_16way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7,			\
		       z0, rk, round);			\
	aria_sbox_16way_gfni(x2, x3, x0, x1,		\
			     x6, x7, x4, x5,		\
			     y2, y3, y0, y1,		\
			     y6, y7, y4, y5,		\
			     z0, z1, z2, z3,		\
			     z4, z5, z6, z7);		\
	aria_ark_16way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7,			\
		       z0, rk, last_round);


.section        .rodata.cst64, "aM", @progbits, 64
.align 64
.Lcounter0123_lo:
	.quad 0, 0
	.quad 1, 0
	.quad 2, 0
	.quad 3, 0

.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst16, "aM", @progbits, 16
.align 16

.Lcounter4444_lo:
	.quad 4, 0
.Lcounter8888_lo:
	.quad 8, 0
.Lcounter16161616_lo:
	.quad 16, 0
.Lcounter1111_hi:
	.quad 0, 1

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

.text
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %zmm0..%zmm15: byte-sliced blocks
	 */

	FRAME_BEGIN

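	/*
	 * The dst buffer doubles as scratch space for the byte-sliced
	 * state: %rax addresses its first 512-byte half (mem_ab) and
	 * %r8 the second (mem_cd).
	 */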
	movq %rsi, %rax;
	leaq 8 * 64(%rax), %r8;

	inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
		      %zmm4, %zmm5, %zmm6, %zmm7,
		      %zmm8, %zmm9, %zmm10, %zmm11,
		      %zmm12, %zmm13, %zmm14,
		      %zmm15, %rax, %r8);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 0);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 1);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 2);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 3);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 4);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 5);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 6);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 7);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 8);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 9);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 10);
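	/* ARIA-128: 12 rounds, ARIA-192: 14 rounds, ARIA-256: 16 rounds */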
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 11);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 13);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 14);
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
			   %zmm8, %zmm13, %zmm2, %zmm7,
			   %zmm11, %zmm14, %zmm1, %zmm4,
			   %zmm10, %zmm15, %zmm0, %zmm5,
			   (%rax), (%r8));
	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)

SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN

	vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
	vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
	vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
	vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
	vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
	vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;

	/* load IV and byteswap */
	movq 8(%r8), %r11;
	movq (%r8), %r10;
	bswapq %r11;
	bswapq %r10;
	vbroadcasti64x2 (%r8), %zmm20;
	vpshufb %zmm19, %zmm20, %zmm20;

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 64), %r11;
	ja .Lload_ctr_carry;

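	/*
	 * Fast path: the low 64 bits cannot wrap within the next 64
	 * blocks, so plain 64-bit additions on the low qwords suffice.
	 */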
	/* construct IVs */
	vpaddq %zmm21, %zmm20, %zmm0;  /* +0:+1:+2:+3 */
	vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
	vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
	vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
	vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
	vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
	vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
	vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
	vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
	vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
	vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
	vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
	vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
	vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
	vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
	vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
	jmp .Lload_ctr_done;

.Lload_ctr_carry:
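	/*
	 * Carry path: the low qword may overflow, so build the counters
	 * with full 128-bit increments (add_le128 propagates the carry
	 * into the high qword).
	 */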
	/* construct IVs */
	add_le128(%zmm0, %zmm20, %zmm21, %zmm25);  /* +0:+1:+2:+3 */
	add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
	add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
	add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
	add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
	add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
	add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
	add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
	add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
	add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
	add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
	add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
	add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
	add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
	add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
	add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */

.Lload_ctr_done:
	/* Byte-swap IVs and update counter. */
	addq $64, %r11;
	adcq $0, %r10;
	vpshufb %zmm19, %zmm15, %zmm15;
	vpshufb %zmm19, %zmm14, %zmm14;
	vpshufb %zmm19, %zmm13, %zmm13;
	vpshufb %zmm19, %zmm12, %zmm12;
	vpshufb %zmm19, %zmm11, %zmm11;
	vpshufb %zmm19, %zmm10, %zmm10;
	vpshufb %zmm19, %zmm9, %zmm9;
	vpshufb %zmm19, %zmm8, %zmm8;
	bswapq %r11;
	bswapq %r10;
	vpshufb %zmm19, %zmm7, %zmm7;
	vpshufb %zmm19, %zmm6, %zmm6;
	vpshufb %zmm19, %zmm5, %zmm5;
	vpshufb %zmm19, %zmm4, %zmm4;
	vpshufb %zmm19, %zmm3, %zmm3;
	vpshufb %zmm19, %zmm2, %zmm2;
	vpshufb %zmm19, %zmm1, %zmm1;
	vpshufb %zmm19, %zmm0, %zmm0;
	movq %r11, 8(%r8);
	movq %r10, (%r8);

	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_gfni_avx512_ctr_gen_keystream_64way

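	/*
	 * Encrypt the generated keystream in place: the keystream buffer
	 * becomes both src and dst of the core routine, while the original
	 * dst/src pointers are kept in %r10/%r11 for the final XOR and
	 * store.
	 */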
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_gfni_avx512_crypt_64way;

	vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
	vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
	vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
	vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
	vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
	vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
	vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
	vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
	vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
	vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
	vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
	vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
	vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
	vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
	vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
	vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)