162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * ARIA Cipher 64-way parallel algorithm (AVX512)
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci */
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#include <linux/linkage.h>
1062306a36Sopenharmony_ci#include <asm/frame.h>
1162306a36Sopenharmony_ci#include <asm/asm-offsets.h>
1262306a36Sopenharmony_ci#include <linux/cfi_types.h>
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci/* register macros */
/*
 * CTX is a symbolic name for %rdi.
 * NOTE(review): presumably the register carrying the cipher context pointer;
 * its users are outside this chunk -- confirm.
 */
1562306a36Sopenharmony_ci#define CTX %rdi
1662306a36Sopenharmony_ci
1762306a36Sopenharmony_ci
/*
 * BV8(a0, ..., a7): pack eight bit values into a single byte constant.
 * Only bit 0 of each argument is kept; a0 becomes bit 0 (LSB) and a7
 * becomes bit 7 (MSB) of the result.
 */
1862306a36Sopenharmony_ci#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
1962306a36Sopenharmony_ci	( (((a0) & 1) << 0) |				\
2062306a36Sopenharmony_ci	  (((a1) & 1) << 1) |				\
2162306a36Sopenharmony_ci	  (((a2) & 1) << 2) |				\
2262306a36Sopenharmony_ci	  (((a3) & 1) << 3) |				\
2362306a36Sopenharmony_ci	  (((a4) & 1) << 4) |				\
2462306a36Sopenharmony_ci	  (((a5) & 1) << 5) |				\
2562306a36Sopenharmony_ci	  (((a6) & 1) << 6) |				\
2662306a36Sopenharmony_ci	  (((a7) & 1) << 7) )
2762306a36Sopenharmony_ci
/*
 * BM8X8(l0, ..., l7): pack eight byte-sized rows into one 64-bit constant.
 * l0 is placed in the most significant byte (bits 63:56) and l7 in the
 * least significant byte (bits 7:0). The arguments are not masked here, so
 * each one must already fit in 8 bits (e.g. a BV8() value).
 */
2862306a36Sopenharmony_ci#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
2962306a36Sopenharmony_ci	( ((l7) << (0 * 8)) |				\
3062306a36Sopenharmony_ci	  ((l6) << (1 * 8)) |				\
3162306a36Sopenharmony_ci	  ((l5) << (2 * 8)) |				\
3262306a36Sopenharmony_ci	  ((l4) << (3 * 8)) |				\
3362306a36Sopenharmony_ci	  ((l3) << (4 * 8)) |				\
3462306a36Sopenharmony_ci	  ((l2) << (5 * 8)) |				\
3562306a36Sopenharmony_ci	  ((l1) << (6 * 8)) |				\
3662306a36Sopenharmony_ci	  ((l0) << (7 * 8)) )
3762306a36Sopenharmony_ci
/*
 * add_le128(out, in, lo_counter, hi_counter1)
 *
 * Quadword add with carry propagation from an even quadword into the odd
 * quadword that follows it:
 *   1. vpaddq:  out = in + lo_counter, quadword by quadword.
 *   2. vpcmpuq with predicate 1 (unsigned less-than) sets a bit in %k1 for
 *      every quadword whose sum is below lo_counter, i.e. the add wrapped.
 *   3. kaddb adds %k1 to itself, moving every flag up by one bit position,
 *      so the flag of a quadword now selects the next-higher quadword.
 *   4. The masked vpaddq adds hi_counter1 only to the selected quadwords.
 *
 * Clobbers %k1.
 * NOTE(review): assumes lo_counter is zero in every high quadword and that
 * hi_counter1 carries the increment in the high quadwords (compare the
 * .Lcounter*_lo and .Lcounter1111_hi constants) -- confirm at the call
 * sites, which are outside this chunk.
 */
3862306a36Sopenharmony_ci#define add_le128(out, in, lo_counter, hi_counter1)	\
3962306a36Sopenharmony_ci	vpaddq lo_counter, in, out;			\
4062306a36Sopenharmony_ci	vpcmpuq $1, lo_counter, out, %k1;		\
4162306a36Sopenharmony_ci	kaddb %k1, %k1, %k1;				\
4262306a36Sopenharmony_ci	vpaddq hi_counter1, out, out{%k1};
4362306a36Sopenharmony_ci
/*
 * filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)
 *
 * Byte-wise lookup through two 16-entry tables, split by nibble:
 *   tmp0 = x & mask4bit
 *   x    = (x & ~mask4bit) >> 4	(32-bit logical shift)
 *   x    = vpshufb(lo_t, tmp0) ^ vpshufb(hi_t, x)
 *
 * x is both input and output; tmp0 is clobbered.
 * vpandnq computes (~mask4bit) & x with this operand order (AT&T syntax:
 * the middle operand is the one that gets inverted).
 * NOTE(review): assumes mask4bit holds 0x0f in every byte, so that the
 * dword shift cannot move bits across byte boundaries -- confirm at the
 * call sites (none are visible in this chunk).
 */
4462306a36Sopenharmony_ci#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
4562306a36Sopenharmony_ci	vpandq x, mask4bit, tmp0;			\
4662306a36Sopenharmony_ci	vpandnq x, mask4bit, x;				\
4762306a36Sopenharmony_ci	vpsrld $4, x, x;				\
4862306a36Sopenharmony_ci							\
4962306a36Sopenharmony_ci	vpshufb tmp0, lo_t, tmp0;			\
5062306a36Sopenharmony_ci	vpshufb x, hi_t, x;				\
5162306a36Sopenharmony_ci	vpxorq tmp0, x, x;
5262306a36Sopenharmony_ci
/*
 * transpose_4x4(x0, x1, x2, x3, t1, t2)
 *
 * Within every 128-bit lane, treat x0..x3 as the four rows of a 4x4 matrix
 * of 32-bit elements and transpose it in place: afterwards xN holds
 * element N of the original x0, x1, x2, x3 (in that order).
 * The dword unpacks interleave row pairs, the qword unpacks then gather the
 * columns. t1 and t2 are scratch registers and are clobbered.
 */
5362306a36Sopenharmony_ci#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
5462306a36Sopenharmony_ci	vpunpckhdq x1, x0, t2;				\
5562306a36Sopenharmony_ci	vpunpckldq x1, x0, x0;				\
5662306a36Sopenharmony_ci							\
5762306a36Sopenharmony_ci	vpunpckldq x3, x2, t1;				\
5862306a36Sopenharmony_ci	vpunpckhdq x3, x2, x2;				\
5962306a36Sopenharmony_ci							\
6062306a36Sopenharmony_ci	vpunpckhqdq t1, x0, x1;				\
6162306a36Sopenharmony_ci	vpunpcklqdq t1, x0, x0;				\
6262306a36Sopenharmony_ci							\
6362306a36Sopenharmony_ci	vpunpckhqdq x2, t2, x3;				\
6462306a36Sopenharmony_ci	vpunpcklqdq x2, t2, x2;
6562306a36Sopenharmony_ci
/*
 * byteslice_16x16b(a0..d3, st0, st1)
 *
 * Reorders the bytes of sixteen vectors in three stages:
 *   1. transpose_4x4 on the dwords of (a0,a1,a2,a3), (b0..b3), (c0..c3)
 *      and (d0..d3);
 *   2. vpshufb of every vector with the .Lshufb_16x16b pattern;
 *   3. transpose_4x4 on (a0,b0,c0,d0), (a1,b1,c1,d1), (a2,b2,c2,d2) and
 *      (a3,b3,c3,d3).
 *
 * All sixteen vector arguments are live, so two of them are parked in st0
 * and st1 whenever transpose_4x4 needs its two temporaries; a0 is also
 * borrowed to hold the shuffle pattern and is restored from st0 afterwards.
 * st0 and st1 must each be a 64-byte location that can be both source and
 * destination of vmovdqu64; their previous contents are destroyed.
 * NOTE(review): the net effect is presumably a 16x16 byte transpose per
 * 128-bit lane (subject to the remark at the end of the macro) -- confirm.
 */
6662306a36Sopenharmony_ci#define byteslice_16x16b(a0, b0, c0, d0,		\
6762306a36Sopenharmony_ci			 a1, b1, c1, d1,		\
6862306a36Sopenharmony_ci			 a2, b2, c2, d2,		\
6962306a36Sopenharmony_ci			 a3, b3, c3, d3,		\
7062306a36Sopenharmony_ci			 st0, st1)			\
7162306a36Sopenharmony_ci	vmovdqu64 d2, st0;				\
7262306a36Sopenharmony_ci	vmovdqu64 d3, st1;				\
7362306a36Sopenharmony_ci	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
7462306a36Sopenharmony_ci	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
7562306a36Sopenharmony_ci	vmovdqu64 st0, d2;				\
7662306a36Sopenharmony_ci	vmovdqu64 st1, d3;				\
7762306a36Sopenharmony_ci							\
7862306a36Sopenharmony_ci	vmovdqu64 a0, st0;				\
7962306a36Sopenharmony_ci	vmovdqu64 a1, st1;				\
8062306a36Sopenharmony_ci	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
8162306a36Sopenharmony_ci	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
8262306a36Sopenharmony_ci							\
8362306a36Sopenharmony_ci	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
8462306a36Sopenharmony_ci	vmovdqu64 st1, a1;				\
8562306a36Sopenharmony_ci	vpshufb a0, a2, a2;				\
8662306a36Sopenharmony_ci	vpshufb a0, a3, a3;				\
8762306a36Sopenharmony_ci	vpshufb a0, b0, b0;				\
8862306a36Sopenharmony_ci	vpshufb a0, b1, b1;				\
8962306a36Sopenharmony_ci	vpshufb a0, b2, b2;				\
9062306a36Sopenharmony_ci	vpshufb a0, b3, b3;				\
9162306a36Sopenharmony_ci	vpshufb a0, a1, a1;				\
9262306a36Sopenharmony_ci	vpshufb a0, c0, c0;				\
9362306a36Sopenharmony_ci	vpshufb a0, c1, c1;				\
9462306a36Sopenharmony_ci	vpshufb a0, c2, c2;				\
9562306a36Sopenharmony_ci	vpshufb a0, c3, c3;				\
9662306a36Sopenharmony_ci	vpshufb a0, d0, d0;				\
9762306a36Sopenharmony_ci	vpshufb a0, d1, d1;				\
9862306a36Sopenharmony_ci	vpshufb a0, d2, d2;				\
9962306a36Sopenharmony_ci	vpshufb a0, d3, d3;				\
10062306a36Sopenharmony_ci	vmovdqu64 d3, st1;				\
10162306a36Sopenharmony_ci	vmovdqu64 st0, d3;				\
10262306a36Sopenharmony_ci	vpshufb a0, d3, a0;				\
10362306a36Sopenharmony_ci	vmovdqu64 d2, st0;				\
10462306a36Sopenharmony_ci							\
10562306a36Sopenharmony_ci	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
10662306a36Sopenharmony_ci	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
10762306a36Sopenharmony_ci	vmovdqu64 st0, d2;				\
10862306a36Sopenharmony_ci	vmovdqu64 st1, d3;				\
10962306a36Sopenharmony_ci							\
11062306a36Sopenharmony_ci	vmovdqu64 b0, st0;				\
11162306a36Sopenharmony_ci	vmovdqu64 b1, st1;				\
11262306a36Sopenharmony_ci	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
11362306a36Sopenharmony_ci	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
11462306a36Sopenharmony_ci	vmovdqu64 st0, b0;				\
11562306a36Sopenharmony_ci	vmovdqu64 st1, b1;				\
11662306a36Sopenharmony_ci	/* does not adjust output bytes inside vectors */
11762306a36Sopenharmony_ci
/*
 * debyteslice_16x16b(a0..d3, st0, st1)
 *
 * Same instruction sequence as byteslice_16x16b, except that the final
 * four transpose_4x4 invocations take their rows in the order (c, d, a, b)
 * instead of (a, b, c, d):
 *   1. transpose_4x4 on the dwords of (a0..a3), (b0..b3), (c0..c3) and
 *      (d0..d3);
 *   2. vpshufb of every vector with the .Lshufb_16x16b pattern;
 *   3. transpose_4x4 on (c0,d0,a0,b0), (c1,d1,a1,b1), (c2,d2,a2,b2) and
 *      (c3,d3,a3,b3).
 *
 * st0 and st1 are 64-byte scratch locations used to park two vectors while
 * their registers serve as temporaries; their contents are destroyed.
 * NOTE(review): presumably the inverse of the byte-slicing layout used
 * during the rounds -- confirm against the callers outside this chunk.
 */
11862306a36Sopenharmony_ci#define debyteslice_16x16b(a0, b0, c0, d0,		\
11962306a36Sopenharmony_ci			   a1, b1, c1, d1,		\
12062306a36Sopenharmony_ci			   a2, b2, c2, d2,		\
12162306a36Sopenharmony_ci			   a3, b3, c3, d3,		\
12262306a36Sopenharmony_ci			   st0, st1)			\
12362306a36Sopenharmony_ci	vmovdqu64 d2, st0;				\
12462306a36Sopenharmony_ci	vmovdqu64 d3, st1;				\
12562306a36Sopenharmony_ci	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
12662306a36Sopenharmony_ci	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
12762306a36Sopenharmony_ci	vmovdqu64 st0, d2;				\
12862306a36Sopenharmony_ci	vmovdqu64 st1, d3;				\
12962306a36Sopenharmony_ci							\
13062306a36Sopenharmony_ci	vmovdqu64 a0, st0;				\
13162306a36Sopenharmony_ci	vmovdqu64 a1, st1;				\
13262306a36Sopenharmony_ci	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
13362306a36Sopenharmony_ci	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
13462306a36Sopenharmony_ci							\
13562306a36Sopenharmony_ci	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
13662306a36Sopenharmony_ci	vmovdqu64 st1, a1;				\
13762306a36Sopenharmony_ci	vpshufb a0, a2, a2;				\
13862306a36Sopenharmony_ci	vpshufb a0, a3, a3;				\
13962306a36Sopenharmony_ci	vpshufb a0, b0, b0;				\
14062306a36Sopenharmony_ci	vpshufb a0, b1, b1;				\
14162306a36Sopenharmony_ci	vpshufb a0, b2, b2;				\
14262306a36Sopenharmony_ci	vpshufb a0, b3, b3;				\
14362306a36Sopenharmony_ci	vpshufb a0, a1, a1;				\
14462306a36Sopenharmony_ci	vpshufb a0, c0, c0;				\
14562306a36Sopenharmony_ci	vpshufb a0, c1, c1;				\
14662306a36Sopenharmony_ci	vpshufb a0, c2, c2;				\
14762306a36Sopenharmony_ci	vpshufb a0, c3, c3;				\
14862306a36Sopenharmony_ci	vpshufb a0, d0, d0;				\
14962306a36Sopenharmony_ci	vpshufb a0, d1, d1;				\
15062306a36Sopenharmony_ci	vpshufb a0, d2, d2;				\
15162306a36Sopenharmony_ci	vpshufb a0, d3, d3;				\
15262306a36Sopenharmony_ci	vmovdqu64 d3, st1;				\
15362306a36Sopenharmony_ci	vmovdqu64 st0, d3;				\
15462306a36Sopenharmony_ci	vpshufb a0, d3, a0;				\
15562306a36Sopenharmony_ci	vmovdqu64 d2, st0;				\
15662306a36Sopenharmony_ci							\
15762306a36Sopenharmony_ci	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
15862306a36Sopenharmony_ci	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
15962306a36Sopenharmony_ci	vmovdqu64 st0, d2;				\
16062306a36Sopenharmony_ci	vmovdqu64 st1, d3;				\
16162306a36Sopenharmony_ci							\
16262306a36Sopenharmony_ci	vmovdqu64 b0, st0;				\
16362306a36Sopenharmony_ci	vmovdqu64 b1, st1;				\
16462306a36Sopenharmony_ci	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
16562306a36Sopenharmony_ci	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
16662306a36Sopenharmony_ci	vmovdqu64 st0, b0;				\
16762306a36Sopenharmony_ci	vmovdqu64 st1, b1;				\
16862306a36Sopenharmony_ci	/* does not adjust output bytes inside vectors */
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci/* load blocks to registers and apply pre-whitening */
/*
 * inpack16_pre(x0..x7, y0..y7, rio)
 *
 * Loads sixteen consecutive 64-byte vectors (1024 bytes in total) from the
 * address in rio: x0..x7 from offsets 0..7 * 64 and y0..y7 from offsets
 * 8..15 * 64. Unaligned loads are used, so rio needs no special alignment.
 * Note: this macro performs the loads only; no key material is combined
 * with the data here, despite the "pre-whitening" wording above.
 */
17162306a36Sopenharmony_ci#define inpack16_pre(x0, x1, x2, x3,			\
17262306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
17362306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
17462306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
17562306a36Sopenharmony_ci		     rio)				\
17662306a36Sopenharmony_ci	vmovdqu64 (0 * 64)(rio), x0;			\
17762306a36Sopenharmony_ci	vmovdqu64 (1 * 64)(rio), x1;			\
17862306a36Sopenharmony_ci	vmovdqu64 (2 * 64)(rio), x2;			\
17962306a36Sopenharmony_ci	vmovdqu64 (3 * 64)(rio), x3;			\
18062306a36Sopenharmony_ci	vmovdqu64 (4 * 64)(rio), x4;			\
18162306a36Sopenharmony_ci	vmovdqu64 (5 * 64)(rio), x5;			\
18262306a36Sopenharmony_ci	vmovdqu64 (6 * 64)(rio), x6;			\
18362306a36Sopenharmony_ci	vmovdqu64 (7 * 64)(rio), x7;			\
18462306a36Sopenharmony_ci	vmovdqu64 (8 * 64)(rio), y0;			\
18562306a36Sopenharmony_ci	vmovdqu64 (9 * 64)(rio), y1;			\
18662306a36Sopenharmony_ci	vmovdqu64 (10 * 64)(rio), y2;			\
18762306a36Sopenharmony_ci	vmovdqu64 (11 * 64)(rio), y3;			\
18862306a36Sopenharmony_ci	vmovdqu64 (12 * 64)(rio), y4;			\
18962306a36Sopenharmony_ci	vmovdqu64 (13 * 64)(rio), y5;			\
19062306a36Sopenharmony_ci	vmovdqu64 (14 * 64)(rio), y6;			\
19162306a36Sopenharmony_ci	vmovdqu64 (15 * 64)(rio), y7;
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci/* byteslice pre-whitened blocks and store to temporary memory */
/*
 * inpack16_post(x0..x7, y0..y7, mem_ab, mem_cd)
 *
 * Runs byteslice_16x16b over the sixteen vectors, using the first 64 bytes
 * at (mem_ab) and (mem_cd) as its two scratch slots, then stores x0..x7 to
 * mem_ab + 0..7 * 64 and y0..y7 to mem_cd + 0..7 * 64. The final stores
 * overwrite the scratch data, so each buffer ends up holding eight vectors
 * (512 bytes). The registers keep the byte-sliced values afterwards.
 */
19462306a36Sopenharmony_ci#define inpack16_post(x0, x1, x2, x3,			\
19562306a36Sopenharmony_ci		      x4, x5, x6, x7,			\
19662306a36Sopenharmony_ci		      y0, y1, y2, y3,			\
19762306a36Sopenharmony_ci		      y4, y5, y6, y7,			\
19862306a36Sopenharmony_ci		      mem_ab, mem_cd)			\
19962306a36Sopenharmony_ci	byteslice_16x16b(x0, x1, x2, x3,		\
20062306a36Sopenharmony_ci			 x4, x5, x6, x7,		\
20162306a36Sopenharmony_ci			 y0, y1, y2, y3,		\
20262306a36Sopenharmony_ci			 y4, y5, y6, y7,		\
20362306a36Sopenharmony_ci			 (mem_ab), (mem_cd));		\
20462306a36Sopenharmony_ci							\
20562306a36Sopenharmony_ci	vmovdqu64 x0, 0 * 64(mem_ab);			\
20662306a36Sopenharmony_ci	vmovdqu64 x1, 1 * 64(mem_ab);			\
20762306a36Sopenharmony_ci	vmovdqu64 x2, 2 * 64(mem_ab);			\
20862306a36Sopenharmony_ci	vmovdqu64 x3, 3 * 64(mem_ab);			\
20962306a36Sopenharmony_ci	vmovdqu64 x4, 4 * 64(mem_ab);			\
21062306a36Sopenharmony_ci	vmovdqu64 x5, 5 * 64(mem_ab);			\
21162306a36Sopenharmony_ci	vmovdqu64 x6, 6 * 64(mem_ab);			\
21262306a36Sopenharmony_ci	vmovdqu64 x7, 7 * 64(mem_ab);			\
21362306a36Sopenharmony_ci	vmovdqu64 y0, 0 * 64(mem_cd);			\
21462306a36Sopenharmony_ci	vmovdqu64 y1, 1 * 64(mem_cd);			\
21562306a36Sopenharmony_ci	vmovdqu64 y2, 2 * 64(mem_cd);			\
21662306a36Sopenharmony_ci	vmovdqu64 y3, 3 * 64(mem_cd);			\
21762306a36Sopenharmony_ci	vmovdqu64 y4, 4 * 64(mem_cd);			\
21862306a36Sopenharmony_ci	vmovdqu64 y5, 5 * 64(mem_cd);			\
21962306a36Sopenharmony_ci	vmovdqu64 y6, 6 * 64(mem_cd);			\
22062306a36Sopenharmony_ci	vmovdqu64 y7, 7 * 64(mem_cd);
22162306a36Sopenharmony_ci
/*
 * write_output(x0..x7, y0..y7, mem)
 *
 * Stores the sixteen vectors to mem as one contiguous 1024-byte area:
 * x0..x7 at offsets 0..7 * 64 and y0..y7 at offsets 8..15 * 64. Unaligned
 * stores are used. The registers are left unchanged.
 * Note: the last line ends with a line continuation, so the blank line
 * that follows the macro is part of its definition and must stay blank.
 */
22262306a36Sopenharmony_ci#define write_output(x0, x1, x2, x3,			\
22362306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
22462306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
22562306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
22662306a36Sopenharmony_ci		     mem)				\
22762306a36Sopenharmony_ci	vmovdqu64 x0, 0 * 64(mem);			\
22862306a36Sopenharmony_ci	vmovdqu64 x1, 1 * 64(mem);			\
22962306a36Sopenharmony_ci	vmovdqu64 x2, 2 * 64(mem);			\
23062306a36Sopenharmony_ci	vmovdqu64 x3, 3 * 64(mem);			\
23162306a36Sopenharmony_ci	vmovdqu64 x4, 4 * 64(mem);			\
23262306a36Sopenharmony_ci	vmovdqu64 x5, 5 * 64(mem);			\
23362306a36Sopenharmony_ci	vmovdqu64 x6, 6 * 64(mem);			\
23462306a36Sopenharmony_ci	vmovdqu64 x7, 7 * 64(mem);			\
23562306a36Sopenharmony_ci	vmovdqu64 y0, 8 * 64(mem);			\
23662306a36Sopenharmony_ci	vmovdqu64 y1, 9 * 64(mem);			\
23762306a36Sopenharmony_ci	vmovdqu64 y2, 10 * 64(mem);			\
23862306a36Sopenharmony_ci	vmovdqu64 y3, 11 * 64(mem);			\
23962306a36Sopenharmony_ci	vmovdqu64 y4, 12 * 64(mem);			\
24062306a36Sopenharmony_ci	vmovdqu64 y5, 13 * 64(mem);			\
24162306a36Sopenharmony_ci	vmovdqu64 y6, 14 * 64(mem);			\
24262306a36Sopenharmony_ci	vmovdqu64 y7, 15 * 64(mem);			\
24362306a36Sopenharmony_ci
/*
 * aria_store_state_8way(x0..x7, mem_tmp, idx)
 *
 * Spills eight vectors to memory: xK is written to
 * mem_tmp + (idx + K) * 64 for K = 0..7 (unaligned stores). idx is thus a
 * slot number in units of one 64-byte vector, not a byte offset.
 */
24462306a36Sopenharmony_ci#define aria_store_state_8way(x0, x1, x2, x3,		\
24562306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
24662306a36Sopenharmony_ci			      mem_tmp, idx)		\
24762306a36Sopenharmony_ci	vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp);	\
24862306a36Sopenharmony_ci	vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp);	\
24962306a36Sopenharmony_ci	vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp);	\
25062306a36Sopenharmony_ci	vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp);	\
25162306a36Sopenharmony_ci	vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp);	\
25262306a36Sopenharmony_ci	vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp);	\
25362306a36Sopenharmony_ci	vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp);	\
25462306a36Sopenharmony_ci	vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);
25562306a36Sopenharmony_ci
/*
 * aria_load_state_8way(x0..x7, mem_tmp, idx)
 *
 * Counterpart of aria_store_state_8way: reloads eight vectors, xK from
 * mem_tmp + (idx + K) * 64 for K = 0..7 (unaligned loads). idx is a slot
 * number in units of one 64-byte vector.
 */
25662306a36Sopenharmony_ci#define aria_load_state_8way(x0, x1, x2, x3,		\
25762306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
25862306a36Sopenharmony_ci			     mem_tmp, idx)		\
25962306a36Sopenharmony_ci	vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0;	\
26062306a36Sopenharmony_ci	vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1;	\
26162306a36Sopenharmony_ci	vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2;	\
26262306a36Sopenharmony_ci	vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3;	\
26362306a36Sopenharmony_ci	vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4;	\
26462306a36Sopenharmony_ci	vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5;	\
26562306a36Sopenharmony_ci	vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6;	\
26662306a36Sopenharmony_ci	vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
26762306a36Sopenharmony_ci
/*
 * aria_ark_16way(x0..x7, y0..y7, t0, rk, round)
 *
 * XORs one round-key byte into each of the sixteen state vectors. For every
 * vector a single byte is read from rk + round * 16 + offset, replicated to
 * all byte positions of t0 with vpbroadcastb, and XORed into the vector.
 * The byte offsets are reversed inside each group of four:
 *   x0..x3 <- 3, 2, 1, 0      x4..x7 <- 7, 6, 5, 4
 *   y0..y3 <- 11, 10, 9, 8    y4..y7 <- 15, 14, 13, 12
 * t0 is clobbered. round selects a 16-byte key slot.
 * NOTE(review): the reversed order presumably compensates for round keys
 * being stored as little-endian 32-bit words -- confirm against the key
 * schedule, which is outside this chunk.
 */
26862306a36Sopenharmony_ci#define aria_ark_16way(x0, x1, x2, x3,			\
26962306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
27062306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
27162306a36Sopenharmony_ci		       y4, y5, y6, y7,			\
27262306a36Sopenharmony_ci		       t0, rk, round)			\
27362306a36Sopenharmony_ci	/* AddRoundKey */                               \
27462306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 3)(rk), t0;	\
27562306a36Sopenharmony_ci	vpxorq t0, x0, x0;				\
27662306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 2)(rk), t0;	\
27762306a36Sopenharmony_ci	vpxorq t0, x1, x1;				\
27862306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 1)(rk), t0;	\
27962306a36Sopenharmony_ci	vpxorq t0, x2, x2;				\
28062306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 0)(rk), t0;	\
28162306a36Sopenharmony_ci	vpxorq t0, x3, x3;				\
28262306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 7)(rk), t0;	\
28362306a36Sopenharmony_ci	vpxorq t0, x4, x4;				\
28462306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 6)(rk), t0;	\
28562306a36Sopenharmony_ci	vpxorq t0, x5, x5;				\
28662306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 5)(rk), t0;	\
28762306a36Sopenharmony_ci	vpxorq t0, x6, x6;				\
28862306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 4)(rk), t0;	\
28962306a36Sopenharmony_ci	vpxorq t0, x7, x7;				\
29062306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 11)(rk), t0;	\
29162306a36Sopenharmony_ci	vpxorq t0, y0, y0;				\
29262306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 10)(rk), t0;	\
29362306a36Sopenharmony_ci	vpxorq t0, y1, y1;				\
29462306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 9)(rk), t0;	\
29562306a36Sopenharmony_ci	vpxorq t0, y2, y2;				\
29662306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 8)(rk), t0;	\
29762306a36Sopenharmony_ci	vpxorq t0, y3, y3;				\
29862306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 15)(rk), t0;	\
29962306a36Sopenharmony_ci	vpxorq t0, y4, y4;				\
30062306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 14)(rk), t0;	\
30162306a36Sopenharmony_ci	vpxorq t0, y5, y5;				\
30262306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 13)(rk), t0;	\
30362306a36Sopenharmony_ci	vpxorq t0, y6, y6;				\
30462306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + 12)(rk), t0;	\
30562306a36Sopenharmony_ci	vpxorq t0, y7, y7;
30662306a36Sopenharmony_ci
/*
 * aria_sbox_8way_gfni(x0..x7, t0..t7)
 *
 * Byte substitution on eight vectors using GFNI. The five 8x8 bit matrices
 * are broadcast into t0..t4 (t5..t7 are accepted but not used), then:
 *   x0, x4: vgf2p8affineinvqb with the "AES affine" matrix/constant
 *           (field inversion followed by the affine map);
 *   x1, x5: vgf2p8affineinvqb with the "S2" matrix/constant;
 *   x2, x6: vgf2p8affineqb with the "AES inverse affine" matrix/constant,
 *           then field inversion (identity matrix, constant 0);
 *   x3, x7: vgf2p8affineqb with the "X2" matrix/constant, then field
 *           inversion (identity matrix, constant 0).
 * Which substitution a byte-slice receives is therefore decided by the
 * order in which the caller passes the registers.
 * NOTE(review): these four presumably correspond to ARIA's S1, S2, S1^-1
 * and S2^-1 -- confirm against the cipher specification.
 */
30762306a36Sopenharmony_ci#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
30862306a36Sopenharmony_ci			    x4, x5, x6, x7,		\
30962306a36Sopenharmony_ci			    t0, t1, t2, t3,		\
31062306a36Sopenharmony_ci			    t4, t5, t6, t7)		\
31162306a36Sopenharmony_ci	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
31262306a36Sopenharmony_ci	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
31362306a36Sopenharmony_ci	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
31462306a36Sopenharmony_ci	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
31562306a36Sopenharmony_ci	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
31662306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
31762306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
31862306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
31962306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
32062306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x2, x2;		\
32162306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x6, x6;		\
32262306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
32362306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
32462306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
32562306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
32662306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x3, x3;		\
32762306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x7, x7;
32862306a36Sopenharmony_ci
/*
 * aria_sbox_16way_gfni(x0..x7, y0..y7, t0..t7)
 *
 * Sixteen-vector form of aria_sbox_8way_gfni: the bit matrices are
 * broadcast once into t0..t4 (t5..t7 are accepted but not used) and the
 * same four substitutions are applied first to x0..x7 and then to y0..y7:
 *   index 0 and 4: inversion + "AES affine" map (vgf2p8affineinvqb);
 *   index 1 and 5: inversion + "S2" map (vgf2p8affineinvqb);
 *   index 2 and 6: "AES inverse affine" map (vgf2p8affineqb), then
 *                  inversion (identity matrix, constant 0);
 *   index 3 and 7: "X2" map (vgf2p8affineqb), then inversion.
 * The caller picks the substitution per byte-slice through the order of
 * the register arguments.
 */
32962306a36Sopenharmony_ci#define aria_sbox_16way_gfni(x0, x1, x2, x3,		\
33062306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
33162306a36Sopenharmony_ci			     y0, y1, y2, y3,		\
33262306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
33362306a36Sopenharmony_ci			     t0, t1, t2, t3,		\
33462306a36Sopenharmony_ci			     t4, t5, t6, t7)		\
33562306a36Sopenharmony_ci	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
33662306a36Sopenharmony_ci	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
33762306a36Sopenharmony_ci	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
33862306a36Sopenharmony_ci	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
33962306a36Sopenharmony_ci	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
34062306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
34162306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
34262306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
34362306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
34462306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x2, x2;		\
34562306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x6, x6;		\
34662306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
34762306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
34862306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
34962306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
35062306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x3, x3;		\
35162306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x7, x7;		\
35262306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1;	\
35362306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5;	\
35462306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, y2, y2;	\
35562306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, y6, y6;	\
35662306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, y2, y2;		\
35762306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, y6, y6;		\
35862306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0;	\
35962306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4;	\
36062306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, y3, y3;	\
36162306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, y7, y7;	\
36262306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, y3, y3;		\
36362306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, y7, y7;
36462306a36Sopenharmony_ci
36562306a36Sopenharmony_ci
/*
 * aria_diff_m(x0, x1, x2, x3, t0, t1, t2, t3)
 *
 * In-place XOR mixing of four vectors. With a, b, c, d denoting the input
 * values of x0, x1, x2, x3, the results are:
 *   x0 = a ^ b ^ c
 *   x1 = a ^ b ^ d
 *   x2 = a ^ c ^ d
 *   x3 = b ^ c ^ d
 * i.e. every output is the XOR of three of the four inputs. The inline
 * comments relate this to rotations of a 32-bit word whose four bytes live
 * in the four byte-sliced registers.
 * t0..t3 are clobbered. The statement order matters: x1 and x3 are still
 * needed as inputs after x0 and x2 have been overwritten, which is why x3
 * is produced in t3 and copied last.
 */
36662306a36Sopenharmony_ci#define aria_diff_m(x0, x1, x2, x3,			\
36762306a36Sopenharmony_ci		    t0, t1, t2, t3)			\
36862306a36Sopenharmony_ci	/* T = rotr32(X, 8); */				\
36962306a36Sopenharmony_ci	/* X ^= T */					\
37062306a36Sopenharmony_ci	vpxorq x0, x3, t0;				\
37162306a36Sopenharmony_ci	vpxorq x1, x0, t1;				\
37262306a36Sopenharmony_ci	vpxorq x2, x1, t2;				\
37362306a36Sopenharmony_ci	vpxorq x3, x2, t3;				\
37462306a36Sopenharmony_ci	/* X = T ^ rotr(X, 16); */			\
37562306a36Sopenharmony_ci	vpxorq t2, x0, x0;				\
37662306a36Sopenharmony_ci	vpxorq x1, t3, t3;				\
37762306a36Sopenharmony_ci	vpxorq t0, x2, x2;				\
37862306a36Sopenharmony_ci	vpxorq t1, x3, x1;				\
37962306a36Sopenharmony_ci	vmovdqu64 t3, x3;
38062306a36Sopenharmony_ci
/*
 * aria_diff_word(x0..x7, y0..y7)
 *
 * Word-level XOR mixing. The sixteen vectors are handled as four groups
 * (named t0..t3 in the inline comments):
 *   t0 = x0..x3, t1 = x4..x7, t2 = y0..y3, t3 = y4..y7
 * and the groups are combined element by element, in this order:
 *   t1 ^= t2;  t2 ^= t3;  t0 ^= t1;  t3 ^= t1;  t2 ^= t0;  t1 ^= t2;
 * Every step reads the result of earlier steps, so the order is part of
 * the definition. No scratch registers are used.
 */
38162306a36Sopenharmony_ci#define aria_diff_word(x0, x1, x2, x3,			\
38262306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
38362306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
38462306a36Sopenharmony_ci		       y4, y5, y6, y7)			\
38562306a36Sopenharmony_ci	/* t1 ^= t2; */					\
38662306a36Sopenharmony_ci	vpxorq y0, x4, x4;				\
38762306a36Sopenharmony_ci	vpxorq y1, x5, x5;				\
38862306a36Sopenharmony_ci	vpxorq y2, x6, x6;				\
38962306a36Sopenharmony_ci	vpxorq y3, x7, x7;				\
39062306a36Sopenharmony_ci							\
39162306a36Sopenharmony_ci	/* t2 ^= t3; */					\
39262306a36Sopenharmony_ci	vpxorq y4, y0, y0;				\
39362306a36Sopenharmony_ci	vpxorq y5, y1, y1;				\
39462306a36Sopenharmony_ci	vpxorq y6, y2, y2;				\
39562306a36Sopenharmony_ci	vpxorq y7, y3, y3;				\
39662306a36Sopenharmony_ci							\
39762306a36Sopenharmony_ci	/* t0 ^= t1; */					\
39862306a36Sopenharmony_ci	vpxorq x4, x0, x0;				\
39962306a36Sopenharmony_ci	vpxorq x5, x1, x1;				\
40062306a36Sopenharmony_ci	vpxorq x6, x2, x2;				\
40162306a36Sopenharmony_ci	vpxorq x7, x3, x3;				\
40262306a36Sopenharmony_ci							\
40362306a36Sopenharmony_ci	/* t3 ^= t1; */					\
40462306a36Sopenharmony_ci	vpxorq x4, y4, y4;				\
40562306a36Sopenharmony_ci	vpxorq x5, y5, y5;				\
40662306a36Sopenharmony_ci	vpxorq x6, y6, y6;				\
40762306a36Sopenharmony_ci	vpxorq x7, y7, y7;				\
40862306a36Sopenharmony_ci							\
40962306a36Sopenharmony_ci	/* t2 ^= t0; */					\
41062306a36Sopenharmony_ci	vpxorq x0, y0, y0;				\
41162306a36Sopenharmony_ci	vpxorq x1, y1, y1;				\
41262306a36Sopenharmony_ci	vpxorq x2, y2, y2;				\
41362306a36Sopenharmony_ci	vpxorq x3, y3, y3;				\
41462306a36Sopenharmony_ci							\
41562306a36Sopenharmony_ci	/* t1 ^= t2; */					\
41662306a36Sopenharmony_ci	vpxorq y0, x4, x4;				\
41762306a36Sopenharmony_ci	vpxorq y1, x5, x5;				\
41862306a36Sopenharmony_ci	vpxorq y2, x6, x6;				\
41962306a36Sopenharmony_ci	vpxorq y3, x7, x7;
42062306a36Sopenharmony_ci
/*
 * aria_fe_gfni(x0..x7, y0..y7, z0..z7, mem_tmp, rk, round)
 *
 * One full round on sixteen byte-sliced vectors:
 *   1. aria_ark_16way: XOR in the key bytes of slot `round` from rk
 *      (z0 is the scratch register).
 *   2. aria_sbox_16way_gfni with every register pair swapped
 *      (x2, x3, x0, x1, ...), which exchanges the substitutions applied to
 *      slices 0/1 with those applied to slices 2/3 compared to
 *      aria_fo_gfni.
 *   3. aria_diff_m on each of the four register groups.
 *   4. aria_diff_word, the byte permutation described in the inline
 *      comment (done purely by permuting macro arguments, no instructions),
 *      and aria_diff_word again on the permuted view.
 *
 * z0..z7 are scratch; mem_tmp is accepted but not referenced in the body.
 * Note: the last line ends with a line continuation, so the blank line
 * that follows the macro is part of its definition and must stay blank.
 * NOTE(review): presumably ARIA's even-round function -- confirm against
 * the cipher specification.
 */
42162306a36Sopenharmony_ci#define aria_fe_gfni(x0, x1, x2, x3,			\
42262306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
42362306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
42462306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
42562306a36Sopenharmony_ci		     z0, z1, z2, z3,			\
42662306a36Sopenharmony_ci		     z4, z5, z6, z7,			\
42762306a36Sopenharmony_ci		     mem_tmp, rk, round)		\
42862306a36Sopenharmony_ci	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
42962306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7,	\
43062306a36Sopenharmony_ci		       z0, rk, round);			\
43162306a36Sopenharmony_ci							\
43262306a36Sopenharmony_ci	aria_sbox_16way_gfni(x2, x3, x0, x1,		\
43362306a36Sopenharmony_ci			     x6, x7, x4, x5,		\
43462306a36Sopenharmony_ci			     y2, y3, y0, y1,		\
43562306a36Sopenharmony_ci			     y6, y7, y4, y5,		\
43662306a36Sopenharmony_ci			     z0, z1, z2, z3,		\
43762306a36Sopenharmony_ci			     z4, z5, z6, z7);		\
43862306a36Sopenharmony_ci							\
43962306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);	\
44062306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);	\
44162306a36Sopenharmony_ci	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);	\
44262306a36Sopenharmony_ci	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);	\
44362306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
44462306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
44562306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
44662306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
44762306a36Sopenharmony_ci	/* aria_diff_byte()				\
44862306a36Sopenharmony_ci	 * T3 = ABCD -> BADC				\
44962306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
45062306a36Sopenharmony_ci	 * T0 = ABCD -> CDAB				\
45162306a36Sopenharmony_ci	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
45262306a36Sopenharmony_ci	 * T1 = ABCD -> DCBA				\
45362306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
45462306a36Sopenharmony_ci	 */						\
45562306a36Sopenharmony_ci	aria_diff_word(x2, x3, x0, x1,			\
45662306a36Sopenharmony_ci		       x7, x6, x5, x4,			\
45762306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
45862306a36Sopenharmony_ci		       y5, y4, y7, y6);			\
45962306a36Sopenharmony_ci
46062306a36Sopenharmony_ci
/*
 * aria_fo_gfni(x0..x7, y0..y7, z0..z7, mem_tmp, rk, round)
 *
 * One full round on sixteen byte-sliced vectors:
 *   1. aria_ark_16way: XOR in the key bytes of slot `round` from rk
 *      (z0 is the scratch register).
 *   2. aria_sbox_16way_gfni with the registers in their natural order.
 *   3. aria_diff_m on each of the four register groups.
 *   4. aria_diff_word, the byte permutation described in the inline
 *      comment (done purely by permuting macro arguments, no instructions),
 *      and aria_diff_word again on the permuted view.
 *
 * Differs from aria_fe_gfni in the substitution layer's register order and
 * in which groups are permuted before the second aria_diff_word.
 * z0..z7 are scratch; mem_tmp is accepted but not referenced in the body.
 * NOTE(review): presumably ARIA's odd-round function -- confirm against
 * the cipher specification.
 */
46162306a36Sopenharmony_ci#define aria_fo_gfni(x0, x1, x2, x3,			\
46262306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
46362306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
46462306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
46562306a36Sopenharmony_ci		     z0, z1, z2, z3,			\
46662306a36Sopenharmony_ci		     z4, z5, z6, z7,			\
46762306a36Sopenharmony_ci		     mem_tmp, rk, round)		\
46862306a36Sopenharmony_ci	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
46962306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7,	\
47062306a36Sopenharmony_ci		       z0, rk, round);			\
47162306a36Sopenharmony_ci							\
47262306a36Sopenharmony_ci	aria_sbox_16way_gfni(x0, x1, x2, x3,		\
47362306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
47462306a36Sopenharmony_ci			     y0, y1, y2, y3,		\
47562306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
47662306a36Sopenharmony_ci			     z0, z1, z2, z3,		\
47762306a36Sopenharmony_ci			     z4, z5, z6, z7);		\
47862306a36Sopenharmony_ci							\
47962306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);	\
48062306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);	\
48162306a36Sopenharmony_ci	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);	\
48262306a36Sopenharmony_ci	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);	\
48362306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
48462306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
48562306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
48662306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
48762306a36Sopenharmony_ci	/* aria_diff_byte()				\
48862306a36Sopenharmony_ci	 * T1 = ABCD -> BADC				\
48962306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
49062306a36Sopenharmony_ci	 * T2 = ABCD -> CDAB				\
49162306a36Sopenharmony_ci	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
49262306a36Sopenharmony_ci	 * T3 = ABCD -> DCBA				\
49362306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
49462306a36Sopenharmony_ci	 */						\
49562306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
49662306a36Sopenharmony_ci		       x5, x4, x7, x6,			\
49762306a36Sopenharmony_ci		       y2, y3, y0, y1,			\
49862306a36Sopenharmony_ci		       y7, y6, y5, y4);
49962306a36Sopenharmony_ci
/*
 * aria_ff_gfni(x0..x7, y0..y7, z0..z7, mem_tmp, rk, round, last_round)
 *
 * Final round on sixteen byte-sliced vectors; unlike aria_fe_gfni and
 * aria_fo_gfni it contains no diffusion step:
 *   1. aria_ark_16way with key slot `round`;
 *   2. aria_sbox_16way_gfni with every register pair swapped (the same
 *      ordering aria_fe_gfni uses);
 *   3. aria_ark_16way with key slot `last_round`.
 *
 * z0..z7 are scratch (z0 also serves as the key broadcast register);
 * mem_tmp is accepted but not referenced in the body.
 */
50062306a36Sopenharmony_ci#define aria_ff_gfni(x0, x1, x2, x3,			\
50162306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
50262306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
50362306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
50462306a36Sopenharmony_ci		     z0, z1, z2, z3,			\
50562306a36Sopenharmony_ci		     z4, z5, z6, z7,			\
50662306a36Sopenharmony_ci		     mem_tmp, rk, round, last_round)	\
50762306a36Sopenharmony_ci	aria_ark_16way(x0, x1, x2, x3,			\
50862306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
50962306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
51062306a36Sopenharmony_ci		       y4, y5, y6, y7,			\
51162306a36Sopenharmony_ci		       z0, rk, round);			\
51262306a36Sopenharmony_ci	aria_sbox_16way_gfni(x2, x3, x0, x1,		\
51362306a36Sopenharmony_ci			     x6, x7, x4, x5,		\
51462306a36Sopenharmony_ci			     y2, y3, y0, y1,		\
51562306a36Sopenharmony_ci			     y6, y7, y4, y5,		\
51662306a36Sopenharmony_ci			     z0, z1, z2, z3,		\
51762306a36Sopenharmony_ci			     z4, z5, z6, z7);		\
51862306a36Sopenharmony_ci	aria_ark_16way(x0, x1, x2, x3,			\
51962306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
52062306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
52162306a36Sopenharmony_ci		       y4, y5, y6, y7,			\
52262306a36Sopenharmony_ci		       z0, rk, last_round);
52362306a36Sopenharmony_ci
52462306a36Sopenharmony_ci
/*
 * 64-byte constant, 64-byte aligned, in a mergeable read-only section:
 * four 128-bit little-endian values whose low quadwords are 0, 1, 2, 3 and
 * whose high quadwords are 0.
 * NOTE(review): presumably the per-lane counter offsets added to a
 * broadcast CTR value via add_le128 -- the user is outside this chunk.
 */
52562306a36Sopenharmony_ci.section        .rodata.cst64, "aM", @progbits, 64
52662306a36Sopenharmony_ci.align 64
52762306a36Sopenharmony_ci.Lcounter0123_lo:
52862306a36Sopenharmony_ci	.quad 0, 0
52962306a36Sopenharmony_ci	.quad 1, 0
53062306a36Sopenharmony_ci	.quad 2, 0
53162306a36Sopenharmony_ci	.quad 3, 0
53262306a36Sopenharmony_ci
/*
 * vpshufb control pattern used by byteslice_16x16b/debyteslice_16x16b.
 * SHUFB_BYTES(idx) expands to the four indices idx, idx + 4, idx + 8 and
 * idx + 12, so one 16-byte row is
 *   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
 * which gathers byte `idx` of each of the four dwords of a 128-bit lane.
 * The row is stored twice (32 bytes); the macros above load it with
 * vbroadcasti64x2, which reads a single 128-bit row and replicates it to
 * every lane.
 */
53362306a36Sopenharmony_ci.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
53462306a36Sopenharmony_ci.align 32
53562306a36Sopenharmony_ci#define SHUFB_BYTES(idx) \
53662306a36Sopenharmony_ci	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
53762306a36Sopenharmony_ci.Lshufb_16x16b:
53862306a36Sopenharmony_ci	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
53962306a36Sopenharmony_ci	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
54062306a36Sopenharmony_ci
/*
 * 16-byte constants in a mergeable read-only section, 16-byte aligned.
 * Each .Lcounter*_lo value is a 128-bit little-endian integer with the
 * increment (4, 8 or 16) in the low quadword and 0 in the high quadword;
 * .Lcounter1111_hi holds 1 in the high quadword and 0 in the low one, i.e.
 * the carry to add to the upper half.
 * NOTE(review): presumably broadcast to all four 128-bit lanes and fed to
 * add_le128 as lo_counter / hi_counter1 -- the users are outside this
 * chunk.
 */
54162306a36Sopenharmony_ci.section	.rodata.cst16, "aM", @progbits, 16
54262306a36Sopenharmony_ci.align 16
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_ci.Lcounter4444_lo:
54562306a36Sopenharmony_ci	.quad 4, 0
54662306a36Sopenharmony_ci.Lcounter8888_lo:
54762306a36Sopenharmony_ci	.quad 8, 0
54862306a36Sopenharmony_ci.Lcounter16161616_lo:
54962306a36Sopenharmony_ci	.quad 16, 0
55062306a36Sopenharmony_ci.Lcounter1111_hi:
55162306a36Sopenharmony_ci	.quad 0, 1
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci/* For CTR-mode IV byteswap */
/* vpshufb pattern 15, 14, ..., 0: reverses the 16 bytes of a 128-bit lane. */
55462306a36Sopenharmony_ci.Lbswap128_mask:
55562306a36Sopenharmony_ci	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
55662306a36Sopenharmony_ci	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
55762306a36Sopenharmony_ci
/*
 * 8-byte constants in a mergeable read-only section, 8-byte aligned: the
 * 8x8 bit matrices that the S-box macros broadcast with vpbroadcastq and
 * pass to vgf2p8affineqb / vgf2p8affineinvqb. Each matrix is built from
 * eight BV8() rows packed by BM8X8(), first row in the most significant
 * byte. The matching tf_*_const macro is the 8-bit constant given as the
 * instruction's immediate.
 */
55862306a36Sopenharmony_ci.section	.rodata.cst8, "aM", @progbits, 8
55962306a36Sopenharmony_ci.align 8
56062306a36Sopenharmony_ci/* AES affine: */
/* tf_aff_const evaluates to 0x63 (bits 0, 1, 5 and 6 set). */
56162306a36Sopenharmony_ci#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
56262306a36Sopenharmony_ci.Ltf_aff_bitmatrix:
56362306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
56462306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
56562306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
56662306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
56762306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
56862306a36Sopenharmony_ci		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
56962306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
57062306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
57162306a36Sopenharmony_ci
57262306a36Sopenharmony_ci/* AES inverse affine: */
/*
 * tf_inv_const evaluates to 0x05 (bits 0 and 2 set). The S-box macros use
 * this matrix with vgf2p8affineqb and follow it with a separate field
 * inversion (identity matrix, constant 0).
 */
57362306a36Sopenharmony_ci#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
57462306a36Sopenharmony_ci.Ltf_inv_bitmatrix:
57562306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
57662306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
57762306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
57862306a36Sopenharmony_ci		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
57962306a36Sopenharmony_ci		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
58062306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
58162306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
58262306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
58362306a36Sopenharmony_ci
58462306a36Sopenharmony_ci/*
 * S2:
 *
 * Bit matrix and 8-bit constant (tf_s2_const) for the transform labelled
 * S2 - presumably ARIA's second S-box; confirm against the S-box macros,
 * which are outside this part of the file.  Rows are packed by BM8X8()
 * with the first row in the most significant byte.
 */
58562306a36Sopenharmony_ci#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
58662306a36Sopenharmony_ci.Ltf_s2_bitmatrix:
58762306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
58862306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
58962306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
59062306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
59162306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
59262306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
59362306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
59462306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
59562306a36Sopenharmony_ci
59662306a36Sopenharmony_ci/*
 * X2:
 *
 * Bit matrix and 8-bit constant (tf_x2_const) for the transform labelled
 * X2 - presumably the inverse of S2; confirm against the S-box macros,
 * which are outside this part of the file.  Rows are packed by BM8X8()
 * with the first row in the most significant byte.
 */
59762306a36Sopenharmony_ci#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
59862306a36Sopenharmony_ci.Ltf_x2_bitmatrix:
59962306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
60062306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
60162306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
60262306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
60362306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
60462306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
60562306a36Sopenharmony_ci		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
60662306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
60762306a36Sopenharmony_ci
60862306a36Sopenharmony_ci/*
 * Identity matrix:
 *
 * Row i has only bit i set, so the linear part leaves its input byte
 * unchanged.  It has no accompanying tf_*_const here; how it is combined
 * with the other matrices is decided by macros outside this part of the
 * file.
 */
60962306a36Sopenharmony_ci.Ltf_id_bitmatrix:
61062306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
61162306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
61262306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
61362306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
61462306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
61562306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
61662306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
61762306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
61862306a36Sopenharmony_ci
61962306a36Sopenharmony_ci.text
/*
 * __aria_gfni_avx512_crypt_64way - round sequence shared by the 64-way
 * encrypt, decrypt and CTR entry points below.
 *
 * Runs inpack16_post, then alternating aria_fo_gfni / aria_fe_gfni round
 * macros for round arguments 0..10, and finishes according to
 * ARIA_CTX_rounds(CTX):
 *   12        -> aria_ff_gfni(11, 12)
 *   14        -> fe 11, fo 12, aria_ff_gfni(13, 14)
 *   otherwise -> fe 11, fo 12, fe 13, fo 14, aria_ff_gfni(15, 16)
 * followed by debyteslice_16x16b.  Any round count other than 12 or 14
 * takes the longest path; there is no explicit check for 16.
 *
 * The round macros are defined outside this part of the file; their
 * trailing integer arguments are presumably round-key indices relative
 * to %r9 (TODO confirm against the macro definitions).
 *
 * Callers consume the result from the registers in the order
 * %zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5, %zmm9, %zmm8,
 * %zmm11, %zmm10, %zmm12..%zmm15.  The encrypt/decrypt entry points also
 * use %rax (set to dst here) as their output pointer after the call.
 */
62062306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
62162306a36Sopenharmony_ci	/* input:
 *      %rdi: ctx, CTX (read for ARIA_CTX_rounds; not listed originally)
62262306a36Sopenharmony_ci	 *      %r9: rk
62362306a36Sopenharmony_ci	 *      %rsi: dst
62462306a36Sopenharmony_ci	 *      %rdx: src
62562306a36Sopenharmony_ci	 *      %zmm0..%zmm15: byte-sliced blocks
62662306a36Sopenharmony_ci	 */
62762306a36Sopenharmony_ci
62862306a36Sopenharmony_ci	FRAME_BEGIN
62962306a36Sopenharmony_ci
	/*
	 * %rax = dst and %r8 = dst + 8 * 64: the two memory operands handed
	 * to inpack16_post, the round macros (%rax) and debyteslice_16x16b.
	 * Presumably dst doubles as scratch space while the state is being
	 * transposed - confirm against the macros.
	 */
63062306a36Sopenharmony_ci	movq %rsi, %rax;
63162306a36Sopenharmony_ci	leaq 8 * 64(%rax), %r8;
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci	inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
63462306a36Sopenharmony_ci		      %zmm4, %zmm5, %zmm6, %zmm7,
63562306a36Sopenharmony_ci		      %zmm8, %zmm9, %zmm10, %zmm11,
63662306a36Sopenharmony_ci		      %zmm12, %zmm13, %zmm14,
63762306a36Sopenharmony_ci		      %zmm15, %rax, %r8);
	/*
	 * Rounds 0..10.  Note that the fo calls take the registers in natural
	 * order while the fe/ff calls take them in a fixed permuted order
	 * (3,2,1,0 / 6,7,4,5 / 9,8,11,10 / 12..15); the same permuted order
	 * is what callers read the result from.
	 */
63862306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
63962306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
64062306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
64162306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
64262306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
64362306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
64462306a36Sopenharmony_ci		     %rax, %r9, 0);
64562306a36Sopenharmony_ci	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
64662306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
64762306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
64862306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
64962306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
65062306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
65162306a36Sopenharmony_ci		     %rax, %r9, 1);
65262306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
65362306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
65462306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
65562306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
65662306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
65762306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
65862306a36Sopenharmony_ci		     %rax, %r9, 2);
65962306a36Sopenharmony_ci	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
66062306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
66162306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
66262306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
66362306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
66462306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
66562306a36Sopenharmony_ci		     %rax, %r9, 3);
66662306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
66762306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
66862306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
66962306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
67062306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
67162306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
67262306a36Sopenharmony_ci		     %rax, %r9, 4);
67362306a36Sopenharmony_ci	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
67462306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
67562306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
67662306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
67762306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
67862306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
67962306a36Sopenharmony_ci		     %rax, %r9, 5);
68062306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
68162306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
68262306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
68362306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
68462306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
68562306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
68662306a36Sopenharmony_ci		     %rax, %r9, 6);
68762306a36Sopenharmony_ci	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
68862306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
68962306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
69062306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
69162306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
69262306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
69362306a36Sopenharmony_ci		     %rax, %r9, 7);
69462306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
69562306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
69662306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
69762306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
69862306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
69962306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
70062306a36Sopenharmony_ci		     %rax, %r9, 8);
70162306a36Sopenharmony_ci	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
70262306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
70362306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
70462306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
70562306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
70662306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
70762306a36Sopenharmony_ci		     %rax, %r9, 9);
70862306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
70962306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
71062306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
71162306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
71262306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
71362306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
71462306a36Sopenharmony_ci		     %rax, %r9, 10);
	/* 12-round contexts finish here with the final-round macro. */
71562306a36Sopenharmony_ci	cmpl $12, ARIA_CTX_rounds(CTX);
71662306a36Sopenharmony_ci	jne .Laria_gfni_192;
71762306a36Sopenharmony_ci	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
71862306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
71962306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
72062306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
72162306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
72262306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
72362306a36Sopenharmony_ci		     %rax, %r9, 11, 12);
72462306a36Sopenharmony_ci	jmp .Laria_gfni_end;
	/* rounds != 12: two more rounds, then 14-round contexts finish. */
72562306a36Sopenharmony_ci.Laria_gfni_192:
72662306a36Sopenharmony_ci	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
72762306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
72862306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
72962306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
73062306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
73162306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
73262306a36Sopenharmony_ci		     %rax, %r9, 11);
73362306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
73462306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
73562306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
73662306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
73762306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
73862306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
73962306a36Sopenharmony_ci		     %rax, %r9, 12);
74062306a36Sopenharmony_ci	cmpl $14, ARIA_CTX_rounds(CTX);
74162306a36Sopenharmony_ci	jne .Laria_gfni_256;
74262306a36Sopenharmony_ci	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
74362306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
74462306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
74562306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
74662306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
74762306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
74862306a36Sopenharmony_ci		     %rax, %r9, 13, 14);
74962306a36Sopenharmony_ci	jmp .Laria_gfni_end;
	/* rounds is neither 12 nor 14: two more rounds plus the final one. */
75062306a36Sopenharmony_ci.Laria_gfni_256:
75162306a36Sopenharmony_ci	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
75262306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
75362306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
75462306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
75562306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
75662306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
75762306a36Sopenharmony_ci		     %rax, %r9, 13);
75862306a36Sopenharmony_ci	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
75962306a36Sopenharmony_ci		     %zmm4, %zmm5, %zmm6, %zmm7,
76062306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11,
76162306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
76262306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
76362306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
76462306a36Sopenharmony_ci		     %rax, %r9, 14);
76562306a36Sopenharmony_ci	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
76662306a36Sopenharmony_ci		     %zmm6, %zmm7, %zmm4, %zmm5,
76762306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10,
76862306a36Sopenharmony_ci		     %zmm12, %zmm13, %zmm14, %zmm15,
76962306a36Sopenharmony_ci		     %zmm24, %zmm25, %zmm26, %zmm27,
77062306a36Sopenharmony_ci		     %zmm28, %zmm29, %zmm30, %zmm31,
77162306a36Sopenharmony_ci		     %rax, %r9, 15, 16);
	/* All three paths join here to undo the byte-sliced layout. */
77262306a36Sopenharmony_ci.Laria_gfni_end:
77362306a36Sopenharmony_ci	debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
77462306a36Sopenharmony_ci			   %zmm8, %zmm13, %zmm2, %zmm7,
77562306a36Sopenharmony_ci			   %zmm11, %zmm14, %zmm1, %zmm4,
77662306a36Sopenharmony_ci			   %zmm10, %zmm15, %zmm0, %zmm5,
77762306a36Sopenharmony_ci			   (%rax), (%r8));
77862306a36Sopenharmony_ci	FRAME_END
77962306a36Sopenharmony_ci	RET;
78062306a36Sopenharmony_ciSYM_FUNC_END(__aria_gfni_avx512_crypt_64way)
78162306a36Sopenharmony_ci
/*
 * aria_gfni_avx512_encrypt_64way - encrypt entry point, (ctx, dst, src)
 * in %rdi, %rsi, %rdx.
 *
 * Points %r9 at the encryption round keys inside the context, lets
 * inpack16_pre fill %zmm0..%zmm15 from src, runs the shared round
 * sequence and stores the result through write_output.  Apart from the
 * key schedule selected, it is identical to the decrypt entry point.
 */
78262306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
78362306a36Sopenharmony_ci	/* input:
78462306a36Sopenharmony_ci	 *      %rdi: ctx, CTX
78562306a36Sopenharmony_ci	 *      %rsi: dst
78662306a36Sopenharmony_ci	 *      %rdx: src
78762306a36Sopenharmony_ci	 */
78862306a36Sopenharmony_ci
78962306a36Sopenharmony_ci	FRAME_BEGIN
79062306a36Sopenharmony_ci
	/* %r9 = address of the encryption key schedule (the helper's "rk") */
79162306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
79262306a36Sopenharmony_ci
79362306a36Sopenharmony_ci	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
79462306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
79562306a36Sopenharmony_ci		     %zmm15, %rdx);
79662306a36Sopenharmony_ci
79762306a36Sopenharmony_ci	call __aria_gfni_avx512_crypt_64way;
79862306a36Sopenharmony_ci
	/*
	 * The helper copies dst (%rsi) into %rax on entry; %rax is used as
	 * the output pointer here.  The register order matches the permuted
	 * order used by the helper's final round.
	 */
79962306a36Sopenharmony_ci	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
80062306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
80162306a36Sopenharmony_ci		     %zmm15, %rax);
80262306a36Sopenharmony_ci
80362306a36Sopenharmony_ci	FRAME_END
80462306a36Sopenharmony_ci	RET;
80562306a36Sopenharmony_ciSYM_FUNC_END(aria_gfni_avx512_encrypt_64way)
80662306a36Sopenharmony_ci
/*
 * aria_gfni_avx512_decrypt_64way - decrypt entry point, (ctx, dst, src)
 * in %rdi, %rsi, %rdx.
 *
 * Same flow as the encrypt entry point, but %r9 is pointed at the
 * decryption round keys; the shared round sequence itself is unchanged.
 */
80762306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
80862306a36Sopenharmony_ci	/* input:
80962306a36Sopenharmony_ci	 *      %rdi: ctx, CTX
81062306a36Sopenharmony_ci	 *      %rsi: dst
81162306a36Sopenharmony_ci	 *      %rdx: src
81262306a36Sopenharmony_ci	 */
81362306a36Sopenharmony_ci
81462306a36Sopenharmony_ci	FRAME_BEGIN
81562306a36Sopenharmony_ci
	/* %r9 = address of the decryption key schedule (the helper's "rk") */
81662306a36Sopenharmony_ci	leaq ARIA_CTX_dec_key(CTX), %r9;
81762306a36Sopenharmony_ci
81862306a36Sopenharmony_ci	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
81962306a36Sopenharmony_ci		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
82062306a36Sopenharmony_ci		     %zmm15, %rdx);
82162306a36Sopenharmony_ci
82262306a36Sopenharmony_ci	call __aria_gfni_avx512_crypt_64way;
82362306a36Sopenharmony_ci
	/* %rax was set to dst (%rsi) by the helper; results are written there. */
82462306a36Sopenharmony_ci	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
82562306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
82662306a36Sopenharmony_ci		     %zmm15, %rax);
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci	FRAME_END
82962306a36Sopenharmony_ci	RET;
83062306a36Sopenharmony_ciSYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
83162306a36Sopenharmony_ci
/*
 * __aria_gfni_avx512_ctr_gen_keystream_64way - build 64 consecutive CTR
 * counter blocks in %zmm0..%zmm15 and advance the IV in memory by 64.
 *
 * Each zmm register holds four 128-bit counters; per the inline comments
 * %zmm0 holds IV+0..IV+3, %zmm1 IV+4..IV+7, and so on up to %zmm15 with
 * IV+60..IV+63 (the +0..+3 pattern comes from .Lcounter0123_lo, which is
 * defined outside this part of the file).  The counters are returned in
 * the same big-endian byte order as the IV.
 *
 * Only the IV pointer (%r8) is dereferenced; %rdi, %rsi, %rdx and %rcx
 * are left untouched for the caller.  Clobbers %r10, %r11, %zmm19..%zmm25
 * and, on the carry path, %k1 (via add_le128).
 */
83262306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
83362306a36Sopenharmony_ci	/* input:
83462306a36Sopenharmony_ci	 *      %rdi: ctx
83562306a36Sopenharmony_ci	 *      %rsi: dst
83662306a36Sopenharmony_ci	 *      %rdx: src
83762306a36Sopenharmony_ci	 *      %rcx: keystream
83862306a36Sopenharmony_ci	 *      %r8: iv (big endian, 128bit)
83962306a36Sopenharmony_ci	 */
84062306a36Sopenharmony_ci
84162306a36Sopenharmony_ci	FRAME_BEGIN
84262306a36Sopenharmony_ci
	/*
	 * Constants: %zmm19 = byte-reversal mask, %zmm21 = per-lane start
	 * offsets, %zmm22/%zmm23/%zmm24 = +4/+8/+16 on the low quadword,
	 * %zmm25 = +1 on the high quadword (carry).
	 */
84362306a36Sopenharmony_ci	vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
84462306a36Sopenharmony_ci	vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
84562306a36Sopenharmony_ci	vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
84662306a36Sopenharmony_ci	vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
84762306a36Sopenharmony_ci	vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
84862306a36Sopenharmony_ci	vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;
84962306a36Sopenharmony_ci
85062306a36Sopenharmony_ci	/* load IV and byteswap */
	/*
	 * Scalar copy: %r11 = low 64 bits (IV bytes 8..15), %r10 = high 64
	 * bits (IV bytes 0..7), both converted to native byte order.
	 * Vector copy: %zmm20 = IV replicated into all four lanes and
	 * byte-reversed so that vpaddq on the low quadword increments it.
	 */
85162306a36Sopenharmony_ci	movq 8(%r8), %r11;
85262306a36Sopenharmony_ci	movq (%r8), %r10;
85362306a36Sopenharmony_ci	bswapq %r11;
85462306a36Sopenharmony_ci	bswapq %r10;
85562306a36Sopenharmony_ci	vbroadcasti64x2 (%r8), %zmm20;
85662306a36Sopenharmony_ci	vpshufb %zmm19, %zmm20, %zmm20;
85762306a36Sopenharmony_ci
85862306a36Sopenharmony_ci	/* check need for handling 64-bit overflow and carry */
	/*
	 * If the low quadword is above 2^64 - 1 - 64, take the slow path that
	 * propagates carries into the high quadword.  Otherwise none of the
	 * additions below can wrap and plain vpaddq is enough.
	 */
85962306a36Sopenharmony_ci	cmpq $(0xffffffffffffffff - 64), %r11;
86062306a36Sopenharmony_ci	ja .Lload_ctr_carry;
86162306a36Sopenharmony_ci
86262306a36Sopenharmony_ci	/* construct IVs */
86362306a36Sopenharmony_ci	vpaddq %zmm21, %zmm20, %zmm0;  /* +0:+1:+2:+3 */
86462306a36Sopenharmony_ci	vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
86562306a36Sopenharmony_ci	vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
86662306a36Sopenharmony_ci	vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
86762306a36Sopenharmony_ci	vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
86862306a36Sopenharmony_ci	vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
86962306a36Sopenharmony_ci	vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
87062306a36Sopenharmony_ci	vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
87162306a36Sopenharmony_ci	vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
87262306a36Sopenharmony_ci	vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
87362306a36Sopenharmony_ci	vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
87462306a36Sopenharmony_ci	vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
87562306a36Sopenharmony_ci	vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
87662306a36Sopenharmony_ci	vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
87762306a36Sopenharmony_ci	vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
87862306a36Sopenharmony_ci	vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
87962306a36Sopenharmony_ci	jmp .Lload_ctr_done;
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ci.Lload_ctr_carry:
88262306a36Sopenharmony_ci	/* construct IVs */
	/*
	 * Same dependency chain as the fast path, but each step goes through
	 * add_le128, which also adds %zmm25 to the high quadword of any lane
	 * whose low quadword wrapped.
	 */
88362306a36Sopenharmony_ci	add_le128(%zmm0, %zmm20, %zmm21, %zmm25);  /* +0:+1:+2:+3 */
88462306a36Sopenharmony_ci	add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
88562306a36Sopenharmony_ci	add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
88662306a36Sopenharmony_ci	add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
88762306a36Sopenharmony_ci	add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
88862306a36Sopenharmony_ci	add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
88962306a36Sopenharmony_ci	add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
89062306a36Sopenharmony_ci	add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
89162306a36Sopenharmony_ci	add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
89262306a36Sopenharmony_ci	add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
89362306a36Sopenharmony_ci	add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
89462306a36Sopenharmony_ci	add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
89562306a36Sopenharmony_ci	add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
89662306a36Sopenharmony_ci	add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
89762306a36Sopenharmony_ci	add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
89862306a36Sopenharmony_ci	add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */
89962306a36Sopenharmony_ci
90062306a36Sopenharmony_ci.Lload_ctr_done:
90162306a36Sopenharmony_ci	/* Byte-swap IVs and update counter. */
	/*
	 * The scalar copy is advanced by 64 as a full 128-bit add (addq/adcq)
	 * and written back to *iv in big-endian form, so the next call
	 * continues where this one stopped.  The scalar instructions are
	 * interleaved with the vector byte swaps; they are independent.
	 */
90262306a36Sopenharmony_ci	addq $64, %r11;
90362306a36Sopenharmony_ci	adcq $0, %r10;
90462306a36Sopenharmony_ci	vpshufb %zmm19, %zmm15, %zmm15;
90562306a36Sopenharmony_ci	vpshufb %zmm19, %zmm14, %zmm14;
90662306a36Sopenharmony_ci	vpshufb %zmm19, %zmm13, %zmm13;
90762306a36Sopenharmony_ci	vpshufb %zmm19, %zmm12, %zmm12;
90862306a36Sopenharmony_ci	vpshufb %zmm19, %zmm11, %zmm11;
90962306a36Sopenharmony_ci	vpshufb %zmm19, %zmm10, %zmm10;
91062306a36Sopenharmony_ci	vpshufb %zmm19, %zmm9, %zmm9;
91162306a36Sopenharmony_ci	vpshufb %zmm19, %zmm8, %zmm8;
91262306a36Sopenharmony_ci	bswapq %r11;
91362306a36Sopenharmony_ci	bswapq %r10;
91462306a36Sopenharmony_ci	vpshufb %zmm19, %zmm7, %zmm7;
91562306a36Sopenharmony_ci	vpshufb %zmm19, %zmm6, %zmm6;
91662306a36Sopenharmony_ci	vpshufb %zmm19, %zmm5, %zmm5;
91762306a36Sopenharmony_ci	vpshufb %zmm19, %zmm4, %zmm4;
91862306a36Sopenharmony_ci	vpshufb %zmm19, %zmm3, %zmm3;
91962306a36Sopenharmony_ci	vpshufb %zmm19, %zmm2, %zmm2;
92062306a36Sopenharmony_ci	vpshufb %zmm19, %zmm1, %zmm1;
92162306a36Sopenharmony_ci	vpshufb %zmm19, %zmm0, %zmm0;
92262306a36Sopenharmony_ci	movq %r11, 8(%r8);
92362306a36Sopenharmony_ci	movq %r10, (%r8);
92462306a36Sopenharmony_ci
92562306a36Sopenharmony_ci	FRAME_END
92662306a36Sopenharmony_ci	RET;
92762306a36Sopenharmony_ciSYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)
92862306a36Sopenharmony_ci
/*
 * aria_gfni_avx512_ctr_crypt_64way - CTR-mode entry point for 64 blocks.
 *
 * 1. Generates 64 counter blocks in %zmm0..%zmm15 and advances *iv.
 * 2. Encrypts them with the encryption key schedule; CTR mode uses the
 *    encrypt direction for both encryption and decryption.
 * 3. XORs the encrypted counters with 16 x 64 bytes read from src and
 *    writes the result to dst through write_output.
 */
92962306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
93062306a36Sopenharmony_ci	/* input:
93162306a36Sopenharmony_ci	 *      %rdi: ctx
93262306a36Sopenharmony_ci	 *      %rsi: dst
93362306a36Sopenharmony_ci	 *      %rdx: src
93462306a36Sopenharmony_ci	 *      %rcx: keystream
93562306a36Sopenharmony_ci	 *      %r8: iv (big endian, 128bit)
93662306a36Sopenharmony_ci	 */
93762306a36Sopenharmony_ci	FRAME_BEGIN
93862306a36Sopenharmony_ci
93962306a36Sopenharmony_ci	call __aria_gfni_avx512_ctr_gen_keystream_64way
94062306a36Sopenharmony_ci
	/*
	 * dst and src are saved in %r10/%r11 only now because the keystream
	 * generator above uses those two registers as scratch.  The shared
	 * round helper is then given the keystream buffer as both its dst
	 * and src, so the real dst is not touched until the final store.
	 * This relies on the helper and its macros leaving %r10/%r11 alone
	 * (their definitions are outside this part of the file).
	 */
94162306a36Sopenharmony_ci	leaq (%rsi), %r10;
94262306a36Sopenharmony_ci	leaq (%rdx), %r11;
94362306a36Sopenharmony_ci	leaq (%rcx), %rsi;
94462306a36Sopenharmony_ci	leaq (%rcx), %rdx;
94562306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
94662306a36Sopenharmony_ci
94762306a36Sopenharmony_ci	call __aria_gfni_avx512_crypt_64way;
94862306a36Sopenharmony_ci
	/*
	 * XOR with the input, 64 bytes per register.  The register order
	 * (3,2,1,0 / 6,7,4,5 / 9,8,11,10 / 12..15) is the order in which the
	 * helper leaves the blocks, matching write_output below.
	 */
94962306a36Sopenharmony_ci	vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
95062306a36Sopenharmony_ci	vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
95162306a36Sopenharmony_ci	vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
95262306a36Sopenharmony_ci	vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
95362306a36Sopenharmony_ci	vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
95462306a36Sopenharmony_ci	vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
95562306a36Sopenharmony_ci	vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
95662306a36Sopenharmony_ci	vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
95762306a36Sopenharmony_ci	vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
95862306a36Sopenharmony_ci	vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
95962306a36Sopenharmony_ci	vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
96062306a36Sopenharmony_ci	vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
96162306a36Sopenharmony_ci	vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
96262306a36Sopenharmony_ci	vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
96362306a36Sopenharmony_ci	vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
96462306a36Sopenharmony_ci	vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
	/* Store to the caller's original dst, saved in %r10 above. */
96562306a36Sopenharmony_ci	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
96662306a36Sopenharmony_ci		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
96762306a36Sopenharmony_ci		     %zmm15, %r10);
96862306a36Sopenharmony_ci
96962306a36Sopenharmony_ci	FRAME_END
97062306a36Sopenharmony_ci	RET;
97162306a36Sopenharmony_ciSYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)
972