/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#include <linux/linkage.h>
1062306a36Sopenharmony_ci#include <linux/cfi_types.h>
1162306a36Sopenharmony_ci#include <asm/asm-offsets.h>
1262306a36Sopenharmony_ci#include <asm/frame.h>
1362306a36Sopenharmony_ci
/*
 * Register aliases.
 *
 * CTX is %rdi, the first integer/pointer argument register of the SysV AMD64
 * calling convention.  NOTE(review): presumably the ARIA context pointer; no
 * use of CTX is visible in this part of the file -- confirm at the callers.
 */
#define CTX %rdi
1662306a36Sopenharmony_ci
1762306a36Sopenharmony_ci
/*
 * BV8 - pack eight single-bit values into one byte.
 *
 * a0 lands in bit 0 (LSB) and a7 in bit 7 (MSB).  Every argument is masked
 * with "& 1", so only its lowest bit contributes to the result.
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )
2762306a36Sopenharmony_ci
/*
 * BM8X8 - pack eight byte values into one 64-bit constant.
 *
 * l0 becomes the most significant byte (bits 63..56) and l7 the least
 * significant one (bits 7..0).  The arguments are not masked, so each one
 * must already fit in 8 bits (for example a BV8() result).
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )
3762306a36Sopenharmony_ci
/*
 * inc_le128 - add one to the 128-bit little-endian integer held in x.
 *
 * x:         xmm register with the value; updated in place
 * minus_one: xmm register; for the result to be x + 1 it must hold -1 in
 *            its low qword and 0 in its high qword (not modified)
 * tmp:       scratch xmm register, clobbered
 *
 * Subtracting minus_one adds 1 to the low qword only.  The compare result of
 * the low qword (all-ones when it was 0xffffffffffffffff, i.e. about to wrap)
 * is moved into the high qword by the 8-byte shift and subtracted there,
 * which propagates the carry.
 */
#define inc_le128(x, minus_one, tmp)			\
	vpcmpeqq minus_one, x, tmp;			\
	vpsubq minus_one, x, x;				\
	vpslldq $8, tmp, tmp;				\
	vpsubq tmp, x, x;
4362306a36Sopenharmony_ci
/*
 * filter_8bit - per-byte table transform built from two 16-entry lookups.
 *
 * For every byte b of x this computes
 *	lo_t[b & mask] ^ hi_t[(b & ~mask) >> 4]
 * using vpshufb as the table lookup.
 *
 * x:        xmm register, transformed in place
 * lo_t:     16-byte table indexed by the low nibble
 * hi_t:     16-byte table indexed by the high nibble
 * mask4bit: nibble mask; expected to be 0x0f in every byte (aria_sbox_8way
 *           passes a broadcast of .L0f0f0f0f)
 * tmp0:     scratch xmm register, clobbered
 *
 * The low nibbles are cleared before the dword shift, so no bits leak from
 * one byte into its neighbour.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;
5262306a36Sopenharmony_ci
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements in place.
 *
 * x0..x3: the four rows (one xmm register each); on exit x0..x3 hold the
 *         four columns of the input
 * t1, t2: scratch xmm registers, clobbered
 *
 * Dwords are interleaved pairwise first, then the resulting qword pairs are
 * recombined into the output rows.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;
6562306a36Sopenharmony_ci
/*
 * byteslice_16x16b - rearrange sixteen 16-byte vectors into byte-sliced form.
 *
 * a0..d3:   the sixteen xmm registers, transformed in place
 * st0, st1: two 16-byte memory scratch operands, clobbered
 *
 * All sixteen xmm registers are in use, so two of them are spilled to
 * st0/st1 whenever transpose_4x4 needs temporaries:
 *   1. 32-bit transposes of the a, b, c and d groups,
 *   2. a byte shuffle of every vector with .Lshufb_16x16b (a0 carries the
 *      shuffle control meanwhile, its data waits in st0),
 *   3. 32-bit transposes across the groups: (a0,b0,c0,d0) .. (a3,b3,c3,d3).
 */
#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
11762306a36Sopenharmony_ci
/*
 * debyteslice_16x16b - counterpart of byteslice_16x16b.
 *
 * a0..d3:   the sixteen xmm registers, transformed in place
 * st0, st1: two 16-byte memory scratch operands, clobbered
 *
 * The instruction sequence is the same as in byteslice_16x16b except for the
 * final cross-group transposes, which take their rows in the order
 * (c, d, a, b) instead of (a, b, c, d).
 */
#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
16962306a36Sopenharmony_ci
/*
 * inpack16_pre - load sixteen consecutive 16-byte blocks into registers.
 *
 * x0..x7, y0..y7: destination xmm registers; block i goes to the i-th
 *                 register of that list
 * rio:            base address of the 256 input bytes (unaligned loads are
 *                 used, so no alignment is required)
 *
 * Only loads are performed here; no key material is mixed in.
 */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu (0 * 16)(rio), x0;			\
	vmovdqu (1 * 16)(rio), x1;			\
	vmovdqu (2 * 16)(rio), x2;			\
	vmovdqu (3 * 16)(rio), x3;			\
	vmovdqu (4 * 16)(rio), x4;			\
	vmovdqu (5 * 16)(rio), x5;			\
	vmovdqu (6 * 16)(rio), x6;			\
	vmovdqu (7 * 16)(rio), x7;			\
	vmovdqu (8 * 16)(rio), y0;			\
	vmovdqu (9 * 16)(rio), y1;			\
	vmovdqu (10 * 16)(rio), y2;			\
	vmovdqu (11 * 16)(rio), y3;			\
	vmovdqu (12 * 16)(rio), y4;			\
	vmovdqu (13 * 16)(rio), y5;			\
	vmovdqu (14 * 16)(rio), y6;			\
	vmovdqu (15 * 16)(rio), y7;
19262306a36Sopenharmony_ci
/*
 * inpack16_post - byteslice the loaded blocks and store them to memory.
 *
 * x0..x7, y0..y7: the sixteen blocks; byte-sliced in place
 * mem_ab:         register holding the address of a 128-byte area that
 *                 receives x0..x7
 * mem_cd:         register holding the address of a 128-byte area that
 *                 receives y0..y7
 *
 * The first 16 bytes of each area double as the scratch slots of
 * byteslice_16x16b; they are overwritten by the stores that follow.
 */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, 0 * 16(mem_ab);			\
	vmovdqu x1, 1 * 16(mem_ab);			\
	vmovdqu x2, 2 * 16(mem_ab);			\
	vmovdqu x3, 3 * 16(mem_ab);			\
	vmovdqu x4, 4 * 16(mem_ab);			\
	vmovdqu x5, 5 * 16(mem_ab);			\
	vmovdqu x6, 6 * 16(mem_ab);			\
	vmovdqu x7, 7 * 16(mem_ab);			\
	vmovdqu y0, 0 * 16(mem_cd);			\
	vmovdqu y1, 1 * 16(mem_cd);			\
	vmovdqu y2, 2 * 16(mem_cd);			\
	vmovdqu y3, 3 * 16(mem_cd);			\
	vmovdqu y4, 4 * 16(mem_cd);			\
	vmovdqu y5, 5 * 16(mem_cd);			\
	vmovdqu y6, 6 * 16(mem_cd);			\
	vmovdqu y7, 7 * 16(mem_cd);
22162306a36Sopenharmony_ci
/*
 * write_output - store sixteen registers as sixteen consecutive 16-byte
 * blocks.
 *
 * x0..x7, y0..y7: source xmm registers; the i-th register of that list is
 *                 written to mem + i * 16
 * mem:            register holding the destination address (unaligned
 *                 stores are used)
 *
 * The last body line deliberately has no line continuation, so the macro
 * ends here regardless of what follows it in the file.
 */
#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 16(mem);			\
	vmovdqu x1, 1 * 16(mem);			\
	vmovdqu x2, 2 * 16(mem);			\
	vmovdqu x3, 3 * 16(mem);			\
	vmovdqu x4, 4 * 16(mem);			\
	vmovdqu x5, 5 * 16(mem);			\
	vmovdqu x6, 6 * 16(mem);			\
	vmovdqu x7, 7 * 16(mem);			\
	vmovdqu y0, 8 * 16(mem);			\
	vmovdqu y1, 9 * 16(mem);			\
	vmovdqu y2, 10 * 16(mem);			\
	vmovdqu y3, 11 * 16(mem);			\
	vmovdqu y4, 12 * 16(mem);			\
	vmovdqu y5, 13 * 16(mem);			\
	vmovdqu y6, 14 * 16(mem);			\
	vmovdqu y7, 15 * 16(mem);
24362306a36Sopenharmony_ci
/*
 * aria_store_state_8way - spill eight registers to the temporary state area.
 *
 * x0..x7:  source xmm registers (not modified)
 * mem_tmp: register holding the base address of the state area, organised
 *          as 16-byte slots
 * idx:     first slot to write; x0..x7 go to slots idx .. idx + 7
 */
#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
25562306a36Sopenharmony_ci
/*
 * aria_load_state_8way - reload eight registers from the temporary state
 * area.
 *
 * x0..x7:  destination xmm registers
 * mem_tmp: register holding the base address of the state area, organised
 *          as 16-byte slots
 * idx:     first slot to read; slots idx .. idx + 7 go to x0..x7
 */
#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
26762306a36Sopenharmony_ci
/*
 * aria_ark_8way - XOR eight round-key bytes into eight byte-sliced state
 * vectors.
 *
 * x0..x7: state vectors, updated in place
 * t0, t2: scratch xmm registers, clobbered
 * t1:     shuffle control for vpshufb; must be all-zero so that byte 0 of
 *         the source is replicated into all 16 bytes (the aria_f* macros
 *         clear it with vpxor before calling)
 * rk:     register holding the round key base address
 * idx:    byte offset inside the 16-byte round key (0 or 8 at the call
 *         sites in this file)
 * round:  round index; the key bytes are read from rk + round * 16 + idx
 *
 * Two 32-bit key words are read.  Each is broadcast to every dword lane and
 * shifted so that the wanted byte sits in byte 0, which is then replicated:
 * x0/x4 receive bits 31..24 of their word, x1/x5 bits 23..16, x2/x6 bits
 * 15..8 and x3/x7 bits 7..0.
 */
#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, t1, t2, rk,			\
		      idx, round)			\
	/* AddRoundKey */				\
	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x0, x0;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x1, x1;				\
	vpsrld $8, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x2, x2;				\
	vpshufb t1, t0, t2;				\
	vpxor t2, x3, x3;				\
	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x4, x4;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x5, x5;				\
	vpsrld $8, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x6, x6;				\
	vpshufb t1, t0, t2;				\
	vpxor t2, x7, x7;
29762306a36Sopenharmony_ci
#ifdef CONFIG_AS_GFNI
/*
 * aria_sbox_8way_gfni - substitution layer for eight byte-sliced vectors
 * using GFNI affine instructions.
 *
 * x0..x7: state vectors, substituted in place
 * t0..t4: scratch xmm registers, loaded with the five 8x8 bit matrices
 * t5..t7: accepted for signature parity with aria_sbox_8way; not used
 *
 * Per register pair:
 *   x1, x5: GF(2^8) inverse followed by the "s2" affine transform
 *   x2, x6: "inv" affine transform followed by a GF(2^8) inverse (the
 *           inverse is obtained with the identity matrix and constant 0)
 *   x0, x4: GF(2^8) inverse followed by the "aff" affine transform
 *   x3, x7: "x2" affine transform followed by a GF(2^8) inverse
 *
 * The matrices (.Ltf_*_bitmatrix) and constants (tf_*_const) are defined
 * elsewhere in the file.  The body has no trailing ';' -- the call site
 * supplies it.
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
32262306a36Sopenharmony_ci
/*
 * aria_sbox_8way - substitution layer for eight byte-sliced vectors built
 * on the AES-NI last-round instructions.
 *
 * x0..x7: state vectors, substituted in place
 * t0..t6: scratch xmm registers, clobbered (t0 first holds the
 *         .Linv_shift_row control and is later reused as the filter_8bit
 *         temporary)
 * t7:     must be all-zero on entry; it is the round key operand of
 *         vaesenclast/vaesdeclast, so the key XOR they perform is a no-op.
 *         Not modified.
 *
 * vaesenclast/vaesdeclast also permute bytes (ShiftRows / InvShiftRows).
 * That permutation is cancelled with a vpshufb using the opposite shuffle
 * table; for x3/x7 the compensating shuffle is applied before vaesdeclast.
 *
 * Per register pair:
 *   x0, x4: vaesenclast only
 *   x1, x5: vaesenclast, then the "inv_aff and s2" nibble tables
 *   x2, x6: vaesdeclast only
 *   x3, x7: the "x2 and fwd_aff" nibble tables, then vaesdeclast
 */
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vmovdqa .Linv_shift_row(%rip), t0;		\
	vmovdqa .Lshift_row(%rip), t1;			\
	vbroadcastss .L0f0f0f0f(%rip), t6;		\
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* AES inverse shift rows (undo vaesenclast) */	\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	/* AES shift rows (cancel vaesdeclast) */	\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;
36362306a36Sopenharmony_ci
/*
 * aria_diff_m - mix four byte-sliced vectors so that each output is the XOR
 * of three of the four inputs.
 *
 * x0..x3: input/output xmm registers
 * t0..t3: scratch xmm registers, clobbered
 *
 * With the input values written as x0..x3, the results are:
 *	x0' = x0 ^ x1 ^ x2
 *	x1' = x0 ^ x1 ^ x3
 *	x2' = x0 ^ x2 ^ x3
 *	x3' = x1 ^ x2 ^ x3
 *
 * Statement order matters: x1 is read (into t3) before it is overwritten by
 * the last vpxor, and x3' is staged in t3 until every read of x3 is done.
 */
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x0, x3, t0;				\
	vpxor x1, x0, t1;				\
	vpxor x2, x1, t2;				\
	vpxor x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t2, x0, x0;				\
	vpxor x1, t3, t3;				\
	vpxor t0, x2, x2;				\
	vpxor t1, x3, x1;				\
	vmovdqu t3, x3;
37862306a36Sopenharmony_ci
/*
 * aria_diff_word - word-level diffusion across the four 4-register groups.
 *
 * Naming the groups t0 = x0..x3, t1 = x4..x7, t2 = y0..y3, t3 = y4..y7,
 * the macro performs, element-wise on each group:
 *	t1 ^= t2;  t2 ^= t3;  t0 ^= t1;
 *	t3 ^= t1;  t2 ^= t0;  t1 ^= t2;
 *
 * All sixteen registers are updated in place; no scratch register is used.
 */
#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxor y4, y0, y0;				\
	vpxor y5, y1, y1;				\
	vpxor y6, y2, y2;				\
	vpxor y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxor x4, x0, x0;				\
	vpxor x5, x1, x1;				\
	vpxor x6, x2, x2;				\
	vpxor x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxor x4, y4, y4;				\
	vpxor x5, y5, y5;				\
	vpxor x6, y6, y6;				\
	vpxor x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxor x0, y0, y0;				\
	vpxor x1, y1, y1;				\
	vpxor x2, y2, y2;				\
	vpxor x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;
41862306a36Sopenharmony_ci
/*
 * aria_fe - one full round on the byte-sliced state, with the substitution
 * layer applied in the register order (x2, x3, x0, x1, x6, x7, x4, x5).
 *
 * x0..x7:  on entry, the state vectors that pair with round-key bytes
 *          8..15; they are processed first and parked in mem_tmp slots
 *          8..15
 * y0..y7:  scratch on entry, clobbered.  y7 is cleared here: it is both the
 *          all-zero vpshufb control of aria_ark_8way and the all-zero AES
 *          round key of aria_sbox_8way.
 * mem_tmp: register holding the base of sixteen 16-byte slots; slots 0..7
 *          hold the other half of the state on entry
 * rk:      register holding the round key base address
 * round:   round index passed on to aria_ark_8way
 *
 * Each half goes through AddRoundKey, the substitution layer and
 * aria_diff_m.  Both halves are then combined by aria_diff_word, a register
 * permutation (expressed purely through macro argument order) and a second
 * aria_diff_word.
 *
 * On exit the x registers have been written to mem_tmp slots 0..7 in the
 * order x3, x2, x1, x0, x6, x7, x4, x5; y0..y7 carry the other half in
 * registers only (slots 8..15 are not rewritten after the diff_word steps).
 */
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
47362306a36Sopenharmony_ci
/*
 * aria_fo - one full round on the byte-sliced state, with the substitution
 * layer applied in natural register order (x0 .. x7).
 *
 * x0..x7:  on entry, the state vectors that pair with round-key bytes
 *          8..15; they are processed first and parked in mem_tmp slots
 *          8..15
 * y0..y7:  scratch on entry, clobbered.  y7 is cleared here: it is both the
 *          all-zero vpshufb control of aria_ark_8way and the all-zero AES
 *          round key of aria_sbox_8way.
 * mem_tmp: register holding the base of sixteen 16-byte slots; slots 0..7
 *          hold the other half of the state on entry
 * rk:      register holding the round key base address
 * round:   round index passed on to aria_ark_8way
 *
 * Same structure as aria_fe; it differs in the substitution-layer argument
 * order and in the register permutation applied before the second
 * aria_diff_word (T1, T2 and T3 are permuted here instead of T3, T0, T1).
 *
 * On exit the x registers have been written to mem_tmp slots 0..7 in the
 * order x3, x2, x1, x0, x6, x7, x4, x5; y0..y7 carry the other half in
 * registers only (slots 8..15 are not rewritten after the diff_word steps).
 */
#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
52862306a36Sopenharmony_ci
/*
 * aria_ff - final transformation: AddRoundKey, substitution layer and a
 * second AddRoundKey, with no diffusion step.
 *
 * x0..x7:     on entry, the state vectors that pair with round-key bytes
 *             8..15; they are processed first and parked in mem_tmp slots
 *             8..15
 * y0..y7:     scratch during processing.  y7 is cleared here: it is both
 *             the all-zero vpshufb control of aria_ark_8way and the
 *             all-zero AES round key of aria_sbox_8way.
 * mem_tmp:    register holding the base of sixteen 16-byte slots; slots
 *             0..7 hold the other half of the state on entry
 * rk:         register holding the round key base address
 * round:      index of the key applied before the substitution layer
 * last_round: index of the key applied after the substitution layer
 *
 * On exit x0..x7 hold the processed half that was loaded from slots 0..7
 * and y0..y7 hold the processed half reloaded from slots 8..15.
 */
#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, last_round);	\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, last_round);	\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
56362306a36Sopenharmony_ci
56462306a36Sopenharmony_ci#ifdef CONFIG_AS_GFNI
/*
 * aria_fe_gfni - GFNI flavour of the "fe" round macro.
 *
 * x0..x7:  the eight state registers the macro starts working on
 * y0..y7:  further registers; y7 is cleared first and then passed to
 *          aria_ark_8way (presumably as a zero/scratch operand - TODO
 *          confirm against aria_ark_8way, defined outside this chunk)
 * mem_tmp: base of the memory area used to spill/reload register groups;
 *          the trailing 0 / 8 argument of the load/store helpers
 *          presumably selects which group is accessed - TODO confirm
 * rk:      round-key base
 * round:   round-key index
 *
 * Sequence, exactly as written below:
 *   1. aria_ark_8way with index 8, then aria_sbox_8way_gfni with the
 *      registers passed in the order x2, x3, x0, x1, x6, x7, x4, x5;
 *   2. aria_diff_m on x0..x3 and on x4..x7, store x0..x7 to slot 8;
 *   3. load slot 0 into x0..x7 and repeat steps 1-2 with index 0,
 *      storing the result to slot 0;
 *   4. load slot 8 into y0..y7 and run aria_diff_word twice, the second
 *      time with permuted arguments (see the aria_diff_byte() note);
 *   5. store x3, x2, x1, x0, x6, x7, x4, x5 to slot 0.
 * y0..y7 are not stored by this macro.
 */
56562306a36Sopenharmony_ci#define aria_fe_gfni(x0, x1, x2, x3,			\
56662306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
56762306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
56862306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
56962306a36Sopenharmony_ci		     mem_tmp, rk, round)		\
57062306a36Sopenharmony_ci	vpxor y7, y7, y7;				\
57162306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
57262306a36Sopenharmony_ci		      y0, y7, y2, rk, 8, round);	\
57362306a36Sopenharmony_ci							\
57462306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
57562306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
57662306a36Sopenharmony_ci			    y0, y1, y2, y3, 		\
57762306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
57862306a36Sopenharmony_ci							\
57962306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
58062306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
58162306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
58262306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
58362306a36Sopenharmony_ci			      mem_tmp, 8);		\
58462306a36Sopenharmony_ci							\
58562306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
58662306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
58762306a36Sopenharmony_ci			     mem_tmp, 0);		\
58862306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
58962306a36Sopenharmony_ci		      y0, y7, y2, rk, 0, round);	\
59062306a36Sopenharmony_ci							\
59162306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
59262306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
59362306a36Sopenharmony_ci			    y0, y1, y2, y3, 		\
59462306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
59562306a36Sopenharmony_ci							\
59662306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
59762306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
59862306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
59962306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
60062306a36Sopenharmony_ci			      mem_tmp, 0);		\
60162306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
60262306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
60362306a36Sopenharmony_ci			     mem_tmp, 8);		\
60462306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
60562306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
60662306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
60762306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
60862306a36Sopenharmony_ci	/* aria_diff_byte() 				\
60962306a36Sopenharmony_ci	 * T3 = ABCD -> BADC 				\
61062306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
61162306a36Sopenharmony_ci	 * T0 = ABCD -> CDAB 				\
61262306a36Sopenharmony_ci	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
61362306a36Sopenharmony_ci	 * T1 = ABCD -> DCBA 				\
61462306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
61562306a36Sopenharmony_ci	 */						\
61662306a36Sopenharmony_ci	aria_diff_word(x2, x3, x0, x1,			\
61762306a36Sopenharmony_ci		       x7, x6, x5, x4,			\
61862306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
61962306a36Sopenharmony_ci		       y5, y4, y7, y6);			\
62062306a36Sopenharmony_ci	aria_store_state_8way(x3, x2, x1, x0,		\
62162306a36Sopenharmony_ci			      x6, x7, x4, x5,		\
62262306a36Sopenharmony_ci			      mem_tmp, 0);
62362306a36Sopenharmony_ci
/*
 * aria_fo_gfni - GFNI flavour of the "fo" round macro.
 *
 * Parameters have the same roles as in the other *_gfni round macros:
 * x0..x7 and y0..y7 are registers, mem_tmp is the spill area, rk the
 * round-key base and round the round-key index.  y7 is cleared before it
 * is passed to aria_ark_8way (presumably as a zero/scratch operand - TODO
 * confirm against aria_ark_8way, defined outside this chunk).
 *
 * Sequence, exactly as written below:
 *   1. aria_ark_8way with index 8, then aria_sbox_8way_gfni with the
 *      registers passed in natural order x0..x7;
 *   2. aria_diff_m on x0..x3 and on x4..x7, store x0..x7 to slot 8;
 *   3. load slot 0 into x0..x7 and repeat steps 1-2 with index 0,
 *      storing the result to slot 0;
 *   4. load slot 8 into y0..y7 and run aria_diff_word twice, the second
 *      time with permuted arguments (see the aria_diff_byte() note);
 *   5. store x3, x2, x1, x0, x6, x7, x4, x5 to slot 0.
 * y0..y7 are not stored by this macro.
 */
62462306a36Sopenharmony_ci#define aria_fo_gfni(x0, x1, x2, x3,			\
62562306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
62662306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
62762306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
62862306a36Sopenharmony_ci		     mem_tmp, rk, round)		\
62962306a36Sopenharmony_ci	vpxor y7, y7, y7;				\
63062306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
63162306a36Sopenharmony_ci		      y0, y7, y2, rk, 8, round);	\
63262306a36Sopenharmony_ci							\
63362306a36Sopenharmony_ci	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
63462306a36Sopenharmony_ci			    x4, x5, x6, x7,		\
63562306a36Sopenharmony_ci			    y0, y1, y2, y3, 		\
63662306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
63762306a36Sopenharmony_ci							\
63862306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
63962306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
64062306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
64162306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
64262306a36Sopenharmony_ci			      mem_tmp, 8);		\
64362306a36Sopenharmony_ci							\
64462306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
64562306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
64662306a36Sopenharmony_ci			     mem_tmp, 0);		\
64762306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
64862306a36Sopenharmony_ci		      y0, y7, y2, rk, 0, round);	\
64962306a36Sopenharmony_ci							\
65062306a36Sopenharmony_ci	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
65162306a36Sopenharmony_ci			    x4, x5, x6, x7,		\
65262306a36Sopenharmony_ci			    y0, y1, y2, y3, 		\
65362306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
65462306a36Sopenharmony_ci							\
65562306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
65662306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
65762306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
65862306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
65962306a36Sopenharmony_ci			      mem_tmp, 0);		\
66062306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
66162306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
66262306a36Sopenharmony_ci			     mem_tmp, 8);		\
66362306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
66462306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
66562306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
66662306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
66762306a36Sopenharmony_ci	/* aria_diff_byte() 				\
66862306a36Sopenharmony_ci	 * T1 = ABCD -> BADC 				\
66962306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
67062306a36Sopenharmony_ci	 * T2 = ABCD -> CDAB 				\
67162306a36Sopenharmony_ci	 * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1 	\
67262306a36Sopenharmony_ci	 * T3 = ABCD -> DCBA 				\
67362306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
67462306a36Sopenharmony_ci	 */						\
67562306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
67662306a36Sopenharmony_ci		       x5, x4, x7, x6,			\
67762306a36Sopenharmony_ci		       y2, y3, y0, y1,			\
67862306a36Sopenharmony_ci		       y7, y6, y5, y4);			\
67962306a36Sopenharmony_ci	aria_store_state_8way(x3, x2, x1, x0,		\
68062306a36Sopenharmony_ci			      x6, x7, x4, x5,		\
68162306a36Sopenharmony_ci			      mem_tmp, 0);
68262306a36Sopenharmony_ci
/*
 * aria_ff_gfni - GFNI flavour of the "ff" macro, which takes two round-key
 * indices (round and last_round) and invokes no diffusion helper.
 *
 * x0..x7 and y0..y7 are registers, mem_tmp is the spill area and rk the
 * round-key base.  y7 is cleared before it is passed to aria_ark_8way
 * (presumably as a zero/scratch operand - TODO confirm against
 * aria_ark_8way, defined outside this chunk).
 *
 * Sequence, exactly as written below:
 *   1. aria_ark_8way with index 8 and key `round`;
 *   2. aria_sbox_8way_gfni with the registers passed in the order
 *      x2, x3, x0, x1, x6, x7, x4, x5;
 *   3. aria_ark_8way with index 8 and key `last_round`;
 *   4. store x0..x7 to slot 8, load slot 0 into x0..x7;
 *   5. repeat steps 1-3 with index 0;
 *   6. load slot 8 into y0..y7.
 * On exit both register groups (x0..x7 and y0..y7) hold data; nothing is
 * stored back to slot 0.
 */
68362306a36Sopenharmony_ci#define aria_ff_gfni(x0, x1, x2, x3,			\
68462306a36Sopenharmony_ci		x4, x5, x6, x7,				\
68562306a36Sopenharmony_ci		y0, y1, y2, y3,				\
68662306a36Sopenharmony_ci		y4, y5, y6, y7,				\
68762306a36Sopenharmony_ci		mem_tmp, rk, round, last_round)		\
68862306a36Sopenharmony_ci	vpxor y7, y7, y7;				\
68962306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
69062306a36Sopenharmony_ci		      y0, y7, y2, rk, 8, round);	\
69162306a36Sopenharmony_ci							\
69262306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
69362306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
69462306a36Sopenharmony_ci			    y0, y1, y2, y3, 		\
69562306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
69662306a36Sopenharmony_ci							\
69762306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
69862306a36Sopenharmony_ci		      y0, y7, y2, rk, 8, last_round);	\
69962306a36Sopenharmony_ci							\
70062306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
70162306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
70262306a36Sopenharmony_ci			      mem_tmp, 8);		\
70362306a36Sopenharmony_ci							\
70462306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
70562306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
70662306a36Sopenharmony_ci			     mem_tmp, 0);		\
70762306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
70862306a36Sopenharmony_ci		      y0, y7, y2, rk, 0, round);	\
70962306a36Sopenharmony_ci							\
71062306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
71162306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
71262306a36Sopenharmony_ci			    y0, y1, y2, y3, 		\
71362306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
71462306a36Sopenharmony_ci							\
71562306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
71662306a36Sopenharmony_ci		      y0, y7, y2, rk, 0, last_round);	\
71762306a36Sopenharmony_ci							\
71862306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
71962306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
72062306a36Sopenharmony_ci			     mem_tmp, 8);
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci#endif /* CONFIG_AS_GFNI */
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
72562306a36Sopenharmony_ci.section	.rodata.cst16, "aM", @progbits, 16
72662306a36Sopenharmony_ci.align 16
72762306a36Sopenharmony_ci
/*
 * SHUFB_BYTES(idx) expands to the four byte indices idx, 4 + idx, 8 + idx and
 * 12 + idx, so .Lshufb_16x16b is the 16-byte sequence
 * 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15.
 * NOTE(review): presumably a vpshufb mask for the byte-slicing transpose;
 * the use site is outside this chunk.
 */
72862306a36Sopenharmony_ci#define SHUFB_BYTES(idx) \
72962306a36Sopenharmony_ci	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
73062306a36Sopenharmony_ci
73162306a36Sopenharmony_ci.Lshufb_16x16b:
73262306a36Sopenharmony_ci	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
73362306a36Sopenharmony_ci/* For isolating SubBytes from AESENCLAST, inverse shift row */
73462306a36Sopenharmony_ci.Linv_shift_row:
73562306a36Sopenharmony_ci	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
73662306a36Sopenharmony_ci	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/* Byte permutation that is the exact inverse of .Linv_shift_row above */
73762306a36Sopenharmony_ci.Lshift_row:
73862306a36Sopenharmony_ci	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
73962306a36Sopenharmony_ci	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
74062306a36Sopenharmony_ci/* For CTR-mode IV byteswap */
/* Indices 15 down to 0: reverses the order of all 16 bytes */
74162306a36Sopenharmony_ci.Lbswap128_mask:
74262306a36Sopenharmony_ci	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
74362306a36Sopenharmony_ci	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
74462306a36Sopenharmony_ci
/*
 * The two table pairs below each consist of one 16-byte "lo" and one 16-byte
 * "hi" constant.
 * NOTE(review): presumably 16-entry nibble lookup tables (low / high nibble
 * of each byte) that together apply the 8x8 affine transform shown in the
 * comment above each pair; the macro that consumes them is outside this
 * chunk - confirm there.
 */
74562306a36Sopenharmony_ci/* AES inverse affine and S2 combined:
74662306a36Sopenharmony_ci *      1 1 0 0 0 0 0 1     x0     0
74762306a36Sopenharmony_ci *      0 1 0 0 1 0 0 0     x1     0
74862306a36Sopenharmony_ci *      1 1 0 0 1 1 1 1     x2     0
74962306a36Sopenharmony_ci *      0 1 1 0 1 0 0 1     x3     1
75062306a36Sopenharmony_ci *      0 1 0 0 1 1 0 0  *  x4  +  0
75162306a36Sopenharmony_ci *      0 1 0 1 1 0 0 0     x5     0
75262306a36Sopenharmony_ci *      0 0 0 0 0 1 0 1     x6     0
75362306a36Sopenharmony_ci *      1 1 1 0 0 1 1 1     x7     1
75462306a36Sopenharmony_ci */
75562306a36Sopenharmony_ci.Ltf_lo__inv_aff__and__s2:
75662306a36Sopenharmony_ci	.octa 0x92172DA81A9FA520B2370D883ABF8500
75762306a36Sopenharmony_ci.Ltf_hi__inv_aff__and__s2:
75862306a36Sopenharmony_ci	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
75962306a36Sopenharmony_ci
76062306a36Sopenharmony_ci/* X2 and AES forward affine combined:
76162306a36Sopenharmony_ci *      1 0 1 1 0 0 0 1     x0     0
76262306a36Sopenharmony_ci *      0 1 1 1 1 0 1 1     x1     0
76362306a36Sopenharmony_ci *      0 0 0 1 1 0 1 0     x2     1
76462306a36Sopenharmony_ci *      0 1 0 0 0 1 0 0     x3     0
76562306a36Sopenharmony_ci *      0 0 1 1 1 0 1 1  *  x4  +  0
76662306a36Sopenharmony_ci *      0 1 0 0 1 0 0 0     x5     0
76762306a36Sopenharmony_ci *      1 1 0 1 0 0 1 1     x6     0
76862306a36Sopenharmony_ci *      0 1 0 0 1 0 1 0     x7     0
76962306a36Sopenharmony_ci */
77062306a36Sopenharmony_ci.Ltf_lo__x2__and__fwd_aff:
77162306a36Sopenharmony_ci	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
77262306a36Sopenharmony_ci.Ltf_hi__x2__and__fwd_aff:
77362306a36Sopenharmony_ci	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci#ifdef CONFIG_AS_GFNI
77662306a36Sopenharmony_ci/* AES affine: */
/*
 * tf_aff_const evaluates to 0x63 (BV8() places its first argument in bit 0).
 * BM8X8() places its first row in the most significant byte of the quad.
 * The same .quad is emitted twice so the entry is 16 bytes long, as required
 * for elements of the mergeable .rodata.cst16 section.
 * NOTE(review): the constant is presumably the immediate used together with
 * this matrix by the GFNI affine instruction; the use site is outside this
 * chunk.
 */
77762306a36Sopenharmony_ci#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
77862306a36Sopenharmony_ci.Ltf_aff_bitmatrix:
77962306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
78062306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
78162306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
78262306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
78362306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
78462306a36Sopenharmony_ci		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
78562306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
78662306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
78762306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
78862306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
78962306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
79062306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
79162306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
79262306a36Sopenharmony_ci		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
79362306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
79462306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
79562306a36Sopenharmony_ci
79662306a36Sopenharmony_ci/* AES inverse affine: */
/*
 * tf_inv_const evaluates to 0x05 (BV8() places its first argument in bit 0).
 * The 8x8 bit matrix is emitted twice (two identical .quad values) so the
 * entry occupies a full 16-byte element of .rodata.cst16.
 */
79762306a36Sopenharmony_ci#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
79862306a36Sopenharmony_ci.Ltf_inv_bitmatrix:
79962306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
80062306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
80162306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
80262306a36Sopenharmony_ci		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
80362306a36Sopenharmony_ci		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
80462306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
80562306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
80662306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
80762306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
80862306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
80962306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
81062306a36Sopenharmony_ci		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
81162306a36Sopenharmony_ci		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
81262306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
81362306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
81462306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
81562306a36Sopenharmony_ci
81662306a36Sopenharmony_ci/* S2: */
/*
 * tf_s2_const evaluates to 0xE2 (BV8() places its first argument in bit 0).
 * The 8x8 bit matrix is emitted twice (two identical .quad values) so the
 * entry occupies a full 16-byte element of .rodata.cst16.
 */
81762306a36Sopenharmony_ci#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
81862306a36Sopenharmony_ci.Ltf_s2_bitmatrix:
81962306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
82062306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
82162306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
82262306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
82362306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
82462306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
82562306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
82662306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
82762306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
82862306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
82962306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
83062306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
83162306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
83262306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
83362306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
83462306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci/* X2: */
/*
 * tf_x2_const evaluates to 0x2C (BV8() places its first argument in bit 0).
 * The 8x8 bit matrix is emitted twice (two identical .quad values) so the
 * entry occupies a full 16-byte element of .rodata.cst16.
 */
83762306a36Sopenharmony_ci#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
83862306a36Sopenharmony_ci.Ltf_x2_bitmatrix:
83962306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
84062306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
84162306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
84262306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
84362306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
84462306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
84562306a36Sopenharmony_ci		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
84662306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
84762306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
84862306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
84962306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
85062306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
85162306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
85262306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
85362306a36Sopenharmony_ci		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
85462306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
85562306a36Sopenharmony_ci
85662306a36Sopenharmony_ci/* Identity matrix: */
/*
 * Row i has only bit i set, i.e. the 8x8 identity.  No tf_*_const is defined
 * alongside this matrix.  As with the other matrices, the .quad is emitted
 * twice to fill one 16-byte element of .rodata.cst16.
 */
85762306a36Sopenharmony_ci.Ltf_id_bitmatrix:
85862306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
85962306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
86062306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
86162306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
86262306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
86362306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
86462306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
86562306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
86662306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
86762306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
86862306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
86962306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
87062306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
87162306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
87262306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
87362306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
87462306a36Sopenharmony_ci#endif /* CONFIG_AS_GFNI */
87562306a36Sopenharmony_ci
87662306a36Sopenharmony_ci/* 4-bit mask */
/*
 * One 32-bit word with 0x0f in every byte, placed in its own mergeable
 * 4-byte-element section (.rodata.cst4.*) rather than in .rodata.cst16.
 * NOTE(review): presumably broadcast to a full register to isolate the low
 * nibble of each byte; the use site is outside this chunk.
 */
87762306a36Sopenharmony_ci.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
87862306a36Sopenharmony_ci.align 4
87962306a36Sopenharmony_ci.L0f0f0f0f:
88062306a36Sopenharmony_ci	.long 0x0f0f0f0f
88162306a36Sopenharmony_ci
88262306a36Sopenharmony_ci.text
88362306a36Sopenharmony_ci
88462306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
88562306a36Sopenharmony_ci	/* input:
	*      %rdi: ctx, CTX (ARIA_CTX_rounds is read to pick the round count)
88662306a36Sopenharmony_ci	*      %r9: rk
88762306a36Sopenharmony_ci	*      %rsi: dst
88862306a36Sopenharmony_ci	*      %rdx: src
88962306a36Sopenharmony_ci	*      %xmm0..%xmm15: 16 byte-sliced blocks
	*
	* Shared core of the 16-way AES-NI/AVX routines: runs the round
	* macros over the 16 blocks held in %xmm0..%xmm15 using the round
	* keys at %r9.  %rdx is not referenced directly in this routine.
	*
	* output:
	*      %xmm0..%xmm15: processed blocks; the callers in this file
	*      consume them in the order %xmm1, %xmm0, %xmm3, %xmm2,
	*      %xmm4..%xmm15
	*      %rax: copy of dst (callers pass it to write_output)
	*
	* clobbers: %rax, %r8, flags, and 256 bytes at dst, which the
	* round macros are handed as their mem_tmp spill area.
89062306a36Sopenharmony_ci	*/
89162306a36Sopenharmony_ci
89262306a36Sopenharmony_ci	FRAME_BEGIN
89362306a36Sopenharmony_ci
89462306a36Sopenharmony_ci	movq %rsi, %rax; /* %rax = dst, passed as mem_tmp to the round macros */
89562306a36Sopenharmony_ci	leaq 8 * 16(%rax), %r8; /* %r8 = dst + 128 (second group of 8 blocks) */
89662306a36Sopenharmony_ci
89762306a36Sopenharmony_ci	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
89862306a36Sopenharmony_ci		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
89962306a36Sopenharmony_ci		      %xmm15, %rax, %r8);
	/* Round keys 0..10 are used for every supported round count. */
90062306a36Sopenharmony_ci	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
90162306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
90262306a36Sopenharmony_ci		%rax, %r9, 0);
90362306a36Sopenharmony_ci	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
90462306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
90562306a36Sopenharmony_ci		%xmm15, %rax, %r9, 1);
90662306a36Sopenharmony_ci	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
90762306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
90862306a36Sopenharmony_ci		%rax, %r9, 2);
90962306a36Sopenharmony_ci	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
91062306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
91162306a36Sopenharmony_ci		%xmm15, %rax, %r9, 3);
91262306a36Sopenharmony_ci	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
91362306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
91462306a36Sopenharmony_ci		%rax, %r9, 4);
91562306a36Sopenharmony_ci	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
91662306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
91762306a36Sopenharmony_ci		%xmm15, %rax, %r9, 5);
91862306a36Sopenharmony_ci	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
91962306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
92062306a36Sopenharmony_ci		%rax, %r9, 6);
92162306a36Sopenharmony_ci	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
92262306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
92362306a36Sopenharmony_ci		%xmm15, %rax, %r9, 7);
92462306a36Sopenharmony_ci	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
92562306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
92662306a36Sopenharmony_ci		%rax, %r9, 8);
92762306a36Sopenharmony_ci	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
92862306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
92962306a36Sopenharmony_ci		%xmm15, %rax, %r9, 9);
93062306a36Sopenharmony_ci	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
93162306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
93262306a36Sopenharmony_ci		%rax, %r9, 10);
	/* rounds == 12: finish with aria_ff using round keys 11 and 12. */
93362306a36Sopenharmony_ci	cmpl $12, ARIA_CTX_rounds(CTX);
93462306a36Sopenharmony_ci	jne .Laria_192;
93562306a36Sopenharmony_ci	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
93662306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
93762306a36Sopenharmony_ci		%xmm15, %rax, %r9, 11, 12);
93862306a36Sopenharmony_ci	jmp .Laria_end;
	/* rounds != 12: two more rounds, then test for the 14-round case. */
93962306a36Sopenharmony_ci.Laria_192:
94062306a36Sopenharmony_ci	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
94162306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
94262306a36Sopenharmony_ci		%xmm15, %rax, %r9, 11);
94362306a36Sopenharmony_ci	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
94462306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
94562306a36Sopenharmony_ci		%rax, %r9, 12);
	/* rounds == 14: finish with aria_ff using round keys 13 and 14. */
94662306a36Sopenharmony_ci	cmpl $14, ARIA_CTX_rounds(CTX);
94762306a36Sopenharmony_ci	jne .Laria_256;
94862306a36Sopenharmony_ci	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
94962306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
95062306a36Sopenharmony_ci		%xmm15, %rax, %r9, 13, 14);
95162306a36Sopenharmony_ci	jmp .Laria_end;
	/* Any other round count: two more rounds, final keys 15 and 16. */
95262306a36Sopenharmony_ci.Laria_256:
95362306a36Sopenharmony_ci	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
95462306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
95562306a36Sopenharmony_ci		%xmm15, %rax, %r9, 13);
95662306a36Sopenharmony_ci	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
95762306a36Sopenharmony_ci		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
95862306a36Sopenharmony_ci		%rax, %r9, 14);
95962306a36Sopenharmony_ci	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
96062306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
96162306a36Sopenharmony_ci		%xmm15, %rax, %r9, 15, 16);
	/* Common exit: undo the byte-slicing; (%rax) and (%r8) are passed as memory operands. */
96262306a36Sopenharmony_ci.Laria_end:
96362306a36Sopenharmony_ci	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
96462306a36Sopenharmony_ci			   %xmm9, %xmm13, %xmm0, %xmm5,
96562306a36Sopenharmony_ci			   %xmm10, %xmm14, %xmm3, %xmm6,
96662306a36Sopenharmony_ci			   %xmm11, %xmm15, %xmm2, %xmm7,
96762306a36Sopenharmony_ci			   (%rax), (%r8));
96862306a36Sopenharmony_ci
96962306a36Sopenharmony_ci	FRAME_END
97062306a36Sopenharmony_ci	RET;
97162306a36Sopenharmony_ciSYM_FUNC_END(__aria_aesni_avx_crypt_16way)
97262306a36Sopenharmony_ci
97362306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
97462306a36Sopenharmony_ci	/* input:
97562306a36Sopenharmony_ci	*      %rdi: ctx, CTX
97662306a36Sopenharmony_ci	*      %rsi: dst
97762306a36Sopenharmony_ci	*      %rdx: src
	*
	* Encrypts 16 blocks: loads them from src, runs the shared core
	* with the encryption key schedule and writes the result to dst.
	* Clobbers %rax, %r8, %r9, flags and %xmm0..%xmm15.
97862306a36Sopenharmony_ci	*/
97962306a36Sopenharmony_ci
98062306a36Sopenharmony_ci	FRAME_BEGIN
98162306a36Sopenharmony_ci
98262306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9; /* %r9 = rk: encryption round keys */
98362306a36Sopenharmony_ci
98462306a36Sopenharmony_ci	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
98562306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
98662306a36Sopenharmony_ci		     %xmm15, %rdx);
98762306a36Sopenharmony_ci
98862306a36Sopenharmony_ci	call __aria_aesni_avx_crypt_16way;
98962306a36Sopenharmony_ci
	/* %rax was set to dst by __aria_aesni_avx_crypt_16way. */
99062306a36Sopenharmony_ci	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
99162306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
99262306a36Sopenharmony_ci		     %xmm15, %rax);
99362306a36Sopenharmony_ci
99462306a36Sopenharmony_ci	FRAME_END
99562306a36Sopenharmony_ci	RET;
99662306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx_encrypt_16way)
99762306a36Sopenharmony_ci
99862306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
99962306a36Sopenharmony_ci	/* input:
100062306a36Sopenharmony_ci	*      %rdi: ctx, CTX
100162306a36Sopenharmony_ci	*      %rsi: dst
100262306a36Sopenharmony_ci	*      %rdx: src
	*
	* Decrypts 16 blocks.  Identical to the encrypt entry point except
	* that the decryption key schedule is passed to the shared core.
	* Clobbers %rax, %r8, %r9, flags and %xmm0..%xmm15.
100362306a36Sopenharmony_ci	*/
100462306a36Sopenharmony_ci
100562306a36Sopenharmony_ci	FRAME_BEGIN
100662306a36Sopenharmony_ci
100762306a36Sopenharmony_ci	leaq ARIA_CTX_dec_key(CTX), %r9; /* %r9 = rk: decryption round keys */
100862306a36Sopenharmony_ci
100962306a36Sopenharmony_ci	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
101062306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
101162306a36Sopenharmony_ci		     %xmm15, %rdx);
101262306a36Sopenharmony_ci
101362306a36Sopenharmony_ci	call __aria_aesni_avx_crypt_16way;
101462306a36Sopenharmony_ci
	/* %rax was set to dst by __aria_aesni_avx_crypt_16way. */
101562306a36Sopenharmony_ci	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
101662306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
101762306a36Sopenharmony_ci		     %xmm15, %rax);
101862306a36Sopenharmony_ci
101962306a36Sopenharmony_ci	FRAME_END
102062306a36Sopenharmony_ci	RET;
102162306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx_decrypt_16way)
102262306a36Sopenharmony_ci
102362306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
102462306a36Sopenharmony_ci	/* input:
102562306a36Sopenharmony_ci	*      %rdi: ctx
102662306a36Sopenharmony_ci	*      %rsi: dst
102762306a36Sopenharmony_ci	*      %rdx: src
102862306a36Sopenharmony_ci	*      %rcx: keystream
102962306a36Sopenharmony_ci	*      %r8: iv (big endian, 128bit)
	*
	* Builds 16 consecutive counter blocks from the IV.  Only %rcx and
	* %r8 are dereferenced here; %rdi, %rsi and %rdx are left untouched.
	*
	* output:
	*      %xmm0..%xmm7:  counter blocks IV+0 .. IV+7 (big endian)
	*      %xmm8..%xmm15: counter blocks IV+8 .. IV+15 (big endian)
	*      (%r8):         IV+16, written back for the next call
	*      (%rcx):        first 128 bytes hold IV+0 .. IV+7; the
	*                     buffer is used to park them while the
	*                     second group is generated
103062306a36Sopenharmony_ci	*/
103162306a36Sopenharmony_ci
103262306a36Sopenharmony_ci	FRAME_BEGIN
103362306a36Sopenharmony_ci	/* load IV and byteswap */
103462306a36Sopenharmony_ci	vmovdqu (%r8), %xmm8; /* %xmm8 = IV as stored: counter block 0 */
103562306a36Sopenharmony_ci
103662306a36Sopenharmony_ci	vmovdqa .Lbswap128_mask (%rip), %xmm1;
103762306a36Sopenharmony_ci	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
103862306a36Sopenharmony_ci
103962306a36Sopenharmony_ci	vpcmpeqd %xmm0, %xmm0, %xmm0;
104062306a36Sopenharmony_ci	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
104162306a36Sopenharmony_ci
104262306a36Sopenharmony_ci	/* construct IVs */
	/*
	 * %xmm3 is the running little-endian counter; inc_le128 adds one
	 * with carry from the low into the high quadword (%xmm5 is its
	 * temporary), and each vpshufb converts a copy back to big endian.
	 */
104362306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
104462306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm9;
104562306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
104662306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm10;
104762306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
104862306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm11;
104962306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
105062306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm12;
105162306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
105262306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm13;
105362306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
105462306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm14;
105562306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
105662306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm15;
	/* Park IV+0 .. IV+7 in the keystream buffer to free %xmm8..%xmm15. */
105762306a36Sopenharmony_ci	vmovdqu %xmm8, (0 * 16)(%rcx);
105862306a36Sopenharmony_ci	vmovdqu %xmm9, (1 * 16)(%rcx);
105962306a36Sopenharmony_ci	vmovdqu %xmm10, (2 * 16)(%rcx);
106062306a36Sopenharmony_ci	vmovdqu %xmm11, (3 * 16)(%rcx);
106162306a36Sopenharmony_ci	vmovdqu %xmm12, (4 * 16)(%rcx);
106262306a36Sopenharmony_ci	vmovdqu %xmm13, (5 * 16)(%rcx);
106362306a36Sopenharmony_ci	vmovdqu %xmm14, (6 * 16)(%rcx);
106462306a36Sopenharmony_ci	vmovdqu %xmm15, (7 * 16)(%rcx);
106562306a36Sopenharmony_ci
	/* IV+8 .. IV+15 stay in %xmm8..%xmm15. */
106662306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
106762306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm8;
106862306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
106962306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm9;
107062306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
107162306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm10;
107262306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
107362306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm11;
107462306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
107562306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm12;
107662306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
107762306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm13;
107862306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
107962306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm14;
108062306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
108162306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm15;
	/* IV+16 becomes the IV for the next batch of 16 blocks. */
108262306a36Sopenharmony_ci	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
108362306a36Sopenharmony_ci	vpshufb %xmm1, %xmm3, %xmm4;
108462306a36Sopenharmony_ci	vmovdqu %xmm4, (%r8);
108562306a36Sopenharmony_ci
	/* Reload IV+0 .. IV+7 into %xmm0..%xmm7 now that the temporaries are free. */
108662306a36Sopenharmony_ci	vmovdqu (0 * 16)(%rcx), %xmm0;
108762306a36Sopenharmony_ci	vmovdqu (1 * 16)(%rcx), %xmm1;
108862306a36Sopenharmony_ci	vmovdqu (2 * 16)(%rcx), %xmm2;
108962306a36Sopenharmony_ci	vmovdqu (3 * 16)(%rcx), %xmm3;
109062306a36Sopenharmony_ci	vmovdqu (4 * 16)(%rcx), %xmm4;
109162306a36Sopenharmony_ci	vmovdqu (5 * 16)(%rcx), %xmm5;
109262306a36Sopenharmony_ci	vmovdqu (6 * 16)(%rcx), %xmm6;
109362306a36Sopenharmony_ci	vmovdqu (7 * 16)(%rcx), %xmm7;
109462306a36Sopenharmony_ci
109562306a36Sopenharmony_ci	FRAME_END
109662306a36Sopenharmony_ci	RET;
109762306a36Sopenharmony_ciSYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
109862306a36Sopenharmony_ci
109962306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
110062306a36Sopenharmony_ci	/* input:
110162306a36Sopenharmony_ci	*      %rdi: ctx
110262306a36Sopenharmony_ci	*      %rsi: dst
110362306a36Sopenharmony_ci	*      %rdx: src
110462306a36Sopenharmony_ci	*      %rcx: keystream
110562306a36Sopenharmony_ci	*      %r8: iv (big endian, 128bit)
	*
	* CTR mode for 16 blocks: generates 16 counter blocks, encrypts
	* them with the encryption key schedule, XORs the result with src
	* and writes it to dst.  The IV at (%r8) is advanced by the
	* keystream generator, and the keystream buffer is overwritten.
	* Clobbers %rax, %rsi, %rdx, %r8..%r11, flags, %xmm0..%xmm15.
110662306a36Sopenharmony_ci	*/
110762306a36Sopenharmony_ci	FRAME_BEGIN
110862306a36Sopenharmony_ci
110962306a36Sopenharmony_ci	call __aria_aesni_avx_ctr_gen_keystream_16way;
111062306a36Sopenharmony_ci
	/*
	 * Keep the real dst/src in %r10/%r11 and point both %rsi and %rdx
	 * at the keystream buffer, so the core cipher uses that buffer
	 * (not dst) as its working memory.
	 */
111162306a36Sopenharmony_ci	leaq (%rsi), %r10;
111262306a36Sopenharmony_ci	leaq (%rdx), %r11;
111362306a36Sopenharmony_ci	leaq (%rcx), %rsi;
111462306a36Sopenharmony_ci	leaq (%rcx), %rdx;
111562306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9; /* CTR always uses the encryption keys */
111662306a36Sopenharmony_ci
111762306a36Sopenharmony_ci	call __aria_aesni_avx_crypt_16way;
111862306a36Sopenharmony_ci
	/*
	 * XOR the encrypted counters with src.  The register order
	 * (%xmm1, %xmm0, %xmm3, %xmm2, ...) matches the block order used
	 * by write_output below.
	 */
111962306a36Sopenharmony_ci	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
112062306a36Sopenharmony_ci	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
112162306a36Sopenharmony_ci	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
112262306a36Sopenharmony_ci	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
112362306a36Sopenharmony_ci	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
112462306a36Sopenharmony_ci	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
112562306a36Sopenharmony_ci	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
112662306a36Sopenharmony_ci	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
112762306a36Sopenharmony_ci	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
112862306a36Sopenharmony_ci	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
112962306a36Sopenharmony_ci	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
113062306a36Sopenharmony_ci	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
113162306a36Sopenharmony_ci	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
113262306a36Sopenharmony_ci	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
113362306a36Sopenharmony_ci	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
113462306a36Sopenharmony_ci	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
113562306a36Sopenharmony_ci	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
113662306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
113762306a36Sopenharmony_ci		     %xmm15, %r10);
113862306a36Sopenharmony_ci
113962306a36Sopenharmony_ci	FRAME_END
114062306a36Sopenharmony_ci	RET;
114162306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
114262306a36Sopenharmony_ci
114362306a36Sopenharmony_ci#ifdef CONFIG_AS_GFNI
/*
 * Shared GFNI round pipeline: called by the GFNI encrypt, decrypt and CTR
 * entry points in this file.  It applies the aria_fo_gfni / aria_fe_gfni /
 * aria_ff_gfni round macros to the data held in %xmm0..%xmm15, taking the
 * round keys from %r9.
 *
 * In addition to the registers listed under "input" below, the body reads
 * ARIA_CTX_rounds(CTX), so CTX (%rdi) must still point at the context.
 * %rax is overwritten with a copy of %rsi and %r8 with %rax + 8 * 16.
 */
114462306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
114562306a36Sopenharmony_ci	/* input:
114662306a36Sopenharmony_ci	*      %r9: rk
114762306a36Sopenharmony_ci	*      %rsi: dst
114862306a36Sopenharmony_ci	*      %rdx: src
114962306a36Sopenharmony_ci	*      %xmm0..%xmm15: 16 byte-sliced blocks
115062306a36Sopenharmony_ci	*/
115162306a36Sopenharmony_ci
115262306a36Sopenharmony_ci	FRAME_BEGIN
115362306a36Sopenharmony_ci
	/*
	 * %rax = dst, %r8 = dst + 8 * 16.  Both are handed to inpack16_post,
	 * the round macros (%rax only) and debyteslice_16x16b as memory
	 * operands.
	 * NOTE(review): presumably used as scratch space while the registers
	 * are shuffled - confirm against the macro definitions.
	 */
115462306a36Sopenharmony_ci	movq %rsi, %rax;
115562306a36Sopenharmony_ci	leaq 8 * 16(%rax), %r8;
115662306a36Sopenharmony_ci
115762306a36Sopenharmony_ci	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
115862306a36Sopenharmony_ci		      %xmm4, %xmm5, %xmm6, %xmm7,
115962306a36Sopenharmony_ci		      %xmm8, %xmm9, %xmm10, %xmm11,
116062306a36Sopenharmony_ci		      %xmm12, %xmm13, %xmm14,
116162306a36Sopenharmony_ci		      %xmm15, %rax, %r8);
	/*
	 * Round-key indices 0..10 are processed unconditionally, alternating
	 * aria_fo_gfni (even index) and aria_fe_gfni (odd index).  The register
	 * order differs between the two macro families and between the first
	 * aria_fo_gfni call and the later ones; it must be kept exactly as is.
	 */
116262306a36Sopenharmony_ci	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
116362306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
116462306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
116562306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
116662306a36Sopenharmony_ci		     %rax, %r9, 0);
116762306a36Sopenharmony_ci	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
116862306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
116962306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
117062306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
117162306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 1);
117262306a36Sopenharmony_ci	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
117362306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
117462306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
117562306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
117662306a36Sopenharmony_ci		     %rax, %r9, 2);
117762306a36Sopenharmony_ci	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
117862306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
117962306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
118062306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
118162306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 3);
118262306a36Sopenharmony_ci	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
118362306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
118462306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
118562306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
118662306a36Sopenharmony_ci		     %rax, %r9, 4);
118762306a36Sopenharmony_ci	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
118862306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
118962306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
119062306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
119162306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 5);
119262306a36Sopenharmony_ci	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
119362306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
119462306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
119562306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
119662306a36Sopenharmony_ci		     %rax, %r9, 6);
119762306a36Sopenharmony_ci	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
119862306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
119962306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
120062306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
120162306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 7);
120262306a36Sopenharmony_ci	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
120362306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
120462306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
120562306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
120662306a36Sopenharmony_ci		     %rax, %r9, 8);
120762306a36Sopenharmony_ci	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
120862306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
120962306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
121062306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
121162306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 9);
121262306a36Sopenharmony_ci	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
121362306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
121462306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
121562306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
121662306a36Sopenharmony_ci		     %rax, %r9, 10);
	/*
	 * Dispatch on the round count stored in the context.
	 * rounds == 12: finish with aria_ff_gfni using key indices 11 and 12.
	 */
121762306a36Sopenharmony_ci	cmpl $12, ARIA_CTX_rounds(CTX);
121862306a36Sopenharmony_ci	jne .Laria_gfni_192;
121962306a36Sopenharmony_ci	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
122062306a36Sopenharmony_ci		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
122162306a36Sopenharmony_ci		%xmm15, %rax, %r9, 11, 12);
122262306a36Sopenharmony_ci	jmp .Laria_gfni_end;
122362306a36Sopenharmony_ci.Laria_gfni_192:
	/*
	 * rounds != 12: run key indices 11 and 12 as ordinary rounds, then
	 * finish with aria_ff_gfni(13, 14) if rounds == 14.
	 * NOTE(review): the label name suggests this is the 192-bit key
	 * path - confirm against the key-setup code.
	 */
122462306a36Sopenharmony_ci	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
122562306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
122662306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
122762306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
122862306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 11);
122962306a36Sopenharmony_ci	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
123062306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
123162306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
123262306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
123362306a36Sopenharmony_ci		     %rax, %r9, 12);
123462306a36Sopenharmony_ci	cmpl $14, ARIA_CTX_rounds(CTX);
123562306a36Sopenharmony_ci	jne .Laria_gfni_256;
123662306a36Sopenharmony_ci	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
123762306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
123862306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
123962306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
124062306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 13, 14);
124162306a36Sopenharmony_ci	jmp .Laria_gfni_end;
124262306a36Sopenharmony_ci.Laria_gfni_256:
	/*
	 * rounds is neither 12 nor 14: run key indices 13 and 14 as ordinary
	 * rounds and finish with aria_ff_gfni(15, 16).  There is no explicit
	 * check that rounds == 16; every other value ends up here.
	 * NOTE(review): the label name suggests the 256-bit key path - confirm.
	 */
124362306a36Sopenharmony_ci	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
124462306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
124562306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
124662306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
124762306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 13);
124862306a36Sopenharmony_ci	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
124962306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14, %xmm15,
125062306a36Sopenharmony_ci		     %xmm0, %xmm1, %xmm2, %xmm3,
125162306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
125262306a36Sopenharmony_ci		     %rax, %r9, 14);
125362306a36Sopenharmony_ci	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
125462306a36Sopenharmony_ci		     %xmm4, %xmm5, %xmm6, %xmm7,
125562306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11,
125662306a36Sopenharmony_ci		     %xmm12, %xmm13, %xmm14,
125762306a36Sopenharmony_ci		     %xmm15, %rax, %r9, 15, 16);
125862306a36Sopenharmony_ci.Laria_gfni_end:
	/*
	 * Common exit for all three round counts.
	 * NOTE(review): going by its name, debyteslice_16x16b converts the
	 * state back from the byte-sliced layout; the callers in this file
	 * then consume the registers in the order xmm1, xmm0, xmm3, xmm2,
	 * xmm4..xmm15 - confirm against the macro definition.
	 */
125962306a36Sopenharmony_ci	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
126062306a36Sopenharmony_ci			   %xmm9, %xmm13, %xmm0, %xmm5,
126162306a36Sopenharmony_ci			   %xmm10, %xmm14, %xmm3, %xmm6,
126262306a36Sopenharmony_ci			   %xmm11, %xmm15, %xmm2, %xmm7,
126362306a36Sopenharmony_ci			   (%rax), (%r8));
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_ci	FRAME_END
126662306a36Sopenharmony_ci	RET;
126762306a36Sopenharmony_ciSYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
126862306a36Sopenharmony_ci
/*
 * GFNI encryption entry point: loads the input at %rdx into %xmm0..%xmm15
 * with inpack16_pre, runs __aria_aesni_avx_gfni_crypt_16way with the
 * encryption key schedule, and stores the result with write_output.
 */
126962306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
127062306a36Sopenharmony_ci	/* input:
127162306a36Sopenharmony_ci	*      %rdi: ctx, CTX
127262306a36Sopenharmony_ci	*      %rsi: dst
127362306a36Sopenharmony_ci	*      %rdx: src
127462306a36Sopenharmony_ci	*/
127562306a36Sopenharmony_ci
127662306a36Sopenharmony_ci	FRAME_BEGIN
127762306a36Sopenharmony_ci
	/* %r9 = CTX + ARIA_CTX_enc_key: round keys consumed by the helper. */
127862306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
127962306a36Sopenharmony_ci
128062306a36Sopenharmony_ci	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
128162306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
128262306a36Sopenharmony_ci		     %xmm15, %rdx);
128362306a36Sopenharmony_ci
	/* The helper takes dst in %rsi and still reads CTX (%rdi). */
128462306a36Sopenharmony_ci	call __aria_aesni_avx_gfni_crypt_16way;
128562306a36Sopenharmony_ci
	/*
	 * The helper copies %rsi (dst) into %rax, which is used as the output
	 * pointer here.  Note the swapped pairs xmm1/xmm0 and xmm3/xmm2 in the
	 * register order; this matches the order in which the helper's round
	 * macros name the registers.
	 */
128662306a36Sopenharmony_ci	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
128762306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
128862306a36Sopenharmony_ci		     %xmm15, %rax);
128962306a36Sopenharmony_ci
129062306a36Sopenharmony_ci	FRAME_END
129162306a36Sopenharmony_ci	RET;
129262306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
129362306a36Sopenharmony_ci
/*
 * GFNI decryption entry point.  Identical to the GFNI encryption entry
 * point except that the round keys are taken from ARIA_CTX_dec_key instead
 * of ARIA_CTX_enc_key; the same round helper is used for both directions.
 */
129462306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
129562306a36Sopenharmony_ci	/* input:
129662306a36Sopenharmony_ci	*      %rdi: ctx, CTX
129762306a36Sopenharmony_ci	*      %rsi: dst
129862306a36Sopenharmony_ci	*      %rdx: src
129962306a36Sopenharmony_ci	*/
130062306a36Sopenharmony_ci
130162306a36Sopenharmony_ci	FRAME_BEGIN
130262306a36Sopenharmony_ci
	/* %r9 = CTX + ARIA_CTX_dec_key: decryption round keys for the helper. */
130362306a36Sopenharmony_ci	leaq ARIA_CTX_dec_key(CTX), %r9;
130462306a36Sopenharmony_ci
130562306a36Sopenharmony_ci	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
130662306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
130762306a36Sopenharmony_ci		     %xmm15, %rdx);
130862306a36Sopenharmony_ci
	/* The helper takes dst in %rsi and still reads CTX (%rdi). */
130962306a36Sopenharmony_ci	call __aria_aesni_avx_gfni_crypt_16way;
131062306a36Sopenharmony_ci
	/*
	 * %rax was set to dst (%rsi) inside the helper.  The register order
	 * (xmm1, xmm0, xmm3, xmm2, xmm4..xmm15) must match the order used by
	 * the helper's final round macro.
	 */
131162306a36Sopenharmony_ci	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
131262306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
131362306a36Sopenharmony_ci		     %xmm15, %rax);
131462306a36Sopenharmony_ci
131562306a36Sopenharmony_ci	FRAME_END
131662306a36Sopenharmony_ci	RET;
131762306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
131862306a36Sopenharmony_ci
/*
 * GFNI CTR-mode entry point: runs the round helper over the keystream
 * buffer with the encryption key schedule, XORs the result with the data
 * at src, and writes it to dst.
 */
131962306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
132062306a36Sopenharmony_ci	/* input:
132162306a36Sopenharmony_ci	*      %rdi: ctx
132262306a36Sopenharmony_ci	*      %rsi: dst
132362306a36Sopenharmony_ci	*      %rdx: src
132462306a36Sopenharmony_ci	*      %rcx: keystream
132562306a36Sopenharmony_ci	*      %r8: iv (big endian, 128bit)
132662306a36Sopenharmony_ci	*/
132762306a36Sopenharmony_ci	FRAME_BEGIN
132862306a36Sopenharmony_ci
	/*
	 * NOTE(review): the keystream helper is defined outside this section.
	 * Presumably it derives 16 counter blocks from the iv at %r8, leaves
	 * them in %xmm0..%xmm15 and advances the iv - confirm.  The code below
	 * relies on %rsi, %rdx, %rcx and %rdi being intact after the call.
	 */
132962306a36Sopenharmony_ci	call __aria_aesni_avx_ctr_gen_keystream_16way
133062306a36Sopenharmony_ci
	/*
	 * Save the caller's dst/src in %r10/%r11, then point both the helper's
	 * dst (%rsi) and src (%rdx) at the keystream buffer, so the real dst
	 * is not passed to the round helper.  CTR mode uses the encryption
	 * key schedule.
	 */
133162306a36Sopenharmony_ci	leaq (%rsi), %r10;
133262306a36Sopenharmony_ci	leaq (%rdx), %r11;
133362306a36Sopenharmony_ci	leaq (%rcx), %rsi;
133462306a36Sopenharmony_ci	leaq (%rcx), %rdx;
133562306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
133662306a36Sopenharmony_ci
133762306a36Sopenharmony_ci	call __aria_aesni_avx_gfni_crypt_16way;
133862306a36Sopenharmony_ci
	/*
	 * XOR the 16 helper results with the 16 consecutive 16-byte chunks at
	 * src (%r11).  The first four registers are paired as xmm1, xmm0, xmm3,
	 * xmm2 with offsets 0..3, matching the register order the helper
	 * leaves its results in.
	 */
133962306a36Sopenharmony_ci	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
134062306a36Sopenharmony_ci	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
134162306a36Sopenharmony_ci	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
134262306a36Sopenharmony_ci	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
134362306a36Sopenharmony_ci	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
134462306a36Sopenharmony_ci	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
134562306a36Sopenharmony_ci	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
134662306a36Sopenharmony_ci	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
134762306a36Sopenharmony_ci	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
134862306a36Sopenharmony_ci	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
134962306a36Sopenharmony_ci	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
135062306a36Sopenharmony_ci	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
135162306a36Sopenharmony_ci	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
135262306a36Sopenharmony_ci	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
135362306a36Sopenharmony_ci	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
135462306a36Sopenharmony_ci	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	/* Store to the caller's original dst, saved in %r10 above. */
135562306a36Sopenharmony_ci	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
135662306a36Sopenharmony_ci		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
135762306a36Sopenharmony_ci		     %xmm15, %r10);
135862306a36Sopenharmony_ci
135962306a36Sopenharmony_ci	FRAME_END
136062306a36Sopenharmony_ci	RET;
136162306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
136262306a36Sopenharmony_ci#endif /* CONFIG_AS_GFNI */
1363