162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * ARIA Cipher 32-way parallel algorithm (AVX2)
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci */
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#include <linux/linkage.h>
1062306a36Sopenharmony_ci#include <asm/frame.h>
1162306a36Sopenharmony_ci#include <asm/asm-offsets.h>
1262306a36Sopenharmony_ci#include <linux/cfi_types.h>
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci/* register macros */
/*
 * CTX is an alias for %rdi.
 * NOTE(review): presumably the cipher context pointer passed as the first
 * argument - confirm against the functions that use it.
 */
1562306a36Sopenharmony_ci#define CTX %rdi
1662306a36Sopenharmony_ci
/*
 * ymmN_x -> xmmN name mapping.
 *
 * aria_sbox_8way() pastes "_x" onto its register arguments (x0##_x,
 * t6##_x, ...) to obtain the xmm name for the vaes*last, vextracti128 and
 * vinserti128 operands; these defines turn the pasted token back into a
 * real register name.
 */
1762306a36Sopenharmony_ci#define ymm0_x xmm0
1862306a36Sopenharmony_ci#define ymm1_x xmm1
1962306a36Sopenharmony_ci#define ymm2_x xmm2
2062306a36Sopenharmony_ci#define ymm3_x xmm3
2162306a36Sopenharmony_ci#define ymm4_x xmm4
2262306a36Sopenharmony_ci#define ymm5_x xmm5
2362306a36Sopenharmony_ci#define ymm6_x xmm6
2462306a36Sopenharmony_ci#define ymm7_x xmm7
2562306a36Sopenharmony_ci#define ymm8_x xmm8
2662306a36Sopenharmony_ci#define ymm9_x xmm9
2762306a36Sopenharmony_ci#define ymm10_x xmm10
2862306a36Sopenharmony_ci#define ymm11_x xmm11
2962306a36Sopenharmony_ci#define ymm12_x xmm12
3062306a36Sopenharmony_ci#define ymm13_x xmm13
3162306a36Sopenharmony_ci#define ymm14_x xmm14
3262306a36Sopenharmony_ci#define ymm15_x xmm15
3362306a36Sopenharmony_ci
/*
 * BV8 - pack eight single-bit values into one byte-sized constant.
 *
 * a0 lands in bit 0 (least significant) and a7 in bit 7.  Each argument is
 * masked with "& 1", so only its lowest bit contributes.
 */
3462306a36Sopenharmony_ci#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
3562306a36Sopenharmony_ci	( (((a0) & 1) << 0) |				\
3662306a36Sopenharmony_ci	  (((a1) & 1) << 1) |				\
3762306a36Sopenharmony_ci	  (((a2) & 1) << 2) |				\
3862306a36Sopenharmony_ci	  (((a3) & 1) << 3) |				\
3962306a36Sopenharmony_ci	  (((a4) & 1) << 4) |				\
4062306a36Sopenharmony_ci	  (((a5) & 1) << 5) |				\
4162306a36Sopenharmony_ci	  (((a6) & 1) << 6) |				\
4262306a36Sopenharmony_ci	  (((a7) & 1) << 7) )
4362306a36Sopenharmony_ci
/*
 * BM8X8 - pack eight byte-sized rows into one 64-bit constant.
 *
 * l0 is placed in the most significant byte (shifted left by 56) and l7 in
 * the least significant byte.  The arguments are not masked, so each one
 * must already fit in 8 bits or it will spill into the neighbouring rows.
 * NOTE(review): presumably used to build the 8x8 bit matrices loaded from
 * the .Ltf_*_bitmatrix labels - the definitions are outside this view.
 */
4462306a36Sopenharmony_ci#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
4562306a36Sopenharmony_ci	( ((l7) << (0 * 8)) |				\
4662306a36Sopenharmony_ci	  ((l6) << (1 * 8)) |				\
4762306a36Sopenharmony_ci	  ((l5) << (2 * 8)) |				\
4862306a36Sopenharmony_ci	  ((l4) << (3 * 8)) |				\
4962306a36Sopenharmony_ci	  ((l3) << (4 * 8)) |				\
5062306a36Sopenharmony_ci	  ((l2) << (5 * 8)) |				\
5162306a36Sopenharmony_ci	  ((l1) << (6 * 8)) |				\
5262306a36Sopenharmony_ci	  ((l0) << (7 * 8)) )
5362306a36Sopenharmony_ci
/*
 * inc_le128 - add one to a little-endian 128-bit integer held in x.
 *
 * x:         value to increment, updated in place.
 * minus_one: subtrahend vector.
 *            NOTE(review): assumes the low quadword of each 128-bit lane is
 *            -1 and the high quadword is 0 - confirm at the call sites.
 * tmp:       scratch register, clobbered.
 *
 * Under that assumption the low quadword gets +1 (x - (-1)), and the
 * compare result for the low quadword is moved into the high quadword
 * position to propagate the carry.
 */
5462306a36Sopenharmony_ci#define inc_le128(x, minus_one, tmp)			\
	/* tmp = all-ones in every quadword that equals minus_one */ \
5562306a36Sopenharmony_ci	vpcmpeqq minus_one, x, tmp;			\
5662306a36Sopenharmony_ci	vpsubq minus_one, x, x;				\
	/* move the low-quadword compare result up by 8 bytes */ \
5762306a36Sopenharmony_ci	vpslldq $8, tmp, tmp;				\
	/* subtracting all-ones adds the carry to the high quadword */ \
5862306a36Sopenharmony_ci	vpsubq tmp, x, x;
5962306a36Sopenharmony_ci
/*
 * filter_8bit - per-byte table transform built from two 4-bit lookups.
 *
 * x:        input bytes, replaced by the result.
 * lo_t:     16-entry table indexed by the bits selected by mask4bit.
 * hi_t:     16-entry table indexed by the remaining bits, shifted right by 4.
 * mask4bit: nibble mask.
 *           NOTE(review): assumed to be 0x0f in every byte (aria_sbox_8way
 *           loads it from .L0f0f0f0f) - the constant is outside this view.
 * tmp0:     scratch register, clobbered.
 *
 * With that mask the result for each byte b is lo_t[b & 0xf] ^ hi_t[b >> 4].
 */
6062306a36Sopenharmony_ci#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	/* tmp0 = masked (low) bits, x = remaining (high) bits */ \
6162306a36Sopenharmony_ci	vpand x, mask4bit, tmp0;			\
6262306a36Sopenharmony_ci	vpandn x, mask4bit, x;				\
	/* dword shift is safe: the low bits of every byte are already 0 */ \
6362306a36Sopenharmony_ci	vpsrld $4, x, x;				\
6462306a36Sopenharmony_ci							\
	/* vpshufb acts as a 16-entry byte lookup into each table */ \
6562306a36Sopenharmony_ci	vpshufb tmp0, lo_t, tmp0;			\
6662306a36Sopenharmony_ci	vpshufb x, hi_t, x;				\
6762306a36Sopenharmony_ci	vpxor tmp0, x, x;
6862306a36Sopenharmony_ci
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements.
 *
 * x0..x3: the four rows; on exit they hold the four columns.
 * t1, t2: scratch registers, clobbered.
 *
 * The unpack instructions work within 128-bit lanes, so when 256-bit
 * registers are passed each 128-bit half is transposed independently.
 */
6962306a36Sopenharmony_ci#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	/* interleave 32-bit elements of rows 0/1 and rows 2/3 */ \
7062306a36Sopenharmony_ci	vpunpckhdq x1, x0, t2;				\
7162306a36Sopenharmony_ci	vpunpckldq x1, x0, x0;				\
7262306a36Sopenharmony_ci							\
7362306a36Sopenharmony_ci	vpunpckldq x3, x2, t1;				\
7462306a36Sopenharmony_ci	vpunpckhdq x3, x2, x2;				\
7562306a36Sopenharmony_ci							\
	/* combine the 64-bit halves into the final columns */ \
7662306a36Sopenharmony_ci	vpunpckhqdq t1, x0, x1;				\
7762306a36Sopenharmony_ci	vpunpcklqdq t1, x0, x0;				\
7862306a36Sopenharmony_ci							\
7962306a36Sopenharmony_ci	vpunpckhqdq x2, t2, x3;				\
8062306a36Sopenharmony_ci	vpunpcklqdq x2, t2, x2;
8162306a36Sopenharmony_ci
/*
 * byteslice_16x16b - rearrange 16 vector registers using two rounds of
 * 4x4 32-bit transposes with a per-register byte shuffle in between.
 *
 * a0..d3:   the 16 vector registers, transformed in place.
 * st0, st1: two scratch operands used as spill slots (written and read
 *           with vmovdqu); whatever they held before is destroyed.
 *
 * All 16 registers carry data, so each pair of transpose_4x4() calls
 * borrows two of them as temporaries after saving them to st0/st1.
 * The shuffle mask is loaded from .Lshufb_16x16b and broadcast to both
 * 128-bit halves.
 * NOTE(review): per the macro name this produces a byte-sliced layout of
 * 16-byte blocks; the mask contents are outside this view, so the exact
 * resulting layout should be confirmed there.
 */
8262306a36Sopenharmony_ci#define byteslice_16x16b(a0, b0, c0, d0,		\
8362306a36Sopenharmony_ci			 a1, b1, c1, d1,		\
8462306a36Sopenharmony_ci			 a2, b2, c2, d2,		\
8562306a36Sopenharmony_ci			 a3, b3, c3, d3,		\
8662306a36Sopenharmony_ci			 st0, st1)			\
	/* round 1: transpose (a0..a3) and (b0..b3), d2/d3 as temporaries */ \
8762306a36Sopenharmony_ci	vmovdqu d2, st0;				\
8862306a36Sopenharmony_ci	vmovdqu d3, st1;				\
8962306a36Sopenharmony_ci	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
9062306a36Sopenharmony_ci	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
9162306a36Sopenharmony_ci	vmovdqu st0, d2;				\
9262306a36Sopenharmony_ci	vmovdqu st1, d3;				\
9362306a36Sopenharmony_ci							\
	/* round 1: transpose (c0..c3) and (d0..d3), a0/a1 as temporaries */ \
9462306a36Sopenharmony_ci	vmovdqu a0, st0;				\
9562306a36Sopenharmony_ci	vmovdqu a1, st1;				\
9662306a36Sopenharmony_ci	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
9762306a36Sopenharmony_ci	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
9862306a36Sopenharmony_ci							\
	/* a0 now holds the shuffle mask; the real a0 stays in st0 */ \
9962306a36Sopenharmony_ci	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
10062306a36Sopenharmony_ci	vmovdqu st1, a1;				\
10162306a36Sopenharmony_ci	vpshufb a0, a2, a2;				\
10262306a36Sopenharmony_ci	vpshufb a0, a3, a3;				\
10362306a36Sopenharmony_ci	vpshufb a0, b0, b0;				\
10462306a36Sopenharmony_ci	vpshufb a0, b1, b1;				\
10562306a36Sopenharmony_ci	vpshufb a0, b2, b2;				\
10662306a36Sopenharmony_ci	vpshufb a0, b3, b3;				\
10762306a36Sopenharmony_ci	vpshufb a0, a1, a1;				\
10862306a36Sopenharmony_ci	vpshufb a0, c0, c0;				\
10962306a36Sopenharmony_ci	vpshufb a0, c1, c1;				\
11062306a36Sopenharmony_ci	vpshufb a0, c2, c2;				\
11162306a36Sopenharmony_ci	vpshufb a0, c3, c3;				\
11262306a36Sopenharmony_ci	vpshufb a0, d0, d0;				\
11362306a36Sopenharmony_ci	vpshufb a0, d1, d1;				\
11462306a36Sopenharmony_ci	vpshufb a0, d2, d2;				\
11562306a36Sopenharmony_ci	vpshufb a0, d3, d3;				\
	/* spill d3, reload the saved a0 through d3 and shuffle it last */ \
11662306a36Sopenharmony_ci	vmovdqu d3, st1;				\
11762306a36Sopenharmony_ci	vmovdqu st0, d3;				\
11862306a36Sopenharmony_ci	vpshufb a0, d3, a0;				\
11962306a36Sopenharmony_ci	vmovdqu d2, st0;				\
12062306a36Sopenharmony_ci							\
	/* round 2: transpose across the a/b/c/d groups, d2/d3 as temporaries */ \
12162306a36Sopenharmony_ci	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
12262306a36Sopenharmony_ci	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
12362306a36Sopenharmony_ci	vmovdqu st0, d2;				\
12462306a36Sopenharmony_ci	vmovdqu st1, d3;				\
12562306a36Sopenharmony_ci							\
	/* round 2 for the *2 and *3 registers, b0/b1 as temporaries */ \
12662306a36Sopenharmony_ci	vmovdqu b0, st0;				\
12762306a36Sopenharmony_ci	vmovdqu b1, st1;				\
12862306a36Sopenharmony_ci	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
12962306a36Sopenharmony_ci	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
13062306a36Sopenharmony_ci	vmovdqu st0, b0;				\
13162306a36Sopenharmony_ci	vmovdqu st1, b1;				\
13262306a36Sopenharmony_ci	/* does not adjust output bytes inside vectors */
13362306a36Sopenharmony_ci
/*
 * debyteslice_16x16b - counterpart of byteslice_16x16b().
 *
 * a0..d3:   the 16 vector registers, transformed in place.
 * st0, st1: two scratch operands used as spill slots (written and read
 *           with vmovdqu); whatever they held before is destroyed.
 *
 * The instruction sequence is the same as byteslice_16x16b() except for
 * the second round of transposes, which takes its rows in the order
 * (c, d, a, b) instead of (a, b, c, d).
 * NOTE(review): per the macro name this converts byte-sliced data back to
 * whole blocks; the .Lshufb_16x16b mask is outside this view, so confirm
 * the resulting layout there.
 */
13462306a36Sopenharmony_ci#define debyteslice_16x16b(a0, b0, c0, d0,		\
13562306a36Sopenharmony_ci			   a1, b1, c1, d1,		\
13662306a36Sopenharmony_ci			   a2, b2, c2, d2,		\
13762306a36Sopenharmony_ci			   a3, b3, c3, d3,		\
13862306a36Sopenharmony_ci			   st0, st1)			\
	/* round 1: transpose (a0..a3) and (b0..b3), d2/d3 as temporaries */ \
13962306a36Sopenharmony_ci	vmovdqu d2, st0;				\
14062306a36Sopenharmony_ci	vmovdqu d3, st1;				\
14162306a36Sopenharmony_ci	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
14262306a36Sopenharmony_ci	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
14362306a36Sopenharmony_ci	vmovdqu st0, d2;				\
14462306a36Sopenharmony_ci	vmovdqu st1, d3;				\
14562306a36Sopenharmony_ci							\
	/* round 1: transpose (c0..c3) and (d0..d3), a0/a1 as temporaries */ \
14662306a36Sopenharmony_ci	vmovdqu a0, st0;				\
14762306a36Sopenharmony_ci	vmovdqu a1, st1;				\
14862306a36Sopenharmony_ci	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
14962306a36Sopenharmony_ci	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
15062306a36Sopenharmony_ci							\
	/* a0 now holds the shuffle mask; the real a0 stays in st0 */ \
15162306a36Sopenharmony_ci	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
15262306a36Sopenharmony_ci	vmovdqu st1, a1;				\
15362306a36Sopenharmony_ci	vpshufb a0, a2, a2;				\
15462306a36Sopenharmony_ci	vpshufb a0, a3, a3;				\
15562306a36Sopenharmony_ci	vpshufb a0, b0, b0;				\
15662306a36Sopenharmony_ci	vpshufb a0, b1, b1;				\
15762306a36Sopenharmony_ci	vpshufb a0, b2, b2;				\
15862306a36Sopenharmony_ci	vpshufb a0, b3, b3;				\
15962306a36Sopenharmony_ci	vpshufb a0, a1, a1;				\
16062306a36Sopenharmony_ci	vpshufb a0, c0, c0;				\
16162306a36Sopenharmony_ci	vpshufb a0, c1, c1;				\
16262306a36Sopenharmony_ci	vpshufb a0, c2, c2;				\
16362306a36Sopenharmony_ci	vpshufb a0, c3, c3;				\
16462306a36Sopenharmony_ci	vpshufb a0, d0, d0;				\
16562306a36Sopenharmony_ci	vpshufb a0, d1, d1;				\
16662306a36Sopenharmony_ci	vpshufb a0, d2, d2;				\
16762306a36Sopenharmony_ci	vpshufb a0, d3, d3;				\
	/* spill d3, reload the saved a0 through d3 and shuffle it last */ \
16862306a36Sopenharmony_ci	vmovdqu d3, st1;				\
16962306a36Sopenharmony_ci	vmovdqu st0, d3;				\
17062306a36Sopenharmony_ci	vpshufb a0, d3, a0;				\
17162306a36Sopenharmony_ci	vmovdqu d2, st0;				\
17262306a36Sopenharmony_ci							\
	/* round 2: rows taken in (c, d, a, b) order, d2/d3 as temporaries */ \
17362306a36Sopenharmony_ci	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
17462306a36Sopenharmony_ci	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
17562306a36Sopenharmony_ci	vmovdqu st0, d2;				\
17662306a36Sopenharmony_ci	vmovdqu st1, d3;				\
17762306a36Sopenharmony_ci							\
	/* round 2 for the *2 and *3 registers, b0/b1 as temporaries */ \
17862306a36Sopenharmony_ci	vmovdqu b0, st0;				\
17962306a36Sopenharmony_ci	vmovdqu b1, st1;				\
18062306a36Sopenharmony_ci	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
18162306a36Sopenharmony_ci	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
18262306a36Sopenharmony_ci	vmovdqu st0, b0;				\
18362306a36Sopenharmony_ci	vmovdqu st1, b1;				\
18462306a36Sopenharmony_ci	/* does not adjust output bytes inside vectors */
18562306a36Sopenharmony_ci
18662306a36Sopenharmony_ci/* load blocks from rio into registers; no key material is applied here */
/*
 * x0..x7, y0..y7: destination registers, filled from 16 consecutive
 *                 32-byte slots starting at rio (unaligned loads).
 * rio:            base address register of the input buffer; 16 * 32 = 512
 *                 bytes are read.
 */
18762306a36Sopenharmony_ci#define inpack16_pre(x0, x1, x2, x3,			\
18862306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
18962306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
19062306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
19162306a36Sopenharmony_ci		     rio)				\
19262306a36Sopenharmony_ci	vmovdqu (0 * 32)(rio), x0;			\
19362306a36Sopenharmony_ci	vmovdqu (1 * 32)(rio), x1;			\
19462306a36Sopenharmony_ci	vmovdqu (2 * 32)(rio), x2;			\
19562306a36Sopenharmony_ci	vmovdqu (3 * 32)(rio), x3;			\
19662306a36Sopenharmony_ci	vmovdqu (4 * 32)(rio), x4;			\
19762306a36Sopenharmony_ci	vmovdqu (5 * 32)(rio), x5;			\
19862306a36Sopenharmony_ci	vmovdqu (6 * 32)(rio), x6;			\
19962306a36Sopenharmony_ci	vmovdqu (7 * 32)(rio), x7;			\
20062306a36Sopenharmony_ci	vmovdqu (8 * 32)(rio), y0;			\
20162306a36Sopenharmony_ci	vmovdqu (9 * 32)(rio), y1;			\
20262306a36Sopenharmony_ci	vmovdqu (10 * 32)(rio), y2;			\
20362306a36Sopenharmony_ci	vmovdqu (11 * 32)(rio), y3;			\
20462306a36Sopenharmony_ci	vmovdqu (12 * 32)(rio), y4;			\
20562306a36Sopenharmony_ci	vmovdqu (13 * 32)(rio), y5;			\
20662306a36Sopenharmony_ci	vmovdqu (14 * 32)(rio), y6;			\
20762306a36Sopenharmony_ci	vmovdqu (15 * 32)(rio), y7;
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci/* byteslice the loaded blocks and store them to temporary memory */
/*
 * x0..x7, y0..y7: the 16 loaded registers; byteslice_16x16b() rearranges
 *                 them in place before they are stored.
 * mem_ab:         base register of the buffer receiving x0..x7
 *                 (8 * 32 bytes).
 * mem_cd:         base register of the buffer receiving y0..y7
 *                 (8 * 32 bytes).
 *
 * The first 32 bytes at (mem_ab) and (mem_cd) double as the spill slots of
 * byteslice_16x16b(); they are overwritten by the x0 and y0 stores
 * afterwards.  The registers keep their values after the stores.
 */
21062306a36Sopenharmony_ci#define inpack16_post(x0, x1, x2, x3,			\
21162306a36Sopenharmony_ci		      x4, x5, x6, x7,			\
21262306a36Sopenharmony_ci		      y0, y1, y2, y3,			\
21362306a36Sopenharmony_ci		      y4, y5, y6, y7,			\
21462306a36Sopenharmony_ci		      mem_ab, mem_cd)			\
21562306a36Sopenharmony_ci	byteslice_16x16b(x0, x1, x2, x3,		\
21662306a36Sopenharmony_ci			 x4, x5, x6, x7,		\
21762306a36Sopenharmony_ci			 y0, y1, y2, y3,		\
21862306a36Sopenharmony_ci			 y4, y5, y6, y7,		\
21962306a36Sopenharmony_ci			 (mem_ab), (mem_cd));		\
22062306a36Sopenharmony_ci							\
22162306a36Sopenharmony_ci	vmovdqu x0, 0 * 32(mem_ab);			\
22262306a36Sopenharmony_ci	vmovdqu x1, 1 * 32(mem_ab);			\
22362306a36Sopenharmony_ci	vmovdqu x2, 2 * 32(mem_ab);			\
22462306a36Sopenharmony_ci	vmovdqu x3, 3 * 32(mem_ab);			\
22562306a36Sopenharmony_ci	vmovdqu x4, 4 * 32(mem_ab);			\
22662306a36Sopenharmony_ci	vmovdqu x5, 5 * 32(mem_ab);			\
22762306a36Sopenharmony_ci	vmovdqu x6, 6 * 32(mem_ab);			\
22862306a36Sopenharmony_ci	vmovdqu x7, 7 * 32(mem_ab);			\
22962306a36Sopenharmony_ci	vmovdqu y0, 0 * 32(mem_cd);			\
23062306a36Sopenharmony_ci	vmovdqu y1, 1 * 32(mem_cd);			\
23162306a36Sopenharmony_ci	vmovdqu y2, 2 * 32(mem_cd);			\
23262306a36Sopenharmony_ci	vmovdqu y3, 3 * 32(mem_cd);			\
23362306a36Sopenharmony_ci	vmovdqu y4, 4 * 32(mem_cd);			\
23462306a36Sopenharmony_ci	vmovdqu y5, 5 * 32(mem_cd);			\
23562306a36Sopenharmony_ci	vmovdqu y6, 6 * 32(mem_cd);			\
23662306a36Sopenharmony_ci	vmovdqu y7, 7 * 32(mem_cd);
23762306a36Sopenharmony_ci
/*
 * write_output - store 16 registers to 16 consecutive 32-byte slots.
 *
 * x0..x7, y0..y7: source registers, stored in that order (unaligned
 *                 stores); they are not modified.
 * mem:            base address register of the output buffer; 16 * 32 = 512
 *                 bytes are written.
 *
 * The last store ends with a line continuation, so the macro definition
 * extends onto the line that follows it.
 */
23862306a36Sopenharmony_ci#define write_output(x0, x1, x2, x3,			\
23962306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
24062306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
24162306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
24262306a36Sopenharmony_ci		     mem)				\
24362306a36Sopenharmony_ci	vmovdqu x0, 0 * 32(mem);			\
24462306a36Sopenharmony_ci	vmovdqu x1, 1 * 32(mem);			\
24562306a36Sopenharmony_ci	vmovdqu x2, 2 * 32(mem);			\
24662306a36Sopenharmony_ci	vmovdqu x3, 3 * 32(mem);			\
24762306a36Sopenharmony_ci	vmovdqu x4, 4 * 32(mem);			\
24862306a36Sopenharmony_ci	vmovdqu x5, 5 * 32(mem);			\
24962306a36Sopenharmony_ci	vmovdqu x6, 6 * 32(mem);			\
25062306a36Sopenharmony_ci	vmovdqu x7, 7 * 32(mem);			\
25162306a36Sopenharmony_ci	vmovdqu y0, 8 * 32(mem);			\
25262306a36Sopenharmony_ci	vmovdqu y1, 9 * 32(mem);			\
25362306a36Sopenharmony_ci	vmovdqu y2, 10 * 32(mem);			\
25462306a36Sopenharmony_ci	vmovdqu y3, 11 * 32(mem);			\
25562306a36Sopenharmony_ci	vmovdqu y4, 12 * 32(mem);			\
25662306a36Sopenharmony_ci	vmovdqu y5, 13 * 32(mem);			\
25762306a36Sopenharmony_ci	vmovdqu y6, 14 * 32(mem);			\
25862306a36Sopenharmony_ci	vmovdqu y7, 15 * 32(mem);			\
25962306a36Sopenharmony_ci
/*
 * aria_store_state_8way - spill eight registers to the temporary buffer.
 *
 * x0..x7:  source registers (not modified).
 * mem_tmp: base address register of the temporary buffer.
 * idx:     index of the first 32-byte slot; x0..x7 go to slots
 *          idx .. idx + 7.
 */
26062306a36Sopenharmony_ci#define aria_store_state_8way(x0, x1, x2, x3,		\
26162306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
26262306a36Sopenharmony_ci			      mem_tmp, idx)		\
26362306a36Sopenharmony_ci	vmovdqu x0, ((idx + 0) * 32)(mem_tmp);		\
26462306a36Sopenharmony_ci	vmovdqu x1, ((idx + 1) * 32)(mem_tmp);		\
26562306a36Sopenharmony_ci	vmovdqu x2, ((idx + 2) * 32)(mem_tmp);		\
26662306a36Sopenharmony_ci	vmovdqu x3, ((idx + 3) * 32)(mem_tmp);		\
26762306a36Sopenharmony_ci	vmovdqu x4, ((idx + 4) * 32)(mem_tmp);		\
26862306a36Sopenharmony_ci	vmovdqu x5, ((idx + 5) * 32)(mem_tmp);		\
26962306a36Sopenharmony_ci	vmovdqu x6, ((idx + 6) * 32)(mem_tmp);		\
27062306a36Sopenharmony_ci	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
27162306a36Sopenharmony_ci
/*
 * aria_load_state_8way - reload eight registers from the temporary buffer.
 *
 * x0..x7:  destination registers.
 * mem_tmp: base address register of the temporary buffer.
 * idx:     index of the first 32-byte slot; x0..x7 are read from slots
 *          idx .. idx + 7.
 */
27262306a36Sopenharmony_ci#define aria_load_state_8way(x0, x1, x2, x3,		\
27362306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
27462306a36Sopenharmony_ci			     mem_tmp, idx)		\
27562306a36Sopenharmony_ci	vmovdqu ((idx + 0) * 32)(mem_tmp), x0;		\
27662306a36Sopenharmony_ci	vmovdqu ((idx + 1) * 32)(mem_tmp), x1;		\
27762306a36Sopenharmony_ci	vmovdqu ((idx + 2) * 32)(mem_tmp), x2;		\
27862306a36Sopenharmony_ci	vmovdqu ((idx + 3) * 32)(mem_tmp), x3;		\
27962306a36Sopenharmony_ci	vmovdqu ((idx + 4) * 32)(mem_tmp), x4;		\
28062306a36Sopenharmony_ci	vmovdqu ((idx + 5) * 32)(mem_tmp), x5;		\
28162306a36Sopenharmony_ci	vmovdqu ((idx + 6) * 32)(mem_tmp), x6;		\
28262306a36Sopenharmony_ci	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
28362306a36Sopenharmony_ci
/*
 * aria_ark_8way - XOR eight round-key bytes into eight registers.
 *
 * x0..x7: registers updated in place; each one is XORed with a single key
 *         byte broadcast to every byte position.
 * t0:     scratch register, clobbered.
 * rk:     base address register of the round-key array.
 * idx:    byte offset inside the 16-byte round key (callers in this file
 *         pass 0 or 8).
 * round:  round number; each round key occupies 16 bytes.  It is pasted
 *         into "round * 16" without parentheses, so pass a single token or
 *         a parenthesized expression.
 *
 * The key bytes are taken in reversed order within each group of four:
 * x0..x3 use offsets idx+3 .. idx+0 and x4..x7 use idx+7 .. idx+4.
 * NOTE(review): presumably because the round keys are stored as 32-bit
 * words in little-endian byte order - confirm against the key schedule.
 */
28462306a36Sopenharmony_ci#define aria_ark_8way(x0, x1, x2, x3,			\
28562306a36Sopenharmony_ci		      x4, x5, x6, x7,			\
28662306a36Sopenharmony_ci		      t0, rk, idx, round)		\
28762306a36Sopenharmony_ci	/* AddRoundKey */                               \
28862306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
28962306a36Sopenharmony_ci	vpxor t0, x0, x0;				\
29062306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
29162306a36Sopenharmony_ci	vpxor t0, x1, x1;				\
29262306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
29362306a36Sopenharmony_ci	vpxor t0, x2, x2;				\
29462306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
29562306a36Sopenharmony_ci	vpxor t0, x3, x3;				\
29662306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
29762306a36Sopenharmony_ci	vpxor t0, x4, x4;				\
29862306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
29962306a36Sopenharmony_ci	vpxor t0, x5, x5;				\
30062306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
30162306a36Sopenharmony_ci	vpxor t0, x6, x6;				\
30262306a36Sopenharmony_ci	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
30362306a36Sopenharmony_ci	vpxor t0, x7, x7;
30462306a36Sopenharmony_ci
/*
 * aria_sbox_8way_gfni - substitution layer implemented with GFNI affine
 * instructions.  Only built when CONFIG_AS_GFNI is defined.
 *
 * x0..x7: registers transformed in place.
 * t0..t4: scratch registers, loaded with 8-byte bit matrices broadcast to
 *         every quadword; clobbered.
 * t5..t7: accepted but not used by this macro.
 *
 * Per register pair:
 *   x1, x5: affine-of-inverse with the s2 matrix and tf_s2_const.
 *   x2, x6: affine with the inv matrix and tf_inv_const, then
 *           affine-of-inverse with the id matrix and constant 0.
 *   x0, x4: affine-of-inverse with the aff matrix and tf_aff_const.
 *   x3, x7: affine with the x2 matrix and tf_x2_const, then
 *           affine-of-inverse with the id matrix and constant 0.
 *
 * The tf_*_const symbols and .Ltf_*_bitmatrix tables are defined outside
 * this view.
 * NOTE(review): presumably these four transforms are ARIA's two S-boxes
 * and their inverses, and the id matrix is the identity so that step is a
 * plain field inversion - confirm against the table definitions.
 */
30562306a36Sopenharmony_ci#ifdef CONFIG_AS_GFNI
30662306a36Sopenharmony_ci#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
30762306a36Sopenharmony_ci			    x4, x5, x6, x7,		\
30862306a36Sopenharmony_ci			    t0, t1, t2, t3,		\
30962306a36Sopenharmony_ci			    t4, t5, t6, t7)		\
31062306a36Sopenharmony_ci	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
31162306a36Sopenharmony_ci	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
31262306a36Sopenharmony_ci	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
31362306a36Sopenharmony_ci	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
31462306a36Sopenharmony_ci	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
31562306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
31662306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
31762306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
31862306a36Sopenharmony_ci	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
31962306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x2, x2;		\
32062306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x6, x6;		\
32162306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
32262306a36Sopenharmony_ci	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
32362306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
32462306a36Sopenharmony_ci	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
32562306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x3, x3;		\
32662306a36Sopenharmony_ci	vgf2p8affineinvqb $0, t2, x7, x7
32762306a36Sopenharmony_ci
32862306a36Sopenharmony_ci#endif /* CONFIG_AS_GFNI */
/*
 * aria_sbox_8way - substitution layer implemented with AES-NI plus nibble
 * table lookups.
 *
 * x0..x7: 256-bit registers transformed in place.
 * t0..t7: scratch registers, all clobbered.
 *
 * The AES instructions are issued on 128-bit operands: the upper half of
 * each register is extracted into t6, both halves are processed and the
 * upper half is inserted back.  The xmm operand names are formed by
 * pasting "_x" onto the argument, so every x and t argument must be a
 * register name covered by the ymmN_x defines at the top of the file.
 *
 * t7 is zeroed and used as the round key, so vaesenclast/vaesdeclast add
 * no key material.
 *
 * Per register pair:
 *   x0, x4: vaesenclast, then byte shuffle with .Linv_shift_row.
 *   x1, x5: vaesenclast, shuffle with .Linv_shift_row, then filter_8bit()
 *           with the .Ltf_{lo,hi}__inv_aff__and__s2 tables.
 *   x2, x6: vaesdeclast, then byte shuffle with .Lshift_row.
 *   x3, x7: shuffle with .Lshift_row, filter_8bit() with the
 *           .Ltf_{lo,hi}__x2__and__fwd_aff tables, then vaesdeclast.
 *
 * NOTE(review): the shuffles presumably cancel the row shift performed by
 * the AES instructions so that only the byte substitution remains - the
 * mask contents are outside this view.
 *
 * The last instruction ends with a line continuation, so the macro
 * definition extends onto the line that follows it.
 */
32962306a36Sopenharmony_ci#define aria_sbox_8way(x0, x1, x2, x3,			\
33062306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
33162306a36Sopenharmony_ci		       t0, t1, t2, t3,			\
33262306a36Sopenharmony_ci		       t4, t5, t6, t7)			\
	/* t7 = 0 serves as the all-zero round key below */ \
33362306a36Sopenharmony_ci	vpxor t7, t7, t7;				\
33462306a36Sopenharmony_ci	vpxor t6, t6, t6;				\
33562306a36Sopenharmony_ci	vbroadcasti128 .Linv_shift_row(%rip), t0;	\
33662306a36Sopenharmony_ci	vbroadcasti128 .Lshift_row(%rip), t1;		\
33762306a36Sopenharmony_ci	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
33862306a36Sopenharmony_ci	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
33962306a36Sopenharmony_ci	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
34062306a36Sopenharmony_ci	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
34162306a36Sopenharmony_ci							\
	/* x0, x4, x1, x5: vaesenclast on both 128-bit halves */ \
34262306a36Sopenharmony_ci	vextracti128 $1, x0, t6##_x;			\
34362306a36Sopenharmony_ci	vaesenclast t7##_x, x0##_x, x0##_x;		\
34462306a36Sopenharmony_ci	vaesenclast t7##_x, t6##_x, t6##_x;		\
34562306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x0, x0;			\
34662306a36Sopenharmony_ci							\
34762306a36Sopenharmony_ci	vextracti128 $1, x4, t6##_x;			\
34862306a36Sopenharmony_ci	vaesenclast t7##_x, x4##_x, x4##_x;		\
34962306a36Sopenharmony_ci	vaesenclast t7##_x, t6##_x, t6##_x;		\
35062306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x4, x4;			\
35162306a36Sopenharmony_ci							\
35262306a36Sopenharmony_ci	vextracti128 $1, x1, t6##_x;			\
35362306a36Sopenharmony_ci	vaesenclast t7##_x, x1##_x, x1##_x;		\
35462306a36Sopenharmony_ci	vaesenclast t7##_x, t6##_x, t6##_x;		\
35562306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x1, x1;			\
35662306a36Sopenharmony_ci							\
35762306a36Sopenharmony_ci	vextracti128 $1, x5, t6##_x;			\
35862306a36Sopenharmony_ci	vaesenclast t7##_x, x5##_x, x5##_x;		\
35962306a36Sopenharmony_ci	vaesenclast t7##_x, t6##_x, t6##_x;		\
36062306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x5, x5;			\
36162306a36Sopenharmony_ci							\
	/* x2, x6: vaesdeclast on both 128-bit halves */ \
36262306a36Sopenharmony_ci	vextracti128 $1, x2, t6##_x;			\
36362306a36Sopenharmony_ci	vaesdeclast t7##_x, x2##_x, x2##_x;		\
36462306a36Sopenharmony_ci	vaesdeclast t7##_x, t6##_x, t6##_x;		\
36562306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x2, x2;			\
36662306a36Sopenharmony_ci							\
36762306a36Sopenharmony_ci	vextracti128 $1, x6, t6##_x;			\
36862306a36Sopenharmony_ci	vaesdeclast t7##_x, x6##_x, x6##_x;		\
36962306a36Sopenharmony_ci	vaesdeclast t7##_x, t6##_x, t6##_x;		\
37062306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x6, x6;			\
37162306a36Sopenharmony_ci							\
	/* t6 becomes the nibble mask for filter_8bit() */ \
37262306a36Sopenharmony_ci	vpbroadcastd .L0f0f0f0f(%rip), t6;		\
37362306a36Sopenharmony_ci							\
37462306a36Sopenharmony_ci	/* AES inverse shift rows */			\
37562306a36Sopenharmony_ci	vpshufb t0, x0, x0;				\
37662306a36Sopenharmony_ci	vpshufb t0, x4, x4;				\
37762306a36Sopenharmony_ci	vpshufb t0, x1, x1;				\
37862306a36Sopenharmony_ci	vpshufb t0, x5, x5;				\
	/* these four use the .Lshift_row mask instead */ \
37962306a36Sopenharmony_ci	vpshufb t1, x3, x3;				\
38062306a36Sopenharmony_ci	vpshufb t1, x7, x7;				\
38162306a36Sopenharmony_ci	vpshufb t1, x2, x2;				\
38262306a36Sopenharmony_ci	vpshufb t1, x6, x6;				\
38362306a36Sopenharmony_ci							\
	/* t0 is reused as the filter_8bit() scratch from here on */ \
38462306a36Sopenharmony_ci	/* affine transformation for S2 */		\
38562306a36Sopenharmony_ci	filter_8bit(x1, t2, t3, t6, t0);		\
38662306a36Sopenharmony_ci	/* affine transformation for S2 */		\
38762306a36Sopenharmony_ci	filter_8bit(x5, t2, t3, t6, t0);		\
38862306a36Sopenharmony_ci							\
38962306a36Sopenharmony_ci	/* affine transformation for X2 */		\
39062306a36Sopenharmony_ci	filter_8bit(x3, t4, t5, t6, t0);		\
39162306a36Sopenharmony_ci	/* affine transformation for X2 */		\
39262306a36Sopenharmony_ci	filter_8bit(x7, t4, t5, t6, t0);		\
39362306a36Sopenharmony_ci							\
	/* x3, x7: vaesdeclast comes after the table transform */ \
39462306a36Sopenharmony_ci	vpxor t6, t6, t6;				\
39562306a36Sopenharmony_ci	vextracti128 $1, x3, t6##_x;			\
39662306a36Sopenharmony_ci	vaesdeclast t7##_x, x3##_x, x3##_x;		\
39762306a36Sopenharmony_ci	vaesdeclast t7##_x, t6##_x, t6##_x;		\
39862306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x3, x3;			\
39962306a36Sopenharmony_ci							\
40062306a36Sopenharmony_ci	vextracti128 $1, x7, t6##_x;			\
40162306a36Sopenharmony_ci	vaesdeclast t7##_x, x7##_x, x7##_x;		\
40262306a36Sopenharmony_ci	vaesdeclast t7##_x, t6##_x, t6##_x;		\
40362306a36Sopenharmony_ci	vinserti128 $1, t6##_x, x7, x7;			\
40462306a36Sopenharmony_ci
/*
 * aria_diff_m - mix four registers so that each output is the XOR of three
 * of the inputs.
 *
 * x0..x3: updated in place.  With the input values named a, b, c, d:
 *           x0 = a ^ b ^ c
 *           x1 = a ^ b ^ d
 *           x2 = a ^ c ^ d
 *           x3 = b ^ c ^ d
 * t0..t3: scratch registers, clobbered.
 *
 * The existing comments describe the same operation as 32-bit rotations;
 * with one byte position per register, a rotation amounts to choosing a
 * different register.
 */
40562306a36Sopenharmony_ci#define aria_diff_m(x0, x1, x2, x3,			\
40662306a36Sopenharmony_ci		    t0, t1, t2, t3)			\
40762306a36Sopenharmony_ci	/* T = rotr32(X, 8); */				\
40862306a36Sopenharmony_ci	/* X ^= T */					\
40962306a36Sopenharmony_ci	vpxor x0, x3, t0;				\
41062306a36Sopenharmony_ci	vpxor x1, x0, t1;				\
41162306a36Sopenharmony_ci	vpxor x2, x1, t2;				\
41262306a36Sopenharmony_ci	vpxor x3, x2, t3;				\
41362306a36Sopenharmony_ci	/* X = T ^ rotr(X, 16); */			\
41462306a36Sopenharmony_ci	vpxor t2, x0, x0;				\
	/* x1 is still the original value here; result is parked in t3 */ \
41562306a36Sopenharmony_ci	vpxor x1, t3, t3;				\
41662306a36Sopenharmony_ci	vpxor t0, x2, x2;				\
41762306a36Sopenharmony_ci	vpxor t1, x3, x1;				\
	/* x3 was needed as an input above, so it is written last */ \
41862306a36Sopenharmony_ci	vmovdqu t3, x3;
41962306a36Sopenharmony_ci
/*
 * aria_diff_word - XOR-mix four groups of four registers.
 *
 * The inline comments name the groups t0..t3:
 *   t0 = x0..x3, t1 = x4..x7, t2 = y0..y3, t3 = y4..y7.
 *
 * Each step XORs one whole group into another, register by register, in
 * this order:
 *   t1 ^= t2; t2 ^= t3; t0 ^= t1; t3 ^= t1; t2 ^= t0; t1 ^= t2;
 *
 * All 16 registers are updated in place; no scratch registers are used.
 */
42062306a36Sopenharmony_ci#define aria_diff_word(x0, x1, x2, x3,			\
42162306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
42262306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
42362306a36Sopenharmony_ci		       y4, y5, y6, y7)			\
42462306a36Sopenharmony_ci	/* t1 ^= t2; */					\
42562306a36Sopenharmony_ci	vpxor y0, x4, x4;				\
42662306a36Sopenharmony_ci	vpxor y1, x5, x5;				\
42762306a36Sopenharmony_ci	vpxor y2, x6, x6;				\
42862306a36Sopenharmony_ci	vpxor y3, x7, x7;				\
42962306a36Sopenharmony_ci							\
43062306a36Sopenharmony_ci	/* t2 ^= t3; */					\
43162306a36Sopenharmony_ci	vpxor y4, y0, y0;				\
43262306a36Sopenharmony_ci	vpxor y5, y1, y1;				\
43362306a36Sopenharmony_ci	vpxor y6, y2, y2;				\
43462306a36Sopenharmony_ci	vpxor y7, y3, y3;				\
43562306a36Sopenharmony_ci							\
43662306a36Sopenharmony_ci	/* t0 ^= t1; */					\
43762306a36Sopenharmony_ci	vpxor x4, x0, x0;				\
43862306a36Sopenharmony_ci	vpxor x5, x1, x1;				\
43962306a36Sopenharmony_ci	vpxor x6, x2, x2;				\
44062306a36Sopenharmony_ci	vpxor x7, x3, x3;				\
44162306a36Sopenharmony_ci							\
44262306a36Sopenharmony_ci	/* t3 ^= t1; */					\
44362306a36Sopenharmony_ci	vpxor x4, y4, y4;				\
44462306a36Sopenharmony_ci	vpxor x5, y5, y5;				\
44562306a36Sopenharmony_ci	vpxor x6, y6, y6;				\
44662306a36Sopenharmony_ci	vpxor x7, y7, y7;				\
44762306a36Sopenharmony_ci							\
44862306a36Sopenharmony_ci	/* t2 ^= t0; */					\
44962306a36Sopenharmony_ci	vpxor x0, y0, y0;				\
45062306a36Sopenharmony_ci	vpxor x1, y1, y1;				\
45162306a36Sopenharmony_ci	vpxor x2, y2, y2;				\
45262306a36Sopenharmony_ci	vpxor x3, y3, y3;				\
45362306a36Sopenharmony_ci							\
45462306a36Sopenharmony_ci	/* t1 ^= t2; */					\
45562306a36Sopenharmony_ci	vpxor y0, x4, x4;				\
45662306a36Sopenharmony_ci	vpxor y1, x5, x5;				\
45762306a36Sopenharmony_ci	vpxor y2, x6, x6;				\
45862306a36Sopenharmony_ci	vpxor y3, x7, x7;
45962306a36Sopenharmony_ci
/*
 * aria_fe - one round built from AddRoundKey, the AES-NI substitution
 * layer (arguments rotated by two within each group of four) and the
 * diffusion steps.
 * NOTE(review): presumably ARIA's even round function, going by the name.
 *
 * x0..x7:  on entry, the half of the state that pairs with key bytes
 *          8..15; the other half is expected in mem_tmp slots 0..7.
 * y0..y7:  scratch during the substitution step; afterwards they receive
 *          the half processed first (reloaded from slots 8..15).
 * mem_tmp: base register of the temporary buffer (16 slots of 32 bytes).
 * rk:      base register of the round-key array.
 * round:   round number, forwarded to aria_ark_8way().
 *
 * Sequence: process the register half and park it in slots 8..15, reload
 * slots 0..7 and process them the same way, reload the parked half into
 * y0..y7, then run aria_diff_word() twice - the second time with permuted
 * register arguments, which the inline comment labels aria_diff_byte().
 * Finally the x registers are written to slots 0..7 in the order
 * x3, x2, x1, x0, x6, x7, x4, x5.  The y registers are left in registers
 * and are not written back here.
 */
46062306a36Sopenharmony_ci#define aria_fe(x0, x1, x2, x3,				\
46162306a36Sopenharmony_ci		x4, x5, x6, x7,				\
46262306a36Sopenharmony_ci		y0, y1, y2, y3,				\
46362306a36Sopenharmony_ci		y4, y5, y6, y7,				\
46462306a36Sopenharmony_ci		mem_tmp, rk, round)			\
	/* first half: key bytes 8..15, y0 is the broadcast scratch */ \
46562306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
46662306a36Sopenharmony_ci		      y0, rk, 8, round);		\
46762306a36Sopenharmony_ci							\
46862306a36Sopenharmony_ci	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
46962306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7);	\
47062306a36Sopenharmony_ci							\
47162306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
47262306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
47362306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
47462306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
47562306a36Sopenharmony_ci			      mem_tmp, 8);		\
47662306a36Sopenharmony_ci							\
	/* second half: slots 0..7 with key bytes 0..7 */ \
47762306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
47862306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
47962306a36Sopenharmony_ci			     mem_tmp, 0);		\
48062306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
48162306a36Sopenharmony_ci		      y0, rk, 0, round);		\
48262306a36Sopenharmony_ci							\
48362306a36Sopenharmony_ci	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
48462306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7);	\
48562306a36Sopenharmony_ci							\
48662306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
48762306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
48862306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
48962306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
49062306a36Sopenharmony_ci			      mem_tmp, 0);		\
	/* bring the first half back so all 16 registers are live */ \
49162306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
49262306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
49362306a36Sopenharmony_ci			     mem_tmp, 8);		\
49462306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
49562306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
49662306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
49762306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
49862306a36Sopenharmony_ci	/* aria_diff_byte()				\
49962306a36Sopenharmony_ci	 * T3 = ABCD -> BADC				\
50062306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
50162306a36Sopenharmony_ci	 * T0 = ABCD -> CDAB				\
50262306a36Sopenharmony_ci	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
50362306a36Sopenharmony_ci	 * T1 = ABCD -> DCBA				\
50462306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
50562306a36Sopenharmony_ci	 */						\
50662306a36Sopenharmony_ci	aria_diff_word(x2, x3, x0, x1,			\
50762306a36Sopenharmony_ci		       x7, x6, x5, x4,			\
50862306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
50962306a36Sopenharmony_ci		       y5, y4, y7, y6);			\
51062306a36Sopenharmony_ci	aria_store_state_8way(x3, x2, x1, x0,		\
51162306a36Sopenharmony_ci			      x6, x7, x4, x5,		\
51262306a36Sopenharmony_ci			      mem_tmp, 0);
51362306a36Sopenharmony_ci
/*
 * aria_fo - one round built from AddRoundKey, the AES-NI substitution
 * layer (arguments in natural order) and the diffusion steps.
 * NOTE(review): presumably ARIA's odd round function, going by the name.
 *
 * x0..x7:  on entry, the half of the state that pairs with key bytes
 *          8..15; the other half is expected in mem_tmp slots 0..7.
 * y0..y7:  scratch during the substitution step; afterwards they receive
 *          the half processed first (reloaded from slots 8..15).
 * mem_tmp: base register of the temporary buffer (16 slots of 32 bytes).
 * rk:      base register of the round-key array.
 * round:   round number, forwarded to aria_ark_8way().
 *
 * The structure matches aria_fe() except that aria_sbox_8way() receives
 * x0..x7 unrotated and the second aria_diff_word() call uses a different
 * register permutation (see the inline aria_diff_byte() comment).
 * The x registers are finally written to slots 0..7 in the order
 * x3, x2, x1, x0, x6, x7, x4, x5; the y registers are not written back
 * here.
 */
51462306a36Sopenharmony_ci#define aria_fo(x0, x1, x2, x3,				\
51562306a36Sopenharmony_ci		x4, x5, x6, x7,				\
51662306a36Sopenharmony_ci		y0, y1, y2, y3,				\
51762306a36Sopenharmony_ci		y4, y5, y6, y7,				\
51862306a36Sopenharmony_ci		mem_tmp, rk, round)			\
	/* first half: key bytes 8..15, y0 is the broadcast scratch */ \
51962306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
52062306a36Sopenharmony_ci		      y0, rk, 8, round);		\
52162306a36Sopenharmony_ci							\
52262306a36Sopenharmony_ci	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
52362306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7);	\
52462306a36Sopenharmony_ci							\
52562306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
52662306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
52762306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
52862306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
52962306a36Sopenharmony_ci			      mem_tmp, 8);		\
53062306a36Sopenharmony_ci							\
	/* second half: slots 0..7 with key bytes 0..7 */ \
53162306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
53262306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
53362306a36Sopenharmony_ci			     mem_tmp, 0);		\
53462306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
53562306a36Sopenharmony_ci		      y0, rk, 0, round);		\
53662306a36Sopenharmony_ci							\
53762306a36Sopenharmony_ci	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
53862306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7);	\
53962306a36Sopenharmony_ci							\
54062306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
54162306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
54262306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
54362306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
54462306a36Sopenharmony_ci			      mem_tmp, 0);		\
	/* bring the first half back so all 16 registers are live */ \
54562306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
54662306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
54762306a36Sopenharmony_ci			     mem_tmp, 8);		\
54862306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
54962306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
55062306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
55162306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
55262306a36Sopenharmony_ci	/* aria_diff_byte()				\
55362306a36Sopenharmony_ci	 * T1 = ABCD -> BADC				\
55462306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
55562306a36Sopenharmony_ci	 * T2 = ABCD -> CDAB				\
55662306a36Sopenharmony_ci	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
55762306a36Sopenharmony_ci	 * T3 = ABCD -> DCBA				\
55862306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
55962306a36Sopenharmony_ci	 */						\
56062306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
56162306a36Sopenharmony_ci		       x5, x4, x7, x6,			\
56262306a36Sopenharmony_ci		       y2, y3, y0, y1,			\
56362306a36Sopenharmony_ci		       y7, y6, y5, y4);			\
56462306a36Sopenharmony_ci	aria_store_state_8way(x3, x2, x1, x0,		\
56562306a36Sopenharmony_ci			      x6, x7, x4, x5,		\
56662306a36Sopenharmony_ci			      mem_tmp, 0);
56762306a36Sopenharmony_ci
/*
 * aria_ff - round without diffusion: AddRoundKey, the AES-NI substitution
 * layer (arguments rotated by two within each group of four), then a
 * second AddRoundKey with a different round key.
 * NOTE(review): presumably ARIA's final round, going by the name and the
 * last_round parameter.
 *
 * x0..x7:    on entry, the half of the state that pairs with key bytes
 *            8..15; the other half is expected in mem_tmp slots 0..7.
 *            On exit they hold the processed slots-0..7 half.
 * y0..y7:    scratch during the substitution step; on exit they hold the
 *            half processed first, reloaded from slots 8..15.
 * mem_tmp:   base register of the temporary buffer (16 slots of 32 bytes).
 * rk:        base register of the round-key array.
 * round:     round number of the key applied before the substitution.
 * last_round: round number of the key applied after the substitution.
 *
 * Unlike aria_fe()/aria_fo(), the second half is not stored back to
 * mem_tmp; the full result is left in the 16 registers.
 */
56862306a36Sopenharmony_ci#define aria_ff(x0, x1, x2, x3,				\
56962306a36Sopenharmony_ci		x4, x5, x6, x7,				\
57062306a36Sopenharmony_ci		y0, y1, y2, y3,				\
57162306a36Sopenharmony_ci		y4, y5, y6, y7,				\
57262306a36Sopenharmony_ci		mem_tmp, rk, round, last_round)		\
	/* first half: key bytes 8..15 of both round keys */ \
57362306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
57462306a36Sopenharmony_ci		      y0, rk, 8, round);		\
57562306a36Sopenharmony_ci							\
57662306a36Sopenharmony_ci	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
57762306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7);	\
57862306a36Sopenharmony_ci							\
57962306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
58062306a36Sopenharmony_ci		      y0, rk, 8, last_round);		\
58162306a36Sopenharmony_ci							\
58262306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
58362306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
58462306a36Sopenharmony_ci			      mem_tmp, 8);		\
58562306a36Sopenharmony_ci							\
	/* second half: slots 0..7 with key bytes 0..7 of both round keys */ \
58662306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
58762306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
58862306a36Sopenharmony_ci			     mem_tmp, 0);		\
58962306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
59062306a36Sopenharmony_ci		      y0, rk, 0, round);		\
59162306a36Sopenharmony_ci							\
59262306a36Sopenharmony_ci	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
59362306a36Sopenharmony_ci		       y0, y1, y2, y3, y4, y5, y6, y7);	\
59462306a36Sopenharmony_ci							\
59562306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
59662306a36Sopenharmony_ci		      y0, rk, 0, last_round);		\
59762306a36Sopenharmony_ci							\
	/* reload the first half so the whole state is in registers */ \
59862306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
59962306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
60062306a36Sopenharmony_ci			     mem_tmp, 8);
60162306a36Sopenharmony_ci#ifdef CONFIG_AS_GFNI
/*
 * aria_fe_gfni - one full round built on aria_sbox_8way_gfni, with the
 * S-box macro fed x2, x3, x0, x1, x6, x7, x4, x5 (register pairs swapped
 * relative to aria_fo_gfni, which passes x0..x7 in natural order).
 *
 * Sequence:
 *   1. Group with index 8 (in x0..x7 on entry): ark -> sbox ->
 *      aria_diff_m on x0..x3 and on x4..x7 -> stored to mem_tmp slot 8.
 *   2. Group with index 0 (loaded from mem_tmp slot 0): same three steps.
 *   3. Group 8 is reloaded into y0..y7 and aria_diff_word runs over all
 *      16 registers.
 *   4. aria_diff_byte is not a separate call: it is expressed by passing the
 *      registers to a second aria_diff_word in permuted order (see the
 *      comment in the body), and by the permuted order of the final store.
 *
 * y0..y3 are passed to aria_diff_m and y0..y7 to the S-box macro before the
 * reload in step 3, so their contents on entry are not preserved.
 *
 * NOTE(review): within this macro the store of x0..x7 to slot 0 that
 * precedes the y0..y7 reload is overwritten by the final store to slot 0
 * with no visible load in between -- confirm whether it is still required.
 */
60262306a36Sopenharmony_ci#define aria_fe_gfni(x0, x1, x2, x3,			\
60362306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
60462306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
60562306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
60662306a36Sopenharmony_ci		     mem_tmp, rk, round)		\
60762306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
60862306a36Sopenharmony_ci		      y0, rk, 8, round);		\
60962306a36Sopenharmony_ci							\
61062306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
61162306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
61262306a36Sopenharmony_ci			    y0, y1, y2, y3,		\
61362306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
61462306a36Sopenharmony_ci							\
61562306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
61662306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
61762306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
61862306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
61962306a36Sopenharmony_ci			      mem_tmp, 8);		\
62062306a36Sopenharmony_ci							\
62162306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
62262306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
62362306a36Sopenharmony_ci			     mem_tmp, 0);		\
62462306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
62562306a36Sopenharmony_ci		      y0, rk, 0, round);		\
62662306a36Sopenharmony_ci							\
62762306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
62862306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
62962306a36Sopenharmony_ci			    y0, y1, y2, y3,		\
63062306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
63162306a36Sopenharmony_ci							\
63262306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
63362306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
63462306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
63562306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
63662306a36Sopenharmony_ci			      mem_tmp, 0);		\
63762306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
63862306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
63962306a36Sopenharmony_ci			     mem_tmp, 8);		\
64062306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
64162306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
64262306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
64362306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
64462306a36Sopenharmony_ci	/* aria_diff_byte()				\
64562306a36Sopenharmony_ci	 * T3 = ABCD -> BADC				\
64662306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
64762306a36Sopenharmony_ci	 * T0 = ABCD -> CDAB				\
64862306a36Sopenharmony_ci	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
64962306a36Sopenharmony_ci	 * T1 = ABCD -> DCBA				\
65062306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
65162306a36Sopenharmony_ci	 */						\
65262306a36Sopenharmony_ci	aria_diff_word(x2, x3, x0, x1,			\
65362306a36Sopenharmony_ci		       x7, x6, x5, x4,			\
65462306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
65562306a36Sopenharmony_ci		       y5, y4, y7, y6);			\
65662306a36Sopenharmony_ci	aria_store_state_8way(x3, x2, x1, x0,		\
65762306a36Sopenharmony_ci			      x6, x7, x4, x5,		\
65862306a36Sopenharmony_ci			      mem_tmp, 0);
65962306a36Sopenharmony_ci
/*
 * aria_fo_gfni - one full round built on aria_sbox_8way_gfni, with the
 * S-box macro fed x0..x7 in natural order (aria_fe_gfni passes them with
 * register pairs swapped).
 *
 * Sequence:
 *   1. Group with index 8 (in x0..x7 on entry): ark -> sbox ->
 *      aria_diff_m on x0..x3 and on x4..x7 -> stored to mem_tmp slot 8.
 *   2. Group with index 0 (loaded from mem_tmp slot 0): same three steps.
 *   3. Group 8 is reloaded into y0..y7 and aria_diff_word runs over all
 *      16 registers.
 *   4. aria_diff_byte is not a separate call: it is expressed by passing the
 *      registers to a second aria_diff_word in permuted order (see the
 *      comment in the body), and by the permuted order of the final store.
 *
 * y0..y3 are passed to aria_diff_m and y0..y7 to the S-box macro before the
 * reload in step 3, so their contents on entry are not preserved.
 */
66062306a36Sopenharmony_ci#define aria_fo_gfni(x0, x1, x2, x3,			\
66162306a36Sopenharmony_ci		     x4, x5, x6, x7,			\
66262306a36Sopenharmony_ci		     y0, y1, y2, y3,			\
66362306a36Sopenharmony_ci		     y4, y5, y6, y7,			\
66462306a36Sopenharmony_ci		     mem_tmp, rk, round)		\
66562306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
66662306a36Sopenharmony_ci		      y0, rk, 8, round);		\
66762306a36Sopenharmony_ci							\
66862306a36Sopenharmony_ci	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
66962306a36Sopenharmony_ci			    x4, x5, x6, x7,		\
67062306a36Sopenharmony_ci			    y0, y1, y2, y3,		\
67162306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
67262306a36Sopenharmony_ci							\
67362306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
67462306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
67562306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
67662306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
67762306a36Sopenharmony_ci			      mem_tmp, 8);		\
67862306a36Sopenharmony_ci							\
67962306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
68062306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
68162306a36Sopenharmony_ci			     mem_tmp, 0);		\
68262306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
68362306a36Sopenharmony_ci		      y0, rk, 0, round);		\
68462306a36Sopenharmony_ci							\
68562306a36Sopenharmony_ci	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
68662306a36Sopenharmony_ci			    x4, x5, x6, x7,		\
68762306a36Sopenharmony_ci			    y0, y1, y2, y3,		\
68862306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
68962306a36Sopenharmony_ci							\
69062306a36Sopenharmony_ci	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
69162306a36Sopenharmony_ci	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
69262306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
69362306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
69462306a36Sopenharmony_ci			      mem_tmp, 0);		\
69562306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
69662306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
69762306a36Sopenharmony_ci			     mem_tmp, 8);		\
69862306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
69962306a36Sopenharmony_ci		       x4, x5, x6, x7,			\
70062306a36Sopenharmony_ci		       y0, y1, y2, y3,			\
70162306a36Sopenharmony_ci		       y4, y5, y6, y7);			\
70262306a36Sopenharmony_ci	/* aria_diff_byte()				\
70362306a36Sopenharmony_ci	 * T1 = ABCD -> BADC				\
70462306a36Sopenharmony_ci	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
70562306a36Sopenharmony_ci	 * T2 = ABCD -> CDAB				\
70662306a36Sopenharmony_ci	 * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1	\
70762306a36Sopenharmony_ci	 * T3 = ABCD -> DCBA				\
70862306a36Sopenharmony_ci	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
70962306a36Sopenharmony_ci	 */						\
71062306a36Sopenharmony_ci	aria_diff_word(x0, x1, x2, x3,			\
71162306a36Sopenharmony_ci		       x5, x4, x7, x6,			\
71262306a36Sopenharmony_ci		       y2, y3, y0, y1,			\
71362306a36Sopenharmony_ci		       y7, y6, y5, y4);			\
71462306a36Sopenharmony_ci	aria_store_state_8way(x3, x2, x1, x0,		\
71562306a36Sopenharmony_ci			      x6, x7, x4, x5,		\
71662306a36Sopenharmony_ci			      mem_tmp, 0);
71762306a36Sopenharmony_ci
/*
 * aria_ff_gfni - final round macro using aria_sbox_8way_gfni (contains no
 * aria_diff_* step).
 *
 * Same shape as aria_ff with the S-box macro replaced:
 *   1. Group with index 8 (in x0..x7 on entry):
 *      ark(round) -> sbox -> ark(last_round), saved to mem_tmp slot 8.
 *   2. Group with index 0 (loaded from mem_tmp slot 0):
 *      ark(round) -> sbox -> ark(last_round), result stays in x0..x7.
 *   3. The group saved in step 1 is reloaded into y0..y7.
 *
 * y0..y7 are handed to the S-box macro and y0 to aria_ark_8way before the
 * reload, so their contents on entry are not preserved.
 */
71862306a36Sopenharmony_ci#define aria_ff_gfni(x0, x1, x2, x3,			\
71962306a36Sopenharmony_ci		x4, x5, x6, x7,				\
72062306a36Sopenharmony_ci		y0, y1, y2, y3,				\
72162306a36Sopenharmony_ci		y4, y5, y6, y7,				\
72262306a36Sopenharmony_ci		mem_tmp, rk, round, last_round)		\
72362306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
72462306a36Sopenharmony_ci		      y0, rk, 8, round);		\
72562306a36Sopenharmony_ci							\
72662306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
72762306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
72862306a36Sopenharmony_ci			    y0, y1, y2, y3,		\
72962306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
73062306a36Sopenharmony_ci							\
73162306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
73262306a36Sopenharmony_ci		      y0, rk, 8, last_round);		\
73362306a36Sopenharmony_ci							\
73462306a36Sopenharmony_ci	aria_store_state_8way(x0, x1, x2, x3,		\
73562306a36Sopenharmony_ci			      x4, x5, x6, x7,		\
73662306a36Sopenharmony_ci			      mem_tmp, 8);		\
73762306a36Sopenharmony_ci							\
73862306a36Sopenharmony_ci	aria_load_state_8way(x0, x1, x2, x3,		\
73962306a36Sopenharmony_ci			     x4, x5, x6, x7,		\
74062306a36Sopenharmony_ci			     mem_tmp, 0);		\
74162306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
74262306a36Sopenharmony_ci		      y0, rk, 0, round);		\
74362306a36Sopenharmony_ci							\
74462306a36Sopenharmony_ci	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
74562306a36Sopenharmony_ci			    x6, x7, x4, x5,		\
74662306a36Sopenharmony_ci			    y0, y1, y2, y3,		\
74762306a36Sopenharmony_ci			    y4, y5, y6, y7);		\
74862306a36Sopenharmony_ci							\
74962306a36Sopenharmony_ci	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
75062306a36Sopenharmony_ci		      y0, rk, 0, last_round);		\
75162306a36Sopenharmony_ci							\
75262306a36Sopenharmony_ci	aria_load_state_8way(y0, y1, y2, y3,		\
75362306a36Sopenharmony_ci			     y4, y5, y6, y7,		\
75462306a36Sopenharmony_ci			     mem_tmp, 8);
75562306a36Sopenharmony_ci#endif /* CONFIG_AS_GFNI */
75662306a36Sopenharmony_ci
/*
 * 32-byte byte-index table.
 *
 * SHUFB_BYTES(idx) expands to idx, idx + 4, idx + 8, idx + 12, so each .byte
 * row below is 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15: read as a shuffle
 * control it transposes a 4x4 byte matrix. The row is emitted twice to fill
 * the 32-byte entry (presumably one copy per 128-bit lane of a ymm vpshufb
 * -- the user of this table is defined elsewhere in the file).
 */
75762306a36Sopenharmony_ci.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
75862306a36Sopenharmony_ci.align 32
75962306a36Sopenharmony_ci#define SHUFB_BYTES(idx) \
76062306a36Sopenharmony_ci	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
76162306a36Sopenharmony_ci.Lshufb_16x16b:
76262306a36Sopenharmony_ci	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
76362306a36Sopenharmony_ci	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
76462306a36Sopenharmony_ci
/* 16-byte constants: byte-shuffle masks and affine-transform tables. */
76562306a36Sopenharmony_ci.section	.rodata.cst16, "aM", @progbits, 16
76662306a36Sopenharmony_ci.align 16
76762306a36Sopenharmony_ci/* For isolating SubBytes from AESENCLAST, inverse shift row */
76862306a36Sopenharmony_ci.Linv_shift_row:
76962306a36Sopenharmony_ci	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
77062306a36Sopenharmony_ci	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/* Byte permutation that is the exact inverse of .Linv_shift_row */
77162306a36Sopenharmony_ci.Lshift_row:
77262306a36Sopenharmony_ci	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
77362306a36Sopenharmony_ci	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
77462306a36Sopenharmony_ci/* For CTR-mode IV byteswap */
/* Indices 15..0: reverses the byte order of a 128-bit value */
77562306a36Sopenharmony_ci.Lbswap128_mask:
77662306a36Sopenharmony_ci	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
77762306a36Sopenharmony_ci	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
77862306a36Sopenharmony_ci
/*
 * NOTE(review): each transform below is stored as a pair of 16-byte tables
 * named tf_lo / tf_hi -- presumably 16-entry lookups indexed by the low and
 * high nibble of each byte; confirm against the macro that consumes them.
 */
77962306a36Sopenharmony_ci/* AES inverse affine and S2 combined:
78062306a36Sopenharmony_ci *      1 1 0 0 0 0 0 1     x0     0
78162306a36Sopenharmony_ci *      0 1 0 0 1 0 0 0     x1     0
78262306a36Sopenharmony_ci *      1 1 0 0 1 1 1 1     x2     0
78362306a36Sopenharmony_ci *      0 1 1 0 1 0 0 1     x3     1
78462306a36Sopenharmony_ci *      0 1 0 0 1 1 0 0  *  x4  +  0
78562306a36Sopenharmony_ci *      0 1 0 1 1 0 0 0     x5     0
78662306a36Sopenharmony_ci *      0 0 0 0 0 1 0 1     x6     0
78762306a36Sopenharmony_ci *      1 1 1 0 0 1 1 1     x7     1
78862306a36Sopenharmony_ci */
78962306a36Sopenharmony_ci.Ltf_lo__inv_aff__and__s2:
79062306a36Sopenharmony_ci	.octa 0x92172DA81A9FA520B2370D883ABF8500
79162306a36Sopenharmony_ci.Ltf_hi__inv_aff__and__s2:
79262306a36Sopenharmony_ci	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
79362306a36Sopenharmony_ci
79462306a36Sopenharmony_ci/* X2 and AES forward affine combined:
79562306a36Sopenharmony_ci *      1 0 1 1 0 0 0 1     x0     0
79662306a36Sopenharmony_ci *      0 1 1 1 1 0 1 1     x1     0
79762306a36Sopenharmony_ci *      0 0 0 1 1 0 1 0     x2     1
79862306a36Sopenharmony_ci *      0 1 0 0 0 1 0 0     x3     0
79962306a36Sopenharmony_ci *      0 0 1 1 1 0 1 1  *  x4  +  0
80062306a36Sopenharmony_ci *      0 1 0 0 1 0 0 0     x5     0
80162306a36Sopenharmony_ci *      1 1 0 1 0 0 1 1     x6     0
80262306a36Sopenharmony_ci *      0 1 0 0 1 0 1 0     x7     0
80362306a36Sopenharmony_ci */
80462306a36Sopenharmony_ci.Ltf_lo__x2__and__fwd_aff:
80562306a36Sopenharmony_ci	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
80662306a36Sopenharmony_ci.Ltf_hi__x2__and__fwd_aff:
80762306a36Sopenharmony_ci	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
80862306a36Sopenharmony_ci
80962306a36Sopenharmony_ci#ifdef CONFIG_AS_GFNI
/*
 * 8x8 bit matrices, each packed into one .quad by BM8X8 from eight BV8 rows,
 * plus the additive constant of each affine transform as a tf_*_const macro.
 * BV8 places its first argument in bit 0, so the constants evaluate to the
 * hex values noted next to each definition.
 *
 * NOTE(review): presumably these are the matrix/immediate operands of the
 * GFNI affine instructions issued by aria_sbox_8way_gfni -- that macro is
 * defined elsewhere in the file; confirm there.
 */
81062306a36Sopenharmony_ci.section	.rodata.cst8, "aM", @progbits, 8
81162306a36Sopenharmony_ci.align 8
81262306a36Sopenharmony_ci/* AES affine: */
/* tf_aff_const == 0x63 */
81362306a36Sopenharmony_ci#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
81462306a36Sopenharmony_ci.Ltf_aff_bitmatrix:
81562306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
81662306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
81762306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
81862306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
81962306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
82062306a36Sopenharmony_ci		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
82162306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
82262306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
82362306a36Sopenharmony_ci
82462306a36Sopenharmony_ci/* AES inverse affine: */
/* tf_inv_const == 0x05 */
82562306a36Sopenharmony_ci#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
82662306a36Sopenharmony_ci.Ltf_inv_bitmatrix:
82762306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
82862306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
82962306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
83062306a36Sopenharmony_ci		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
83162306a36Sopenharmony_ci		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
83262306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
83362306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
83462306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci/* S2: */
/* tf_s2_const == 0xe2 */
83762306a36Sopenharmony_ci#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
83862306a36Sopenharmony_ci.Ltf_s2_bitmatrix:
83962306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
84062306a36Sopenharmony_ci		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
84162306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
84262306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
84362306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
84462306a36Sopenharmony_ci		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
84562306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
84662306a36Sopenharmony_ci		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
84762306a36Sopenharmony_ci
84862306a36Sopenharmony_ci/* X2: */
/* tf_x2_const == 0x2c */
84962306a36Sopenharmony_ci#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
85062306a36Sopenharmony_ci.Ltf_x2_bitmatrix:
85162306a36Sopenharmony_ci	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
85262306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
85362306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
85462306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
85562306a36Sopenharmony_ci		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
85662306a36Sopenharmony_ci		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
85762306a36Sopenharmony_ci		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
85862306a36Sopenharmony_ci		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
85962306a36Sopenharmony_ci
86062306a36Sopenharmony_ci/* Identity matrix: */
/* Row i has only bit i set; no additive constant is defined for it. */
86162306a36Sopenharmony_ci.Ltf_id_bitmatrix:
86262306a36Sopenharmony_ci	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
86362306a36Sopenharmony_ci		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
86462306a36Sopenharmony_ci		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
86562306a36Sopenharmony_ci		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
86662306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
86762306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
86862306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
86962306a36Sopenharmony_ci		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
87062306a36Sopenharmony_ci
87162306a36Sopenharmony_ci#endif /* CONFIG_AS_GFNI */
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci/* 4-bit mask */
/*
 * One 32-bit word whose four bytes are all 0x0f, i.e. a mask that keeps the
 * low nibble of every byte. It lives in its own 4-byte mergeable section.
 */
87462306a36Sopenharmony_ci.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
87562306a36Sopenharmony_ci.align 4
87662306a36Sopenharmony_ci.L0f0f0f0f:
87762306a36Sopenharmony_ci	.long 0x0f0f0f0f
87862306a36Sopenharmony_ci
87962306a36Sopenharmony_ci.text
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
88262306a36Sopenharmony_ci	/* input:
88362306a36Sopenharmony_ci	 *      %r9: rk
88462306a36Sopenharmony_ci	 *      %rsi: dst
88562306a36Sopenharmony_ci	 *      %rdx: src
88662306a36Sopenharmony_ci	 *      %ymm0..%ymm15: byte-sliced blocks
88762306a36Sopenharmony_ci	 */
	/*
	 * Additional notes:
	 *  - %rdi (CTX) is also read, for ARIA_CTX_rounds(CTX).
	 *  - %rdx is not referenced by name in this function; the visible
	 *    callers pass it to inpack16_pre before the call.
	 *  - The function leaves %rax = dst and computes %r8 = dst + 8 * 32;
	 *    the visible callers pass %rax to write_output afterwards.
	 *  - %rax (i.e. the dst buffer) is passed as the mem_tmp argument of
	 *    the round macros, so dst doubles as scratch storage.
	 *
	 * Round structure: rounds 0..10 alternate aria_fo / aria_fe for every
	 * round count. After that:
	 *   rounds == 12:  aria_ff(11, 12)
	 *   rounds == 14:  aria_fe(11), aria_fo(12), aria_ff(13, 14)
	 *   otherwise:     aria_fe(11), aria_fo(12), aria_fe(13), aria_fo(14),
	 *                  aria_ff(15, 16)
	 * (Going by the label names, the last two paths are the 192- and
	 * 256-bit key cases.)
	 *
	 * The two register groups swap roles on every round: aria_fo takes
	 * %ymm8..15 as x and %ymm0..7 as y, aria_fe/aria_ff the reverse. From
	 * round 1 on, the first four x registers are passed pairwise swapped
	 * (%ymm1, %ymm0, %ymm3, %ymm2 and %ymm9, %ymm8, %ymm11, %ymm10).
	 */
88862306a36Sopenharmony_ci
88962306a36Sopenharmony_ci	FRAME_BEGIN
89062306a36Sopenharmony_ci
	/* %rax = dst, %r8 = dst + 8 * 32 */
89162306a36Sopenharmony_ci	movq %rsi, %rax;
89262306a36Sopenharmony_ci	leaq 8 * 32(%rax), %r8;
89362306a36Sopenharmony_ci
89462306a36Sopenharmony_ci	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
89562306a36Sopenharmony_ci		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
89662306a36Sopenharmony_ci		      %ymm15, %rax, %r8);
89762306a36Sopenharmony_ci	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
89862306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
89962306a36Sopenharmony_ci		%rax, %r9, 0);
90062306a36Sopenharmony_ci	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
90162306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
90262306a36Sopenharmony_ci		%ymm15, %rax, %r9, 1);
90362306a36Sopenharmony_ci	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
90462306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
90562306a36Sopenharmony_ci		%rax, %r9, 2);
90662306a36Sopenharmony_ci	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
90762306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
90862306a36Sopenharmony_ci		%ymm15, %rax, %r9, 3);
90962306a36Sopenharmony_ci	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
91062306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
91162306a36Sopenharmony_ci		%rax, %r9, 4);
91262306a36Sopenharmony_ci	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
91362306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
91462306a36Sopenharmony_ci		%ymm15, %rax, %r9, 5);
91562306a36Sopenharmony_ci	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
91662306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
91762306a36Sopenharmony_ci		%rax, %r9, 6);
91862306a36Sopenharmony_ci	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
91962306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
92062306a36Sopenharmony_ci		%ymm15, %rax, %r9, 7);
92162306a36Sopenharmony_ci	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
92262306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
92362306a36Sopenharmony_ci		%rax, %r9, 8);
92462306a36Sopenharmony_ci	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
92562306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
92662306a36Sopenharmony_ci		%ymm15, %rax, %r9, 9);
92762306a36Sopenharmony_ci	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
92862306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
92962306a36Sopenharmony_ci		%rax, %r9, 10);
	/* 12 rounds: finish here; any other count continues at .Laria_192 */
93062306a36Sopenharmony_ci	cmpl $12, ARIA_CTX_rounds(CTX);
93162306a36Sopenharmony_ci	jne .Laria_192;
93262306a36Sopenharmony_ci	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
93362306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
93462306a36Sopenharmony_ci		%ymm15, %rax, %r9, 11, 12);
93562306a36Sopenharmony_ci	jmp .Laria_end;
93662306a36Sopenharmony_ci.Laria_192:
93762306a36Sopenharmony_ci	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
93862306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
93962306a36Sopenharmony_ci		%ymm15, %rax, %r9, 11);
94062306a36Sopenharmony_ci	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
94162306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
94262306a36Sopenharmony_ci		%rax, %r9, 12);
	/* 14 rounds: finish here; any other count continues at .Laria_256 */
94362306a36Sopenharmony_ci	cmpl $14, ARIA_CTX_rounds(CTX);
94462306a36Sopenharmony_ci	jne .Laria_256;
94562306a36Sopenharmony_ci	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
94662306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
94762306a36Sopenharmony_ci		%ymm15, %rax, %r9, 13, 14);
94862306a36Sopenharmony_ci	jmp .Laria_end;
94962306a36Sopenharmony_ci.Laria_256:
95062306a36Sopenharmony_ci	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
95162306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
95262306a36Sopenharmony_ci		%ymm15, %rax, %r9, 13);
95362306a36Sopenharmony_ci	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
95462306a36Sopenharmony_ci		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
95562306a36Sopenharmony_ci		%rax, %r9, 14);
95662306a36Sopenharmony_ci	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
95762306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
95862306a36Sopenharmony_ci		%ymm15, %rax, %r9, 15, 16);
95962306a36Sopenharmony_ci.Laria_end:
	/* common exit: undo the byte-slicing, using (%rax) and (%r8) */
96062306a36Sopenharmony_ci	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
96162306a36Sopenharmony_ci			   %ymm9, %ymm13, %ymm0, %ymm5,
96262306a36Sopenharmony_ci			   %ymm10, %ymm14, %ymm3, %ymm6,
96362306a36Sopenharmony_ci			   %ymm11, %ymm15, %ymm2, %ymm7,
96462306a36Sopenharmony_ci			   (%rax), (%r8));
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci	FRAME_END
96762306a36Sopenharmony_ci	RET;
96862306a36Sopenharmony_ciSYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
96962306a36Sopenharmony_ci
97062306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
97162306a36Sopenharmony_ci	/* input:
97262306a36Sopenharmony_ci	 *      %rdi: ctx, CTX
97362306a36Sopenharmony_ci	 *      %rsi: dst
97462306a36Sopenharmony_ci	 *      %rdx: src
97562306a36Sopenharmony_ci	 */
	/*
	 * Entry point for 32-way encryption: selects the encryption round
	 * keys, lets inpack16_pre fill %ymm0..%ymm15 from %rdx, runs the
	 * shared round helper and hands the registers to write_output.
	 */
97662306a36Sopenharmony_ci
97762306a36Sopenharmony_ci	FRAME_BEGIN
97862306a36Sopenharmony_ci
	/* %r9 = address of the enc_key field inside ctx (helper's rk input) */
97962306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
98062306a36Sopenharmony_ci
98162306a36Sopenharmony_ci	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
98262306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
98362306a36Sopenharmony_ci		     %ymm15, %rdx);
98462306a36Sopenharmony_ci
98562306a36Sopenharmony_ci	call __aria_aesni_avx2_crypt_32way;
98662306a36Sopenharmony_ci
	/*
	 * The helper copies %rsi (dst) into %rax. The first four registers
	 * are listed pairwise swapped (%ymm1, %ymm0, %ymm3, %ymm2), the same
	 * order the helper uses for them in its last round.
	 */
98762306a36Sopenharmony_ci	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
98862306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
98962306a36Sopenharmony_ci		     %ymm15, %rax);
99062306a36Sopenharmony_ci
99162306a36Sopenharmony_ci	FRAME_END
99262306a36Sopenharmony_ci	RET;
99362306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
99462306a36Sopenharmony_ci
99562306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
99662306a36Sopenharmony_ci	/* input:
99762306a36Sopenharmony_ci	 *      %rdi: ctx, CTX
99862306a36Sopenharmony_ci	 *      %rsi: dst
99962306a36Sopenharmony_ci	 *      %rdx: src
100062306a36Sopenharmony_ci	 */
	/*
	 * Entry point for 32-way decryption. Identical to the encrypt entry
	 * point except that %r9 is pointed at the dec_key field of ctx; the
	 * round helper itself is shared.
	 */
100162306a36Sopenharmony_ci
100262306a36Sopenharmony_ci	FRAME_BEGIN
100362306a36Sopenharmony_ci
	/* %r9 = address of the dec_key field inside ctx (helper's rk input) */
100462306a36Sopenharmony_ci	leaq ARIA_CTX_dec_key(CTX), %r9;
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
100762306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
100862306a36Sopenharmony_ci		     %ymm15, %rdx);
100962306a36Sopenharmony_ci
101062306a36Sopenharmony_ci	call __aria_aesni_avx2_crypt_32way;
101162306a36Sopenharmony_ci
	/* The helper copies %rsi (dst) into %rax before the rounds run. */
101262306a36Sopenharmony_ci	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
101362306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
101462306a36Sopenharmony_ci		     %ymm15, %rax);
101562306a36Sopenharmony_ci
101662306a36Sopenharmony_ci	FRAME_END
101762306a36Sopenharmony_ci	RET;
101862306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
101962306a36Sopenharmony_ci
102062306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
102162306a36Sopenharmony_ci	/* input:
102262306a36Sopenharmony_ci	 *      %rdi: ctx
102362306a36Sopenharmony_ci	 *      %rsi: dst
102462306a36Sopenharmony_ci	 *      %rdx: src
102562306a36Sopenharmony_ci	 *      %rcx: keystream
102662306a36Sopenharmony_ci	 *      %r8: iv (big endian, 128bit)
102762306a36Sopenharmony_ci	 */
102862306a36Sopenharmony_ci
102962306a36Sopenharmony_ci	FRAME_BEGIN
103062306a36Sopenharmony_ci	movq 8(%r8), %r11;
103162306a36Sopenharmony_ci	bswapq %r11;
103262306a36Sopenharmony_ci
103362306a36Sopenharmony_ci	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
103462306a36Sopenharmony_ci	vpcmpeqd %ymm0, %ymm0, %ymm0;
103562306a36Sopenharmony_ci	vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
103662306a36Sopenharmony_ci	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
103762306a36Sopenharmony_ci
103862306a36Sopenharmony_ci	/* load IV and byteswap */
103962306a36Sopenharmony_ci	vmovdqu (%r8), %xmm7;
104062306a36Sopenharmony_ci	vpshufb %xmm6, %xmm7, %xmm7;
104162306a36Sopenharmony_ci	vmovdqa %xmm7, %xmm3;
104262306a36Sopenharmony_ci	inc_le128(%xmm7, %xmm0, %xmm4);
104362306a36Sopenharmony_ci	vinserti128 $1, %xmm7, %ymm3, %ymm3;
104462306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci	/* check need for handling 64-bit overflow and carry */
104762306a36Sopenharmony_ci	cmpq $(0xffffffffffffffff - 32), %r11;
104862306a36Sopenharmony_ci	ja .Lhandle_ctr_carry;
104962306a36Sopenharmony_ci
105062306a36Sopenharmony_ci	/* construct IVs */
105162306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
105262306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm9;
105362306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
105462306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm10;
105562306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
105662306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm11;
105762306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
105862306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm12;
105962306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
106062306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm13;
106162306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
106262306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm14;
106362306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
106462306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm15;
106562306a36Sopenharmony_ci	vmovdqu %ymm8, (0 * 32)(%rcx);
106662306a36Sopenharmony_ci	vmovdqu %ymm9, (1 * 32)(%rcx);
106762306a36Sopenharmony_ci	vmovdqu %ymm10, (2 * 32)(%rcx);
106862306a36Sopenharmony_ci	vmovdqu %ymm11, (3 * 32)(%rcx);
106962306a36Sopenharmony_ci	vmovdqu %ymm12, (4 * 32)(%rcx);
107062306a36Sopenharmony_ci	vmovdqu %ymm13, (5 * 32)(%rcx);
107162306a36Sopenharmony_ci	vmovdqu %ymm14, (6 * 32)(%rcx);
107262306a36Sopenharmony_ci	vmovdqu %ymm15, (7 * 32)(%rcx);
107362306a36Sopenharmony_ci
107462306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
107562306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm8;
107662306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
107762306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm9;
107862306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
107962306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm10;
108062306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
108162306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm11;
108262306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
108362306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm12;
108462306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
108562306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm13;
108662306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
108762306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm14;
108862306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
108962306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm15;
109062306a36Sopenharmony_ci	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
109162306a36Sopenharmony_ci	vpshufb %xmm6, %xmm3, %xmm3;
109262306a36Sopenharmony_ci	vmovdqu %xmm3, (%r8);
109362306a36Sopenharmony_ci	vmovdqu (0 * 32)(%rcx), %ymm0;
109462306a36Sopenharmony_ci	vmovdqu (1 * 32)(%rcx), %ymm1;
109562306a36Sopenharmony_ci	vmovdqu (2 * 32)(%rcx), %ymm2;
109662306a36Sopenharmony_ci	vmovdqu (3 * 32)(%rcx), %ymm3;
109762306a36Sopenharmony_ci	vmovdqu (4 * 32)(%rcx), %ymm4;
109862306a36Sopenharmony_ci	vmovdqu (5 * 32)(%rcx), %ymm5;
109962306a36Sopenharmony_ci	vmovdqu (6 * 32)(%rcx), %ymm6;
110062306a36Sopenharmony_ci	vmovdqu (7 * 32)(%rcx), %ymm7;
110162306a36Sopenharmony_ci	jmp .Lctr_carry_done;
110262306a36Sopenharmony_ci
110362306a36Sopenharmony_ci	.Lhandle_ctr_carry:
110462306a36Sopenharmony_ci	/* construct IVs */
110562306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
110662306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
110762306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
110862306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
110962306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
111062306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
111162306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
111262306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
111362306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
111462306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
111562306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
111662306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
111762306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
111862306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
111962306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
112062306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
112162306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
112262306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
112362306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
112462306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
112562306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
112662306a36Sopenharmony_ci	vmovdqu %ymm8, (0 * 32)(%rcx);
112762306a36Sopenharmony_ci	vmovdqu %ymm9, (1 * 32)(%rcx);
112862306a36Sopenharmony_ci	vmovdqu %ymm10, (2 * 32)(%rcx);
112962306a36Sopenharmony_ci	vmovdqu %ymm11, (3 * 32)(%rcx);
113062306a36Sopenharmony_ci	vmovdqu %ymm12, (4 * 32)(%rcx);
113162306a36Sopenharmony_ci	vmovdqu %ymm13, (5 * 32)(%rcx);
113262306a36Sopenharmony_ci	vmovdqu %ymm14, (6 * 32)(%rcx);
113362306a36Sopenharmony_ci	vmovdqu %ymm15, (7 * 32)(%rcx);
113462306a36Sopenharmony_ci
113562306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
113662306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
113762306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
113862306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
113962306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
114062306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
114162306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
114262306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
114362306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
114462306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
114562306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
114662306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
114762306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
114862306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
114962306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
115062306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
115162306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
115262306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
115362306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
115462306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
115562306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
115662306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
115762306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
115862306a36Sopenharmony_ci	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
115962306a36Sopenharmony_ci	inc_le128(%ymm3, %ymm0, %ymm4);
116062306a36Sopenharmony_ci	vextracti128 $1, %ymm3, %xmm3;
116162306a36Sopenharmony_ci	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
116262306a36Sopenharmony_ci	vmovdqu %xmm3, (%r8);
116362306a36Sopenharmony_ci	vmovdqu (0 * 32)(%rcx), %ymm0;
116462306a36Sopenharmony_ci	vmovdqu (1 * 32)(%rcx), %ymm1;
116562306a36Sopenharmony_ci	vmovdqu (2 * 32)(%rcx), %ymm2;
116662306a36Sopenharmony_ci	vmovdqu (3 * 32)(%rcx), %ymm3;
116762306a36Sopenharmony_ci	vmovdqu (4 * 32)(%rcx), %ymm4;
116862306a36Sopenharmony_ci	vmovdqu (5 * 32)(%rcx), %ymm5;
116962306a36Sopenharmony_ci	vmovdqu (6 * 32)(%rcx), %ymm6;
117062306a36Sopenharmony_ci	vmovdqu (7 * 32)(%rcx), %ymm7;
117162306a36Sopenharmony_ci
117262306a36Sopenharmony_ci	.Lctr_carry_done:
117362306a36Sopenharmony_ci
117462306a36Sopenharmony_ci	FRAME_END
117562306a36Sopenharmony_ci	RET;
117662306a36Sopenharmony_ciSYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
117762306a36Sopenharmony_ci
/*
 * aria_aesni_avx2_ctr_crypt_32way - CTR-mode wrapper around the non-GFNI
 * cipher core (__aria_aesni_avx2_crypt_32way).
 *
 * Flow, as visible in this function:
 *   1. call the counter generator (%rcx and %r8 are passed through untouched),
 *   2. run the cipher core with the encryption round keys,
 *   3. XOR the 16 result registers with 16 * 32 = 512 bytes read from src,
 *   4. hand the registers to write_output with dst as the target.
 *
 * This function itself overwrites %rsi, %rdx, %r9, %r10 and %r11, in addition
 * to whatever the two callees modify.
 */
117862306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
117962306a36Sopenharmony_ci	/* input:
118062306a36Sopenharmony_ci	 *      %rdi: ctx
118162306a36Sopenharmony_ci	 *      %rsi: dst
118262306a36Sopenharmony_ci	 *      %rdx: src
118362306a36Sopenharmony_ci	 *      %rcx: keystream
118462306a36Sopenharmony_ci	 *      %r8: iv (big endian, 128bit)
118562306a36Sopenharmony_ci	 */
118662306a36Sopenharmony_ci	FRAME_BEGIN
118762306a36Sopenharmony_ci
	/*
	 * NOTE(review): judging by the end of the generator routine, it stores
	 * the advanced counter back through %r8, uses (%rcx) as temporary
	 * storage and returns counter values in %ymm0..%ymm15 - confirm
	 * against its full body.
	 */
118862306a36Sopenharmony_ci	call __aria_aesni_avx2_ctr_gen_keystream_32way;
118962306a36Sopenharmony_ci
	/*
	 * Keep the caller's dst and src in %r10 and %r11, then point both
	 * %rsi and %rdx at the keystream buffer for the cipher core.
	 * %r9 receives the address of the encryption round keys inside ctx.
	 */
119062306a36Sopenharmony_ci	leaq (%rsi), %r10;
119162306a36Sopenharmony_ci	leaq (%rdx), %r11;
119262306a36Sopenharmony_ci	leaq (%rcx), %rsi;
119362306a36Sopenharmony_ci	leaq (%rcx), %rdx;
119462306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
119562306a36Sopenharmony_ci
119662306a36Sopenharmony_ci	call __aria_aesni_avx2_crypt_32way;
119762306a36Sopenharmony_ci
	/*
	 * XOR the registers left by the cipher core with src (%r11), 32 bytes
	 * per register. The register order used here (ymm1, ymm0, ymm3, ymm2,
	 * ymm4..ymm15) is the same order passed to write_output below.
	 */
119862306a36Sopenharmony_ci	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
119962306a36Sopenharmony_ci	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
120062306a36Sopenharmony_ci	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
120162306a36Sopenharmony_ci	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
120262306a36Sopenharmony_ci	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
120362306a36Sopenharmony_ci	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
120462306a36Sopenharmony_ci	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
120562306a36Sopenharmony_ci	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
120662306a36Sopenharmony_ci	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
120762306a36Sopenharmony_ci	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
120862306a36Sopenharmony_ci	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
120962306a36Sopenharmony_ci	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
121062306a36Sopenharmony_ci	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
121162306a36Sopenharmony_ci	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
121262306a36Sopenharmony_ci	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
121362306a36Sopenharmony_ci	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/*
	 * %r10 is the dst pointer saved above. NOTE(review): write_output is
	 * defined outside this section; presumably it stores the 16 registers
	 * to consecutive 32-byte slots at that address - confirm.
	 */
121462306a36Sopenharmony_ci	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
121562306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
121662306a36Sopenharmony_ci		     %ymm15, %r10);
121762306a36Sopenharmony_ci
121862306a36Sopenharmony_ci	FRAME_END
121962306a36Sopenharmony_ci	RET;
122062306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
122162306a36Sopenharmony_ci
122262306a36Sopenharmony_ci#ifdef CONFIG_AS_GFNI
/*
 * __aria_aesni_avx2_gfni_crypt_32way - file-local cipher core built from the
 * *_gfni round macros.
 *
 * In addition to the registers listed in the input comment below, this
 * routine reads ARIA_CTX_rounds(CTX), so %rdi (CTX) must hold the context
 * pointer on entry.
 *
 * It overwrites %rax (copy of %rsi) and %r8 (%rsi + 8 * 32). The encrypt and
 * decrypt wrappers in this file use %rax as their write_output target after
 * this routine returns.
 *
 * The number of rounds executed is selected at run time from the rounds field
 * of the context: 12, 14, or otherwise the longest path.
 */
122362306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
122462306a36Sopenharmony_ci	/* input:
122562306a36Sopenharmony_ci	 *      %r9: rk
122662306a36Sopenharmony_ci	 *      %rsi: dst
122762306a36Sopenharmony_ci	 *      %rdx: src
122862306a36Sopenharmony_ci	 *      %ymm0..%ymm15: 16 byte-sliced blocks
122962306a36Sopenharmony_ci	 */
123062306a36Sopenharmony_ci
123162306a36Sopenharmony_ci	FRAME_BEGIN
123262306a36Sopenharmony_ci
	/*
	 * %rax and %r8 address the first and second 8 * 32 bytes of the dst
	 * buffer. NOTE(review): the macros below receive these pointers,
	 * presumably as scratch space for spilled registers - confirm against
	 * the macro definitions.
	 */
123362306a36Sopenharmony_ci	movq %rsi, %rax;
123462306a36Sopenharmony_ci	leaq 8 * 32(%rax), %r8;
123562306a36Sopenharmony_ci
123662306a36Sopenharmony_ci	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
123762306a36Sopenharmony_ci		      %ymm4, %ymm5, %ymm6, %ymm7,
123862306a36Sopenharmony_ci		      %ymm8, %ymm9, %ymm10, %ymm11,
123962306a36Sopenharmony_ci		      %ymm12, %ymm13, %ymm14,
124062306a36Sopenharmony_ci		      %ymm15, %rax, %r8);
	/*
	 * Alternating fo/fe macro invocations with a trailing index that
	 * counts up from 0 to 10; these are executed on every path.
	 * NOTE(review): the index is presumably the round-key number relative
	 * to %r9 - confirm against the macro definitions.
	 */
124162306a36Sopenharmony_ci	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
124262306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
124362306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
124462306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
124562306a36Sopenharmony_ci		     %rax, %r9, 0);
124662306a36Sopenharmony_ci	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
124762306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
124862306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
124962306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
125062306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 1);
125162306a36Sopenharmony_ci	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
125262306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
125362306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
125462306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
125562306a36Sopenharmony_ci		     %rax, %r9, 2);
125662306a36Sopenharmony_ci	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
125762306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
125862306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
125962306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
126062306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 3);
126162306a36Sopenharmony_ci	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
126262306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
126362306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
126462306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
126562306a36Sopenharmony_ci		     %rax, %r9, 4);
126662306a36Sopenharmony_ci	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
126762306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
126862306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
126962306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
127062306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 5);
127162306a36Sopenharmony_ci	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
127262306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
127362306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
127462306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
127562306a36Sopenharmony_ci		     %rax, %r9, 6);
127662306a36Sopenharmony_ci	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
127762306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
127862306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
127962306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
128062306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 7);
128162306a36Sopenharmony_ci	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
128262306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
128362306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
128462306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
128562306a36Sopenharmony_ci		     %rax, %r9, 8);
128662306a36Sopenharmony_ci	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
128762306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
128862306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
128962306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
129062306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 9);
129162306a36Sopenharmony_ci	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
129262306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
129362306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
129462306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
129562306a36Sopenharmony_ci		     %rax, %r9, 10);
	/*
	 * rounds == 12: finish with aria_ff_gfni using indices 11 and 12.
	 * Any other value continues at .Laria_gfni_192.
	 */
129662306a36Sopenharmony_ci	cmpl $12, ARIA_CTX_rounds(CTX);
129762306a36Sopenharmony_ci	jne .Laria_gfni_192;
129862306a36Sopenharmony_ci	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
129962306a36Sopenharmony_ci		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
130062306a36Sopenharmony_ci		%ymm15, %rax, %r9, 11, 12);
130162306a36Sopenharmony_ci	jmp .Laria_gfni_end;
	/*
	 * Reached when rounds != 12: two more invocations (indices 11 and 12),
	 * then a second test on the round count. NOTE(review): the label
	 * names suggest the 192-bit and 256-bit key paths - confirm against
	 * the key-setup code.
	 */
130262306a36Sopenharmony_ci.Laria_gfni_192:
130362306a36Sopenharmony_ci	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
130462306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
130562306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
130662306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
130762306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 11);
130862306a36Sopenharmony_ci	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
130962306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
131062306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
131162306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
131262306a36Sopenharmony_ci		     %rax, %r9, 12);
	/*
	 * rounds == 14: finish with aria_ff_gfni using indices 13 and 14.
	 * Any other value continues at .Laria_gfni_256.
	 */
131362306a36Sopenharmony_ci	cmpl $14, ARIA_CTX_rounds(CTX);
131462306a36Sopenharmony_ci	jne .Laria_gfni_256;
131562306a36Sopenharmony_ci	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
131662306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
131762306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
131862306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
131962306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 13, 14);
132062306a36Sopenharmony_ci	jmp .Laria_gfni_end;
	/*
	 * Longest path: indices 13 and 14, then aria_ff_gfni with 15 and 16.
	 * No further check of the round count is made here.
	 */
132162306a36Sopenharmony_ci.Laria_gfni_256:
132262306a36Sopenharmony_ci	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
132362306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
132462306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
132562306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
132662306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 13);
132762306a36Sopenharmony_ci	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
132862306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14, %ymm15,
132962306a36Sopenharmony_ci		     %ymm0, %ymm1, %ymm2, %ymm3,
133062306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
133162306a36Sopenharmony_ci		     %rax, %r9, 14);
133262306a36Sopenharmony_ci	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
133362306a36Sopenharmony_ci		     %ymm4, %ymm5, %ymm6, %ymm7,
133462306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11,
133562306a36Sopenharmony_ci		     %ymm12, %ymm13, %ymm14,
133662306a36Sopenharmony_ci		     %ymm15, %rax, %r9, 15, 16);
	/*
	 * Common exit for all three paths. NOTE(review): debyteslice_16x16b is
	 * defined outside this section; by its name it presumably undoes the
	 * byte-sliced register layout, using (%rax) and (%r8) as memory
	 * operands - confirm.
	 */
133762306a36Sopenharmony_ci.Laria_gfni_end:
133862306a36Sopenharmony_ci	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
133962306a36Sopenharmony_ci			   %ymm9, %ymm13, %ymm0, %ymm5,
134062306a36Sopenharmony_ci			   %ymm10, %ymm14, %ymm3, %ymm6,
134162306a36Sopenharmony_ci			   %ymm11, %ymm15, %ymm2, %ymm7,
134262306a36Sopenharmony_ci			   (%rax), (%r8));
134362306a36Sopenharmony_ci
134462306a36Sopenharmony_ci	FRAME_END
134562306a36Sopenharmony_ci	RET;
134662306a36Sopenharmony_ciSYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
134762306a36Sopenharmony_ci
/*
 * aria_aesni_avx2_gfni_encrypt_32way - encryption entry point for the GFNI
 * cipher core.
 *
 * Selects the encryption round keys from ctx, passes src to inpack16_pre,
 * calls __aria_aesni_avx2_gfni_crypt_32way and hands the resulting registers
 * to write_output. Overwrites %r9 here, plus whatever the callee modifies.
 */
134862306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
134962306a36Sopenharmony_ci	/* input:
135062306a36Sopenharmony_ci	 *      %rdi: ctx, CTX
135162306a36Sopenharmony_ci	 *      %rsi: dst
135262306a36Sopenharmony_ci	 *      %rdx: src
135362306a36Sopenharmony_ci	 */
135462306a36Sopenharmony_ci
135562306a36Sopenharmony_ci	FRAME_BEGIN
135662306a36Sopenharmony_ci
	/* %r9 = address of the encryption round keys inside ctx. */
135762306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
135862306a36Sopenharmony_ci
	/*
	 * NOTE(review): inpack16_pre is defined outside this section; it is
	 * given all 16 ymm registers and %rdx (src), presumably to load the
	 * input into registers - confirm.
	 */
135962306a36Sopenharmony_ci	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
136062306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
136162306a36Sopenharmony_ci		     %ymm15, %rdx);
136262306a36Sopenharmony_ci
136362306a36Sopenharmony_ci	call __aria_aesni_avx2_gfni_crypt_32way;
136462306a36Sopenharmony_ci
	/*
	 * The callee starts by copying %rsi (dst) into %rax; that register is
	 * the write_output target here.
	 */
136562306a36Sopenharmony_ci	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
136662306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
136762306a36Sopenharmony_ci		     %ymm15, %rax);
136862306a36Sopenharmony_ci
136962306a36Sopenharmony_ci	FRAME_END
137062306a36Sopenharmony_ci	RET;
137162306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
137262306a36Sopenharmony_ci
/*
 * aria_aesni_avx2_gfni_decrypt_32way - decryption entry point for the GFNI
 * cipher core.
 *
 * Same instruction sequence as the encrypt entry point, except that %r9 is
 * loaded with the decryption round keys (ARIA_CTX_dec_key). Overwrites %r9
 * here, plus whatever the callee modifies.
 */
137362306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
137462306a36Sopenharmony_ci	/* input:
137562306a36Sopenharmony_ci	 *      %rdi: ctx, CTX
137662306a36Sopenharmony_ci	 *      %rsi: dst
137762306a36Sopenharmony_ci	 *      %rdx: src
137862306a36Sopenharmony_ci	 */
137962306a36Sopenharmony_ci
138062306a36Sopenharmony_ci	FRAME_BEGIN
138162306a36Sopenharmony_ci
	/* %r9 = address of the decryption round keys inside ctx. */
138262306a36Sopenharmony_ci	leaq ARIA_CTX_dec_key(CTX), %r9;
138362306a36Sopenharmony_ci
	/*
	 * NOTE(review): inpack16_pre is defined outside this section; it is
	 * given all 16 ymm registers and %rdx (src), presumably to load the
	 * input into registers - confirm.
	 */
138462306a36Sopenharmony_ci	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
138562306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
138662306a36Sopenharmony_ci		     %ymm15, %rdx);
138762306a36Sopenharmony_ci
138862306a36Sopenharmony_ci	call __aria_aesni_avx2_gfni_crypt_32way;
138962306a36Sopenharmony_ci
	/*
	 * The callee starts by copying %rsi (dst) into %rax; that register is
	 * the write_output target here.
	 */
139062306a36Sopenharmony_ci	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
139162306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
139262306a36Sopenharmony_ci		     %ymm15, %rax);
139362306a36Sopenharmony_ci
139462306a36Sopenharmony_ci	FRAME_END
139562306a36Sopenharmony_ci	RET;
139662306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
139762306a36Sopenharmony_ci
/*
 * aria_aesni_avx2_gfni_ctr_crypt_32way - CTR-mode wrapper around the GFNI
 * cipher core (__aria_aesni_avx2_gfni_crypt_32way).
 *
 * Flow, as visible in this function:
 *   1. call the counter generator (%rcx and %r8 are passed through untouched),
 *   2. run the GFNI cipher core with the encryption round keys,
 *   3. XOR the 16 result registers with 16 * 32 = 512 bytes read from src,
 *   4. hand the registers to write_output with dst as the target.
 *
 * This function itself overwrites %rsi, %rdx, %r9, %r10 and %r11, in addition
 * to whatever the two callees modify.
 */
139862306a36Sopenharmony_ciSYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
139962306a36Sopenharmony_ci	/* input:
140062306a36Sopenharmony_ci	 *      %rdi: ctx
140162306a36Sopenharmony_ci	 *      %rsi: dst
140262306a36Sopenharmony_ci	 *      %rdx: src
140362306a36Sopenharmony_ci	 *      %rcx: keystream
140462306a36Sopenharmony_ci	 *      %r8: iv (big endian, 128bit)
140562306a36Sopenharmony_ci	 */
140662306a36Sopenharmony_ci	FRAME_BEGIN
140762306a36Sopenharmony_ci
	/*
	 * NOTE(review): judging by the end of the generator routine, it stores
	 * the advanced counter back through %r8, uses (%rcx) as temporary
	 * storage and returns counter values in %ymm0..%ymm15 - confirm
	 * against its full body.
	 */
140862306a36Sopenharmony_ci	call __aria_aesni_avx2_ctr_gen_keystream_32way
140962306a36Sopenharmony_ci
	/*
	 * Keep the caller's dst and src in %r10 and %r11, then point both
	 * %rsi and %rdx at the keystream buffer for the cipher core.
	 * %r9 receives the address of the encryption round keys inside ctx.
	 */
141062306a36Sopenharmony_ci	leaq (%rsi), %r10;
141162306a36Sopenharmony_ci	leaq (%rdx), %r11;
141262306a36Sopenharmony_ci	leaq (%rcx), %rsi;
141362306a36Sopenharmony_ci	leaq (%rcx), %rdx;
141462306a36Sopenharmony_ci	leaq ARIA_CTX_enc_key(CTX), %r9;
141562306a36Sopenharmony_ci
	/* The core reads the round count through CTX (%rdi), unchanged here. */
141662306a36Sopenharmony_ci	call __aria_aesni_avx2_gfni_crypt_32way;
141762306a36Sopenharmony_ci
	/*
	 * XOR the registers left by the cipher core with src (%r11), 32 bytes
	 * per register. The register order used here (ymm1, ymm0, ymm3, ymm2,
	 * ymm4..ymm15) is the same order passed to write_output below.
	 */
141862306a36Sopenharmony_ci	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
141962306a36Sopenharmony_ci	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
142062306a36Sopenharmony_ci	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
142162306a36Sopenharmony_ci	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
142262306a36Sopenharmony_ci	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
142362306a36Sopenharmony_ci	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
142462306a36Sopenharmony_ci	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
142562306a36Sopenharmony_ci	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
142662306a36Sopenharmony_ci	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
142762306a36Sopenharmony_ci	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
142862306a36Sopenharmony_ci	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
142962306a36Sopenharmony_ci	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
143062306a36Sopenharmony_ci	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
143162306a36Sopenharmony_ci	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
143262306a36Sopenharmony_ci	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
143362306a36Sopenharmony_ci	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/*
	 * %r10 is the dst pointer saved above. NOTE(review): write_output is
	 * defined outside this section; presumably it stores the 16 registers
	 * to consecutive 32-byte slots at that address - confirm.
	 */
143462306a36Sopenharmony_ci	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
143562306a36Sopenharmony_ci		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
143662306a36Sopenharmony_ci		     %ymm15, %r10);
143762306a36Sopenharmony_ci
143862306a36Sopenharmony_ci	FRAME_END
143962306a36Sopenharmony_ci	RET;
144062306a36Sopenharmony_ciSYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
144162306a36Sopenharmony_ci#endif /* CONFIG_AS_GFNI */
1442