/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

/*
 * 16-byte constant listing the byte indices 15..0 in descending order.
 * NOTE(review): presumably used as a vpshufb control mask to byte-reverse a
 * 128-bit value; its users are outside the part of the file reviewed here.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

/* Base address the round-key words are read from (see get_key). */
#define CTX %rdi

/*
 * RNOT: XORed into a value to complement it; the cipher routines fill it
 *       with all-ones (vpcmpeqd RNOT, RNOT, RNOT) before using the S-boxes.
 * tp:   scratch register used inside the S-box macros.
 */
#define RNOT %ymm0
#define tp  %ymm1

/*
 * Working registers.  There are two independent register sets (suffix 1 and
 * suffix 2), each made of five registers A..E.  The S/SP/K2/LK2/KL2 macros
 * take the base names (RA..RE) and paste the suffix on themselves.
 */
#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

/*
 * Round-key registers: get_key broadcasts one 32-bit key word into each.
 * RK0..RK2 are also passed as temporaries to read_blocks/write_blocks.
 */
#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

/* xmm (low 128-bit) names of the same four registers. */
#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

/*
 * S0_1 / S0_2: the two halves of Serpent S-box 0, expressed as straight-line
 * vpand/vpor/vpxor logic on whole registers.  Run S0_1 then S0_2 on the same
 * arguments (see the S and SP macros).
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written before it is read; its incoming value is ignored
 *   tp:     scratch, used only inside S0_1
 *   RNOT:   must hold all-ones (used to complement a value)
 *
 * In __serpent_enc_blk16 the step that follows S(S0, ...) takes its four data
 * words from (x2, x1, x3, x0) and uses x4 as scratch.
 */
#define S0_1(x0, x1, x2, x3, x4)      \
	vpor		x0,   x3, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   x3, x4; \
	vpxor		RNOT, x4, x4; \
	vpxor		x1,   tp, x3; \
	vpand		x0,   x1, x1; \
	vpxor		x4,   x1, x1; \
	vpxor		x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
	vpxor		x3,   x0, x0; \
	vpor		x0,   x4, x4; \
	vpxor		x2,   x0, x0; \
	vpand		x1,   x2, x2; \
	vpxor		x2,   x3, x3; \
	vpxor		RNOT, x1, x1; \
	vpxor		x4,   x2, x2; \
	vpxor		x2,   x1, x1;

/*
 * S1_1 / S1_2: the two halves of Serpent S-box 1 as straight-line
 * vpand/vpor/vpxor logic.  Run S1_1 then S1_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by S1_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside S1_1
 *   RNOT:   must hold all-ones (used to complement a value)
 *
 * In __serpent_enc_blk16 the step that follows S(S1, ...) takes its four data
 * words from (x4, x2, x3, x0) and uses x1 as scratch.
 */
#define S1_1(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x1, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		RNOT, x3, x3; \
	vpand		tp,   x1, x4; \
	vpor		tp,   x0, x0; \
	vpxor		x2,   x3, x3; \
	vpxor		x3,   x0, x0; \
	vpxor		x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
	vpxor		x4,   x3, x3; \
	vpor		x4,   x1, x1; \
	vpxor		x2,   x4, x4; \
	vpand		x0,   x2, x2; \
	vpxor		x1,   x2, x2; \
	vpor		x0,   x1, x1; \
	vpxor		RNOT, x0, x0; \
	vpxor		x2,   x0, x0; \
	vpxor		x1,   x4, x4;

/*
 * S2_1 / S2_2: the two halves of Serpent S-box 2 as straight-line
 * vpand/vpor/vpxor logic.  Run S2_1 then S2_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     only written (by S2_2); its incoming value is ignored
 *   tp:     written by S2_1 and still read by S2_2, so nothing may clobber
 *           it between the two halves
 *   RNOT:   must hold all-ones (used to complement a value)
 *
 * In __serpent_enc_blk16 the step that follows S(S2, ...) takes its four data
 * words from (x4, x1, x0, x3) and uses x2 as scratch.
 */
#define S2_1(x0, x1, x2, x3, x4)      \
	vpxor		RNOT, x3, x3; \
	vpxor		x0,   x1, x1; \
	vpand		x2,   x0, tp; \
	vpxor		x3,   tp, tp; \
	vpor		x0,   x3, x3; \
	vpxor		x1,   x2, x2; \
	vpxor		x1,   x3, x3; \
	vpand		tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
	vpxor		x2,   tp, tp; \
	vpand		x3,   x2, x2; \
	vpor		x1,   x3, x3; \
	vpxor		RNOT, tp, tp; \
	vpxor		tp,   x3, x3; \
	vpxor		tp,   x0, x4; \
	vpxor		x2,   tp, x0; \
	vpor		x2,   x1, x1;

/*
 * S3_1 / S3_2: the two halves of Serpent S-box 3 as straight-line
 * vpand/vpor/vpxor logic.  Run S3_1 then S3_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by S3_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside S3_1
 *
 * This S-box does not use RNOT.
 *
 * In __serpent_enc_blk16 the step that follows S(S3, ...) takes its four data
 * words from (x3, x4, x1, x0) and uses x2 as scratch.
 */
#define S3_1(x0, x1, x2, x3, x4)      \
	vpxor		x3,   x1, tp; \
	vpor		x0,   x3, x3; \
	vpand		x0,   x1, x4; \
	vpxor		x2,   x0, x0; \
	vpxor		tp,   x2, x2; \
	vpand		x3,   tp, x1; \
	vpxor		x3,   x2, x2; \
	vpor		x4,   x0, x0; \
	vpxor		x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x1, x1; \
	vpand		x3,   x0, x0; \
	vpand		x4,   x3, x3; \
	vpxor		x2,   x3, x3; \
	vpor		x1,   x4, x4; \
	vpand		x1,   x2, x2; \
	vpxor		x3,   x4, x4; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   x3, x3;

/*
 * S4_1 / S4_2: the two halves of Serpent S-box 4 as straight-line
 * vpand/vpor/vpxor logic.  Run S4_1 then S4_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by S4_1 before it is read; incoming value is ignored
 *   tp:     written by S4_1 and still read by S4_2, so nothing may clobber
 *           it between the two halves
 *   RNOT:   must hold all-ones (used to complement a value)
 *
 * In __serpent_enc_blk16 the step that follows S(S4, ...) takes its four data
 * words from (x1, x2, x3, x4) and uses x0 as scratch.
 */
#define S4_1(x0, x1, x2, x3, x4)      \
	vpand		x0,   x3, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   tp, tp; \
	vpor		x3,   x2, x2; \
	vpxor		x1,   x0, x0; \
	vpxor		tp,   x3, x4; \
	vpor		x0,   x2, x2; \
	vpxor		x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
	vpand		x0,   x1, x1; \
	vpxor		x4,   x1, x1; \
	vpand		x2,   x4, x4; \
	vpxor		tp,   x2, x2; \
	vpxor		x0,   x4, x4; \
	vpor		x1,   tp, x3; \
	vpxor		RNOT, x1, x1; \
	vpxor		x0,   x3, x3;

/*
 * S5_1 / S5_2: the two halves of Serpent S-box 5 as straight-line
 * vpand/vpor/vpxor logic.  Run S5_1 then S5_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by S5_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside S5_1
 *   RNOT:   must hold all-ones (used to complement a value)
 *
 * In __serpent_enc_blk16 the step that follows S(S5, ...) takes its four data
 * words from (x4, x0, x1, x3) and uses x2 as scratch.
 */
#define S5_1(x0, x1, x2, x3, x4)      \
	vpor		x0,   x1, tp; \
	vpxor		tp,   x2, x2; \
	vpxor		RNOT, x3, x3; \
	vpxor		x0,   x1, x4; \
	vpxor		x2,   x0, x0; \
	vpand		x4,   tp, x1; \
	vpor		x3,   x4, x4; \
	vpxor		x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
	vpand		x3,   x0, x0; \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x3, x3; \
	vpxor		x1,   x0, x0; \
	vpand		x4,   x2, x2; \
	vpxor		x2,   x1, x1; \
	vpand		x0,   x2, x2; \
	vpxor		x2,   x3, x3;

/*
 * S6_1 / S6_2: the two halves of Serpent S-box 6 as straight-line
 * vpand/vpor/vpxor logic.  Run S6_1 then S6_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by S6_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside S6_1
 *   RNOT:   must hold all-ones (used to complement a value)
 *
 * In __serpent_enc_blk16 the step that follows S(S6, ...) takes its four data
 * words from (x2, x4, x1, x3) and uses x0 as scratch.
 */
#define S6_1(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x3, x3; \
	vpxor		x2,   x1, tp; \
	vpxor		x0,   x2, x2; \
	vpand		x3,   x0, x0; \
	vpor		x3,   tp, tp; \
	vpxor		RNOT, x1, x4; \
	vpxor		tp,   x0, x0; \
	vpxor		x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
	vpxor		x4,   x3, x3; \
	vpxor		x0,   x4, x4; \
	vpand		x0,   x2, x2; \
	vpxor		x1,   x4, x4; \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x3, x3; \
	vpxor		x0,   x3, x3; \
	vpxor		x2,   x1, x1;

/*
 * S7_1 / S7_2: the two halves of Serpent S-box 7 as straight-line
 * vpand/vpor/vpxor logic.  Run S7_1 then S7_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by S7_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside S7_1
 *   RNOT:   must hold all-ones (used to complement a value)
 *
 * In __serpent_enc_blk16 the step that follows S(S7, ...) takes its four data
 * words from (x4, x2, x3, x0) and uses x1 as scratch.
 */
#define S7_1(x0, x1, x2, x3, x4)      \
	vpxor		RNOT, x1, tp; \
	vpxor		RNOT, x0, x0; \
	vpand		x2,   tp, x1; \
	vpxor		x3,   x1, x1; \
	vpor		tp,   x3, x3; \
	vpxor		x2,   tp, x4; \
	vpxor		x3,   x2, x2; \
	vpxor		x0,   x3, x3; \
	vpor		x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
	vpand		x0,   x2, x2; \
	vpxor		x4,   x0, x0; \
	vpxor		x3,   x4, x4; \
	vpand		x0,   x3, x3; \
	vpxor		x1,   x4, x4; \
	vpxor		x4,   x2, x2; \
	vpxor		x1,   x3, x3; \
	vpor		x0,   x4, x4; \
	vpxor		x1,   x4, x4;

/*
 * SI0_1 / SI0_2: the two halves of inverse S-box 0 as straight-line
 * vpand/vpor/vpxor logic.  Run SI0_1 then SI0_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by SI0_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside SI0_1
 *   RNOT:   must hold all-ones (used to complement a value)
 */
#define SI0_1(x0, x1, x2, x3, x4)     \
	vpxor		x0,   x1, x1; \
	vpor		x1,   x3, tp; \
	vpxor		x1,   x3, x4; \
	vpxor		RNOT, x0, x0; \
	vpxor		tp,   x2, x2; \
	vpxor		x0,   tp, x3; \
	vpand		x1,   x0, x0; \
	vpxor		x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
	vpand		x3,   x2, x2; \
	vpxor		x4,   x3, x3; \
	vpxor		x3,   x2, x2; \
	vpxor		x3,   x1, x1; \
	vpand		x0,   x3, x3; \
	vpxor		x0,   x1, x1; \
	vpxor		x2,   x0, x0; \
	vpxor		x3,   x4, x4;

/*
 * SI1_1 / SI1_2: the two halves of inverse S-box 1 as straight-line
 * vpand/vpor/vpxor logic.  Run SI1_1 then SI1_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by SI1_1 before it is read; incoming value is ignored
 *   tp:     written by SI1_1 and still read by SI1_2, so nothing may clobber
 *           it between the two halves
 *   RNOT:   must hold all-ones (used to complement a value)
 */
#define SI1_1(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x0, tp; \
	vpxor		RNOT, x2, x2; \
	vpor		x1,   x0, x4; \
	vpxor		x3,   x4, x4; \
	vpand		x1,   x3, x3; \
	vpxor		x2,   x1, x1; \
	vpand		x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x4, x4; \
	vpor		x3,   x1, x1; \
	vpxor		tp,   x3, x3; \
	vpxor		tp,   x2, x2; \
	vpor		x4,   tp, x0; \
	vpxor		x4,   x2, x2; \
	vpxor		x0,   x1, x1; \
	vpxor		x1,   x4, x4;

/*
 * SI2_1 / SI2_2: the two halves of inverse S-box 2 as straight-line
 * vpand/vpor/vpxor logic.  Run SI2_1 then SI2_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by SI2_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside SI2_1
 *   RNOT:   must hold all-ones (used to complement a value)
 */
#define SI2_1(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x2, x2; \
	vpxor		RNOT, x3, tp; \
	vpor		x2,   tp, tp; \
	vpxor		x3,   x2, x2; \
	vpxor		x0,   x3, x4; \
	vpxor		x1,   tp, x3; \
	vpor		x2,   x1, x1; \
	vpxor		x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
	vpxor		x4,   x1, x1; \
	vpor		x3,   x4, x4; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   x4, x4; \
	vpand		x1,   x2, x2; \
	vpxor		x3,   x2, x2; \
	vpxor		x4,   x3, x3; \
	vpxor		x0,   x4, x4;

/*
 * SI3_1 / SI3_2: the two halves of inverse S-box 3 as straight-line
 * vpand/vpor/vpxor logic.  Run SI3_1 then SI3_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by SI3_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside SI3_1
 *
 * This S-box does not use RNOT.
 */
#define SI3_1(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x2, x2; \
	vpand		x2,   x1, tp; \
	vpxor		x0,   tp, tp; \
	vpor		x1,   x0, x0; \
	vpxor		x3,   x1, x4; \
	vpxor		x3,   x0, x0; \
	vpor		tp,   x3, x3; \
	vpxor		x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x0, x0; \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x3, x3; \
	vpxor		x0,   x1, x1; \
	vpand		x2,   x0, x0; \
	vpxor		x3,   x4, x4; \
	vpxor		x0,   x3, x3; \
	vpxor		x1,   x0, x0;

/*
 * SI4_1 / SI4_2: the two halves of inverse S-box 4 as straight-line
 * vpand/vpor/vpxor logic.  Run SI4_1 then SI4_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by SI4_1 before it is read; incoming value is ignored
 *   tp:     scratch, used only inside SI4_1
 *   RNOT:   must hold all-ones (used to complement a value)
 */
#define SI4_1(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x0, tp; \
	vpxor		x2,   tp, tp; \
	vpor		x3,   x2, x2; \
	vpxor		RNOT, x0, x4; \
	vpxor		tp,   x1, x1; \
	vpxor		x2,   tp, x0; \
	vpand		x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
	vpxor		x0,   x2, x2; \
	vpor		x4,   x0, x0; \
	vpxor		x3,   x0, x0; \
	vpand		x2,   x3, x3; \
	vpxor		x3,   x4, x4; \
	vpxor		x1,   x3, x3; \
	vpand		x0,   x1, x1; \
	vpxor		x1,   x4, x4; \
	vpxor		x3,   x0, x0;

/*
 * SI5_1 / SI5_2: the two halves of inverse S-box 5 as straight-line
 * vpand/vpor/vpxor logic.  Run SI5_1 then SI5_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     first written by SI5_2 before it is read; incoming value is
 *           ignored
 *   tp:     written by SI5_1 and still read by SI5_2, so nothing may clobber
 *           it between the two halves
 *   RNOT:   must hold all-ones (used to complement a value)
 */
#define SI5_1(x0, x1, x2, x3, x4)     \
	vpor		x2,   x1, tp; \
	vpxor		x1,   x2, x2; \
	vpxor		x3,   tp, tp; \
	vpand		x1,   x3, x3; \
	vpxor		x3,   x2, x2; \
	vpor		x0,   x3, x3; \
	vpxor		RNOT, x0, x0; \
	vpxor		x2,   x3, x3; \
	vpor		x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
	vpxor		tp,   x1, x4; \
	vpxor		x4,   x2, x2; \
	vpand		x0,   x4, x4; \
	vpxor		tp,   x0, x0; \
	vpxor		x3,   tp, x1; \
	vpand		x2,   x0, x0; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   x0, x0; \
	vpxor		x4,   x2, x2; \
	vpxor		x3,   x4, x4;

/*
 * SI6_1 / SI6_2: the two halves of inverse S-box 6 as straight-line
 * vpand/vpor/vpxor logic.  Run SI6_1 then SI6_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     first written by SI6_2 before it is read; incoming value is
 *           ignored
 *   tp:     written by SI6_1 and still read by SI6_2, so nothing may clobber
 *           it between the two halves
 *   RNOT:   must hold all-ones (used to complement a value)
 */
#define SI6_1(x0, x1, x2, x3, x4)     \
	vpxor		x2,   x0, x0; \
	vpand		x3,   x0, tp; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   tp, tp; \
	vpxor		x1,   x3, x3; \
	vpor		x0,   x2, x2; \
	vpxor		x3,   x2, x2; \
	vpand		tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
	vpxor		RNOT, tp, tp; \
	vpxor		x1,   x3, x3; \
	vpand		x2,   x1, x1; \
	vpxor		tp,   x0, x4; \
	vpxor		x4,   x3, x3; \
	vpxor		x2,   x4, x4; \
	vpxor		x1,   tp, x0; \
	vpxor		x0,   x2, x2;

/*
 * SI7_1 / SI7_2: the two halves of inverse S-box 7 as straight-line
 * vpand/vpor/vpxor logic.  Run SI7_1 then SI7_2 on the same arguments.
 *
 * AT&T operand order: "op b, a, dst" computes dst = a op b.
 *
 *   x0..x3: input words, overwritten
 *   x4:     written by SI7_1 before it is read; incoming value is ignored
 *   tp:     written by SI7_1 and still read by SI7_2, so nothing may clobber
 *           it between the two halves
 *   RNOT:   must hold all-ones (used to complement a value)
 */
#define SI7_1(x0, x1, x2, x3, x4)     \
	vpand		x0,   x3, tp; \
	vpxor		x2,   x0, x0; \
	vpor		x3,   x2, x2; \
	vpxor		x1,   x3, x4; \
	vpxor		RNOT, x0, x0; \
	vpor		tp,   x1, x1; \
	vpxor		x0,   x4, x4; \
	vpand		x2,   x0, x0; \
	vpxor		x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
	vpand		x2,   x1, x1; \
	vpxor		x2,   tp, x3; \
	vpxor		x3,   x4, x4; \
	vpand		x3,   x2, x2; \
	vpor		x0,   x3, x3; \
	vpxor		x4,   x1, x1; \
	vpxor		x4,   x3, x3; \
	vpand		x0,   x4, x4; \
	vpxor		x2,   x4, x4;

/*
 * get_key(i, j, t): broadcast the 32-bit word at byte offset (4*i + j)*4
 * from CTX into every dword lane of t.
 * NOTE(review): this addressing implies four consecutive 32-bit key words per
 * index i, with j selecting the word -- confirm against the key-schedule
 * layout of the context structure.
 */
#define get_key(i,j,t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

/*
 * K2(x0, x1, x2, x3, x4, i): key mixing only, for both register sets.
 *
 * Loads key words 0..3 of index i into RK0..RK3 and XORs them into
 * x0..x3 of set 1, then of set 2.  x0..x3 are register base names; the
 * "## 1" / "## 2" pasting selects the set.  x4 is accepted for symmetry with
 * LK2/KL2 and is not used.
 *
 * Clobbers: RK0..RK3.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0,	x0 ## 1, x0 ## 1; \
	vpxor RK1,	x1 ## 1, x1 ## 1; \
	vpxor RK2,	x2 ## 1, x2 ## 1; \
	vpxor RK3,	x3 ## 1, x3 ## 1; \
		vpxor RK0,	x0 ## 2, x0 ## 2; \
		vpxor RK1,	x1 ## 2, x1 ## 2; \
		vpxor RK2,	x2 ## 2, x2 ## 2; \
		vpxor RK3,	x3 ## 2, x3 ## 2;

/*
 * LK2(x0, x1, x2, x3, x4, i): linear transformation followed by key mixing
 * with key index i, applied to both register sets.  Instructions for set 1
 * and set 2 (extra indent) are interleaved.
 *
 * There is no vector rotate here; each rotate-left by n is built from
 * vpslld $n / vpsrld $(32 - n) / vpor, using x4 as the temporary.
 *
 * Per 32-bit lane, in order:
 *	x0  = rol(x0, 13)
 *	x1 ^= x0
 *	x2  = rol(x2, 3)
 *	x1 ^= x2
 *	x1  = rol(x1, 1)
 *	x3 ^= x2 ^ (x0 << 3)
 *	x3  = rol(x3, 7)
 *	x0 ^= x1 ^ x3
 *	x2 ^= x3 ^ (x1 << 7)
 *	x1 ^= RK1;  x3 ^= RK3
 *	x0  = rol(x0, 5)
 *	x2  = rol(x2, 22)
 *	x0 ^= RK0;  x2 ^= RK2
 *
 * The four get_key loads for index i are spread through the sequence; each
 * one is issued before the first XOR that reads its RK register.
 *
 * Clobbers: x4 (both sets), RK0..RK3.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
		vpslld $13,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1,		x1 ## 1, x4 ## 1;          \
	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
		vpslld $1,		x1 ## 2, x4 ## 2;          \
		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
		get_key(i, 3, RK3); \
	vpslld $7,		x3 ## 1, x4 ## 1;          \
	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
		vpslld $7,		x3 ## 2, x4 ## 2;          \
		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
		get_key(i, 2, RK2); \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpslld $5,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpslld $5,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $22,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;

/*
 * KL2(x0, x1, x2, x3, x4, i): key mixing followed by the inverse linear
 * transformation, applied to both register sets.  Instructions for set 1 and
 * set 2 (extra indent) are interleaved.
 *
 * This macro does NOT load the key: it contains no get_key and never
 * references i.  RK0..RK3 must already hold the four key words to mix in
 * (the SP macro loads them while running an S-box).
 *
 * Each rotate-right by n is built from vpsrld $n / vpslld $(32 - n) / vpor,
 * using x4 as the temporary.
 *
 * Per 32-bit lane, in order:
 *	x0 ^= RK0;  x2 ^= RK2
 *	x0  = ror(x0, 5)
 *	x3 ^= RK3;  x1 ^= RK1
 *	x2  = ror(x2, 22)
 *	x2 ^= x3 ^ (x1 << 7)
 *	x0 ^= x3 ^ x1
 *	x1  = ror(x1, 1)
 *	x3  = ror(x3, 7)
 *	x1 ^= x0
 *	x3 ^= (x0 << 3)
 *	x0  = ror(x0, 13)
 *	x1 ^= x2;  x3 ^= x2
 *	x2  = ror(x2, 3)
 *
 * Clobbers: x4 (both sets).
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpsrld $5,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpsrld $22,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;     \
		vpsrld $5,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpsrld $22,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1,		x1 ## 1, x4 ## 1;          \
	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpsrld $1,		x1 ## 2, x4 ## 2;          \
		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7,		x3 ## 1, x4 ## 1;          \
	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
		vpsrld $7,		x3 ## 2, x4 ## 2;          \
		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
		vpsrld $13,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpsrld $3,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2;

/*
 * S(SBOX, x0, x1, x2, x3, x4): apply one S-box to both register sets.
 *
 * SBOX is a macro-name prefix (S0..S7, SI0..SI7); x0..x4 are register base
 * names (RA..RE) to which "1" or "2" is pasted.  For each set the _1 half is
 * immediately followed by its _2 half, so the shared tp scratch register
 * carries over between the halves and is free again before the next set.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): same as S(), but additionally loads the
 * four key words of index i into RK0..RK3, one get_key in front of each
 * S-box half.  On exit RK0..RK3 hold key i, which is what KL2 expects.
 *
 * Clobbers: RK0..RK3 (plus whatever the S-box halves clobber).
 *
 * The last body line deliberately has no trailing backslash: a dangling
 * continuation would splice whatever follows the macro into its body.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose a 4x4 matrix of
 * 32-bit words whose rows are x0..x3, in place.
 *
 * The unpack instructions work independently on each 128-bit half of a ymm
 * register, so each half is transposed as its own 4x4 matrix.
 *
 * With rows x0 = a0..a3, x1 = b0..b3, x2 = c0..c3, x3 = d0..d3 the dword
 * unpacks produce the pairs (a0 b0 a1 b1), (a2 b2 a3 b3), (c0 d0 c1 d1),
 * (c2 d2 c3 d3); the qword unpacks then combine them so that
 * x0 = a0 b0 c0 d0, x1 = a1 b1 c1 d1, x2 = a2 b2 c2 d2, x3 = a3 b3 c3 d3.
 *
 * Clobbers: t0, t1, t2.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/*
 * read_blocks: input-side alias for transpose_4x4; no memory access happens
 * here.  Clobbers t0..t2.
 * NOTE(review): assuming the caller put one 16-byte block in each 128-bit
 * half of x0..x3, this leaves word j of every block in the j-th register --
 * confirm against the callers.
 */
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks: output-side alias for transpose_4x4; no memory access happens
 * here.  It applies the same transpose as read_blocks, which undoes the
 * earlier one.  Clobbers t0..t2.
 */
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 *
	 * Works entirely in registers; the only memory reads are the key
	 * words fetched relative to CTX by get_key.
	 *
	 * clobbers:
	 *	RNOT, tp, RE1, RE2, RK0..RK3
	 */

	/* RNOT = all-ones; the S-box macros XOR with it to complement. */
	vpcmpeqd RNOT, RNOT, RNOT;

	/* Transpose each register set (RK0..RK2 serve as temporaries). */
	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Key 0 is mixed in first.  Then come 32 S-box applications, cycling
	 * S0..S7 four times.  The first 31 are each followed by LK2 (linear
	 * transformation + next key); the last one is followed by a plain key
	 * mix with key 32.
	 *
	 * The register names rotate from line to line because each S-box
	 * leaves its result in a permuted set of its argument registers; the
	 * argument order of every LK2/S pair encodes that permutation, so the
	 * lines must not be reordered or "normalised".
	 */
						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	/* Transpose back; the result is in RA..RD of both sets. */
	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk16)

60662306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(__serpent_dec_blk16)
60762306a36Sopenharmony_ci	/* input:
60862306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
60962306a36Sopenharmony_ci	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
61062306a36Sopenharmony_ci	 * output:
61162306a36Sopenharmony_ci	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 *
	 * File-local decryption core; it is called by serpent_ecb_dec_16way
	 * and serpent_cbc_dec_16way below.
	 *
	 * Note that the result does NOT come back in the registers the input
	 * arrived in: callers must store from the RC, RD, RB, RE groups.
	 *
	 * RNOT is overwritten here.  RK0-RK2 are handed to read_blocks and
	 * write_blocks, presumably as scratch registers.
	 * NOTE(review): read_blocks, write_blocks, S, SP, K2 and KL2 are
	 * defined outside this excerpt; confirm their exact clobbers there.
61262306a36Sopenharmony_ci	 */
61362306a36Sopenharmony_ci
	/*
	 * RNOT = all-ones: comparing a register with itself for equality
	 * sets every bit.  Presumably used by the S-box macros as a NOT
	 * mask (TODO confirm against the macro definitions).
	 */
61462306a36Sopenharmony_ci	vpcmpeqd RNOT, RNOT, RNOT;
61562306a36Sopenharmony_ci
	/* Applied once per register group (the ...1 group, then the ...2 group). */
61662306a36Sopenharmony_ci	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
61762306a36Sopenharmony_ci	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
61862306a36Sopenharmony_ci
	/*
	 * Round ladder, walking the key index from 32 down to 0.
	 *
	 * The macros receive the un-suffixed names RA..RE; presumably they
	 * expand each name to both the ...1 and ...2 register groups
	 * (TODO confirm against the macro definitions).
	 *
	 * Each row pairs an S-box macro (left column, SI7 down to SI0,
	 * cycling four times; presumably the inverse S-boxes) with a KL2
	 * step (right column) carrying the same key index.  The register
	 * order given to each S/SP is exactly the order given to the KL2 on
	 * the row above, so the argument permutations track which register
	 * holds which word after the previous step.  Rows and arguments
	 * must therefore not be reordered.
	 *
	 * The last row differs: it uses S instead of SP and K2 instead of
	 * KL2, with key index 0.
	 */
61962306a36Sopenharmony_ci						 K2(RA, RB, RC, RD, RE, 32);
62062306a36Sopenharmony_ci	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
62162306a36Sopenharmony_ci	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
62262306a36Sopenharmony_ci	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
62362306a36Sopenharmony_ci	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
62462306a36Sopenharmony_ci	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
62562306a36Sopenharmony_ci	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
62662306a36Sopenharmony_ci	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
62762306a36Sopenharmony_ci	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
62862306a36Sopenharmony_ci	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
62962306a36Sopenharmony_ci	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
63062306a36Sopenharmony_ci	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
63162306a36Sopenharmony_ci	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
63262306a36Sopenharmony_ci	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
63362306a36Sopenharmony_ci	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
63462306a36Sopenharmony_ci	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
63562306a36Sopenharmony_ci	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
63662306a36Sopenharmony_ci	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
63762306a36Sopenharmony_ci	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
63862306a36Sopenharmony_ci	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
63962306a36Sopenharmony_ci	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
64062306a36Sopenharmony_ci	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
64162306a36Sopenharmony_ci	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
64262306a36Sopenharmony_ci	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
64362306a36Sopenharmony_ci	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
64462306a36Sopenharmony_ci	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
64562306a36Sopenharmony_ci	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
64662306a36Sopenharmony_ci	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
64762306a36Sopenharmony_ci	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
64862306a36Sopenharmony_ci	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
64962306a36Sopenharmony_ci	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
65062306a36Sopenharmony_ci	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
65162306a36Sopenharmony_ci	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
65262306a36Sopenharmony_ci
	/*
	 * The word order here (RC, RD, RB, RE) follows the first four
	 * arguments of the final K2 above and is the output order documented
	 * in the header.
	 */
65362306a36Sopenharmony_ci	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
65462306a36Sopenharmony_ci	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci	RET;
65762306a36Sopenharmony_ciSYM_FUNC_END(__serpent_dec_blk16)
65862306a36Sopenharmony_ci
65962306a36Sopenharmony_ciSYM_FUNC_START(serpent_ecb_enc_16way)
66062306a36Sopenharmony_ci	/* input:
66162306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
66262306a36Sopenharmony_ci	 *	%rsi: dst
66362306a36Sopenharmony_ci	 *	%rdx: src
	 *
	 * Exported ECB encryption entry point.  It loads one batch from src
	 * into eight ymm registers, runs __serpent_enc_blk16 on them and
	 * stores the result to dst.  Eight 256-bit registers hold sixteen
	 * 128-bit blocks, matching the _16way name.
	 *
	 * NOTE(review): load_16way and store_16way are not defined in this
	 * excerpt; presumably they come from glue_helper-asm-avx2.S, which
	 * is included at the top of the file.  Confirm the exact memory
	 * layout they expect there.
66462306a36Sopenharmony_ci	 */
	/* Paired with FRAME_END below; this function makes a call. */
66562306a36Sopenharmony_ci	FRAME_BEGIN
66662306a36Sopenharmony_ci
	/*
	 * Zero the upper halves of all ymm registers before the AVX2 code
	 * runs (conventionally done to avoid AVX/SSE transition penalties).
	 */
66762306a36Sopenharmony_ci	vzeroupper;
66862306a36Sopenharmony_ci
66962306a36Sopenharmony_ci	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
67062306a36Sopenharmony_ci
	/* CTX (%rdi) is passed through to the helper untouched. */
67162306a36Sopenharmony_ci	call __serpent_enc_blk16;
67262306a36Sopenharmony_ci
	/*
	 * The encrypt helper returns ciphertext in the same RA, RB, RC, RD
	 * groups it received, so the store uses the load's register order.
	 */
67362306a36Sopenharmony_ci	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
67462306a36Sopenharmony_ci
	/* Clear the upper ymm halves again before returning to the caller. */
67562306a36Sopenharmony_ci	vzeroupper;
67662306a36Sopenharmony_ci
67762306a36Sopenharmony_ci	FRAME_END
67862306a36Sopenharmony_ci	RET;
67962306a36Sopenharmony_ciSYM_FUNC_END(serpent_ecb_enc_16way)
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_ciSYM_FUNC_START(serpent_ecb_dec_16way)
68262306a36Sopenharmony_ci	/* input:
68362306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
68462306a36Sopenharmony_ci	 *	%rsi: dst
68562306a36Sopenharmony_ci	 *	%rdx: src
	 *
	 * Exported ECB decryption entry point.  It loads one batch from src
	 * into eight ymm registers, runs __serpent_dec_blk16 on them and
	 * stores the result to dst.  Eight 256-bit registers hold sixteen
	 * 128-bit blocks, matching the _16way name.
	 *
	 * NOTE(review): load_16way and store_16way are not defined in this
	 * excerpt; presumably they come from glue_helper-asm-avx2.S, which
	 * is included at the top of the file.  Confirm the exact memory
	 * layout they expect there.
68662306a36Sopenharmony_ci	 */
	/* Paired with FRAME_END below; this function makes a call. */
68762306a36Sopenharmony_ci	FRAME_BEGIN
68862306a36Sopenharmony_ci
	/*
	 * Zero the upper halves of all ymm registers before the AVX2 code
	 * runs (conventionally done to avoid AVX/SSE transition penalties).
	 */
68962306a36Sopenharmony_ci	vzeroupper;
69062306a36Sopenharmony_ci
69162306a36Sopenharmony_ci	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
69262306a36Sopenharmony_ci
	/* CTX (%rdi) is passed through to the helper untouched. */
69362306a36Sopenharmony_ci	call __serpent_dec_blk16;
69462306a36Sopenharmony_ci
	/*
	 * The decrypt helper returns plaintext in the RC, RD, RB, RE groups,
	 * not in the registers that were loaded, hence the different
	 * register order compared with load_16way above.
	 */
69562306a36Sopenharmony_ci	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
69662306a36Sopenharmony_ci
	/* Clear the upper ymm halves again before returning to the caller. */
69762306a36Sopenharmony_ci	vzeroupper;
69862306a36Sopenharmony_ci
69962306a36Sopenharmony_ci	FRAME_END
70062306a36Sopenharmony_ci	RET;
70162306a36Sopenharmony_ciSYM_FUNC_END(serpent_ecb_dec_16way)
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_ciSYM_FUNC_START(serpent_cbc_dec_16way)
70462306a36Sopenharmony_ci	/* input:
70562306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
70662306a36Sopenharmony_ci	 *	%rsi: dst
70762306a36Sopenharmony_ci	 *	%rdx: src
	 *
	 * Exported CBC decryption entry point.  The load and block
	 * decryption steps are the same as in serpent_ecb_dec_16way; only
	 * the final store differs (store_cbc_16way instead of store_16way).
	 *
	 * NOTE(review): load_16way and store_cbc_16way are not defined in
	 * this excerpt; presumably they come from glue_helper-asm-avx2.S,
	 * which is included at the top of the file.  How the IV for the
	 * first block is applied is not visible here; verify against the
	 * macro and the C caller.
70862306a36Sopenharmony_ci	 */
	/* Paired with FRAME_END below; this function makes a call. */
70962306a36Sopenharmony_ci	FRAME_BEGIN
71062306a36Sopenharmony_ci
	/*
	 * Zero the upper halves of all ymm registers before the AVX2 code
	 * runs (conventionally done to avoid AVX/SSE transition penalties).
	 */
71162306a36Sopenharmony_ci	vzeroupper;
71262306a36Sopenharmony_ci
71362306a36Sopenharmony_ci	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
71462306a36Sopenharmony_ci
	/* CTX (%rdi) is passed through to the helper untouched. */
71562306a36Sopenharmony_ci	call __serpent_dec_blk16;
71662306a36Sopenharmony_ci
	/*
	 * Unlike store_16way, this macro also receives src (%rdx) and the
	 * extra register RK0.  Presumably it XORs each decrypted block with
	 * the preceding ciphertext block read from src, using RK0 as a
	 * temporary (TODO confirm against the macro definition).
	 * The register order RC, RD, RB, RE is the decrypt helper's output
	 * order.
	 */
71762306a36Sopenharmony_ci	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
71862306a36Sopenharmony_ci			RK0);
71962306a36Sopenharmony_ci
	/* Clear the upper ymm halves again before returning to the caller. */
72062306a36Sopenharmony_ci	vzeroupper;
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	FRAME_END
72362306a36Sopenharmony_ci	RET;
72462306a36Sopenharmony_ciSYM_FUNC_END(serpent_cbc_dec_16way)
725