/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 */
1162306a36Sopenharmony_ci
#include <linux/linkage.h>

/*
 * AT&T-syntax GNU as source that is run through the C preprocessor first
 * (hence #include/#define and the /* ... *​/ comments).
 */
.file "serpent-sse2-x86_64-asm_64.S"
.text

/*
 * First integer argument register.  The entry points in this file document
 * it as "ctx"; get_key() reads 32-bit round-key words relative to it.
 */
#define CTX %rdi

/**********************************************************************
  8-way SSE2 serpent
 **********************************************************************/
/*
 * Register allocation.
 *
 * Two independent sets of five XMM registers are used.  The macros below
 * take base names (RA..RE) and paste the suffix with "## 1" / "## 2" to
 * address set 1 or set 2.  read_blocks() fills four registers of a set
 * from four 16-byte blocks, so the two sets together carry eight blocks.
 */

/* Set 1: working registers (four state words plus one spare). */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

/* Set 2: same roles as set 1, for the second group of four blocks. */
#define RA2 %xmm5
#define RB2 %xmm6
#define RC2 %xmm7
#define RD2 %xmm8
#define RE2 %xmm9

/*
 * All-ones constant; "pxor RNOT, r" is therefore a bitwise NOT of r.
 * The visible entry point initialises it with "pcmpeqd RNOT, RNOT".
 */
#define RNOT %xmm10

/*
 * Round-key words, each broadcast to all four 32-bit lanes by get_key().
 * RK0..RK2 are also handed to read_blocks/write_blocks/xor_blocks as
 * scratch registers at the call sites in this file.
 */
#define RK0 %xmm11
#define RK1 %xmm12
#define RK2 %xmm13
#define RK3 %xmm14

/*
 * S-box S0 (used by the encryption rounds), written as a fixed sequence of
 * register-to-register boolean operations on x0..x4.
 *
 * It is split into two halves, S0_1 followed by S0_2, so that SP() can
 * place round-key loads between them.  The halves must be executed in this
 * order on the same five registers; the sequence is order-dependent.
 *
 * x4's incoming value is ignored (movdqa overwrites it before any read).
 * "pxor RNOT, r" inverts r and requires RNOT to hold all-ones.
 * After S0_2 the results are spread over the five registers; the next
 * macro invocation at the call site names which ones carry on.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	por x0,			x3; \
	pxor x4,		x0; \
	pxor x2,		x4; \
	pxor RNOT,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x4,		x1; \
	pxor x0,		x2;
#define S0_2(x0, x1, x2, x3, x4) \
	pxor x3,		x0; \
	por x0,			x4; \
	pxor x2,		x0; \
	pand x1,		x2; \
	pxor x2,		x3; \
	pxor RNOT,		x1; \
	pxor x4,		x2; \
	pxor x2,		x1;

/*
 * S-box S1 (used by the encryption rounds) as register-only boolean logic.
 * Two halves (S1_1 then S1_2) so SP() can interleave key loads; run them
 * in order on the same registers.  x4 is overwritten before it is read.
 * "pxor RNOT, r" is a bitwise NOT and needs RNOT == all-ones.
 */
#define S1_1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x1; \
	pxor x3,		x0; \
	pxor RNOT,		x3; \
	pand x1,		x4; \
	por x1,			x0; \
	pxor x2,		x3; \
	pxor x3,		x0; \
	pxor x3,		x1;
#define S1_2(x0, x1, x2, x3, x4) \
	pxor x4,		x3; \
	por x4,			x1; \
	pxor x2,		x4; \
	pand x0,		x2; \
	pxor x1,		x2; \
	por x0,			x1; \
	pxor RNOT,		x0; \
	pxor x2,		x0; \
	pxor x1,		x4;

/*
 * S-box S2 (used by the encryption rounds) as register-only boolean logic.
 * Two halves (S2_1 then S2_2) so SP() can interleave key loads; run them
 * in order on the same registers.  x4 is first written (copy of x0) before
 * it is ever read.  "pxor RNOT, r" is a bitwise NOT (RNOT == all-ones).
 */
#define S2_1(x0, x1, x2, x3, x4) \
	pxor RNOT,		x3; \
	pxor x0,		x1; \
	movdqa x0,		x4; \
	pand x2,		x0; \
	pxor x3,		x0; \
	por x4,			x3; \
	pxor x1,		x2; \
	pxor x1,		x3; \
	pand x0,		x1;
#define S2_2(x0, x1, x2, x3, x4) \
	pxor x2,		x0; \
	pand x3,		x2; \
	por x1,			x3; \
	pxor RNOT,		x0; \
	pxor x0,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	por x2,			x1;

/*
 * S-box S3 (used by the encryption rounds) as register-only boolean logic.
 * Two halves (S3_1 then S3_2) so SP() can interleave key loads; run them
 * in order on the same registers.  x4 is overwritten before it is read.
 * This S-box does not use RNOT.
 */
#define S3_1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x3,		x1; \
	por x0,			x3; \
	pand x0,		x4; \
	pxor x2,		x0; \
	pxor x1,		x2; \
	pand x3,		x1; \
	pxor x3,		x2; \
	por x4,			x0; \
	pxor x3,		x4;
#define S3_2(x0, x1, x2, x3, x4) \
	pxor x0,		x1; \
	pand x3,		x0; \
	pand x4,		x3; \
	pxor x2,		x3; \
	por x1,			x4; \
	pand x1,		x2; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	pxor x2,		x3;

/*
 * S-box S4 (used by the encryption rounds) as register-only boolean logic.
 * Two halves (S4_1 then S4_2) so SP() can interleave key loads; run them
 * in order on the same registers.  x4 is overwritten before it is read.
 * "pxor RNOT, r" is a bitwise NOT and needs RNOT == all-ones.
 */
#define S4_1(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x4,		x0; \
	pxor x2,		x3; \
	por x4,			x2; \
	pxor x1,		x0; \
	pxor x3,		x4; \
	por x0,			x2; \
	pxor x1,		x2;
#define S4_2(x0, x1, x2, x3, x4) \
	pand x0,		x1; \
	pxor x4,		x1; \
	pand x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x4; \
	por x1,			x3; \
	pxor RNOT,		x1; \
	pxor x0,		x3;

/*
 * S-box S5 (used by the encryption rounds) as register-only boolean logic.
 * Two halves (S5_1 then S5_2) so SP() can interleave key loads; run them
 * in order on the same registers.  x4 is overwritten before it is read.
 * "pxor RNOT, r" is a bitwise NOT and needs RNOT == all-ones.
 */
#define S5_1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x0,			x1; \
	pxor x1,		x2; \
	pxor RNOT,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	pand x4,		x1; \
	por x3,			x4; \
	pxor x0,		x4;
#define S5_2(x0, x1, x2, x3, x4) \
	pand x3,		x0; \
	pxor x3,		x1; \
	pxor x2,		x3; \
	pxor x1,		x0; \
	pand x4,		x2; \
	pxor x2,		x1; \
	pand x0,		x2; \
	pxor x2,		x3;

/*
 * S-box S6 (used by the encryption rounds) as register-only boolean logic.
 * Two halves (S6_1 then S6_2) so SP() can interleave key loads; run them
 * in order on the same registers.  x4 is overwritten before it is read.
 * "pxor RNOT, r" is a bitwise NOT and needs RNOT == all-ones.
 */
#define S6_1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x3; \
	pxor x2,		x1; \
	pxor x0,		x2; \
	pand x3,		x0; \
	por x3,			x1; \
	pxor RNOT,		x4; \
	pxor x1,		x0; \
	pxor x2,		x1;
#define S6_2(x0, x1, x2, x3, x4) \
	pxor x4,		x3; \
	pxor x0,		x4; \
	pand x0,		x2; \
	pxor x1,		x4; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x3; \
	pxor x2,		x1;

/*
 * S-box S7 (used by the encryption rounds) as register-only boolean logic.
 * Two halves (S7_1 then S7_2) so SP() can interleave key loads; run them
 * in order on the same registers.  x4 is overwritten (with the inverted
 * x1) before it is read.  "pxor RNOT, r" is a bitwise NOT and needs
 * RNOT == all-ones.
 */
#define S7_1(x0, x1, x2, x3, x4) \
	pxor RNOT,		x1; \
	movdqa x1,		x4; \
	pxor RNOT,		x0; \
	pand x2,		x1; \
	pxor x3,		x1; \
	por x4,			x3; \
	pxor x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	por x1,			x0;
#define S7_2(x0, x1, x2, x3, x4) \
	pand x0,		x2; \
	pxor x4,		x0; \
	pxor x3,		x4; \
	pand x0,		x3; \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pxor x1,		x3; \
	por x0,			x4; \
	pxor x1,		x4;

/*
 * SI0 as register-only boolean logic on x0..x4.
 * NOTE(review): going by the name this is presumably the inverse of S0 for
 * the decryption path; no invocation is visible in this part of the file,
 * so confirm against the decrypt routine.
 *
 * Two halves (SI0_1 then SI0_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  "pxor RNOT, r" is a bitwise NOT
 * and needs RNOT == all-ones.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pxor x0,		x1; \
	por x1,			x3; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	pand x1,		x0; \
	pxor x2,		x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	pand x3,		x2; \
	pxor x4,		x3; \
	pxor x3,		x2; \
	pxor x3,		x1; \
	pand x0,		x3; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pxor x3,		x4;

/*
 * SI1 as register-only boolean logic on x0..x4.
 * NOTE(review): presumably the inverse of S1 for decryption; no invocation
 * is visible in this part of the file — confirm against the decrypt routine.
 *
 * Two halves (SI1_1 then SI1_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  "pxor RNOT, r" is a bitwise NOT
 * and needs RNOT == all-ones.
 */
#define SI1_1(x0, x1, x2, x3, x4) \
	pxor x3,		x1; \
	movdqa x0,		x4; \
	pxor x2,		x0; \
	pxor RNOT,		x2; \
	por x1,			x4; \
	pxor x3,		x4; \
	pand x1,		x3; \
	pxor x2,		x1; \
	pand x4,		x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	pxor x1,		x4; \
	por x3,			x1; \
	pxor x0,		x3; \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x4,		x2; \
	pxor x0,		x1; \
	pxor x1,		x4;

/*
 * SI2 as register-only boolean logic on x0..x4.
 * NOTE(review): presumably the inverse of S2 for decryption; no invocation
 * is visible in this part of the file — confirm against the decrypt routine.
 *
 * Two halves (SI2_1 then SI2_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  "pxor RNOT, r" is a bitwise NOT
 * and needs RNOT == all-ones.
 */
#define SI2_1(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x3,		x4; \
	pxor RNOT,		x3; \
	por x2,			x3; \
	pxor x4,		x2; \
	pxor x0,		x4; \
	pxor x1,		x3; \
	por x2,			x1; \
	pxor x0,		x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	pxor x4,		x1; \
	por x3,			x4; \
	pxor x3,		x2; \
	pxor x2,		x4; \
	pand x1,		x2; \
	pxor x3,		x2; \
	pxor x4,		x3; \
	pxor x0,		x4;

/*
 * SI3 as register-only boolean logic on x0..x4.
 * NOTE(review): presumably the inverse of S3 for decryption; no invocation
 * is visible in this part of the file — confirm against the decrypt routine.
 *
 * Two halves (SI3_1 then SI3_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  This one does not use RNOT.
 */
#define SI3_1(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x1,		x4; \
	pand x2,		x1; \
	pxor x0,		x1; \
	por x4,			x0; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	por x1,			x3; \
	pxor x2,		x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	pxor x3,		x1; \
	pxor x2,		x0; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x1; \
	pand x2,		x0; \
	pxor x3,		x4; \
	pxor x0,		x3; \
	pxor x1,		x0;

/*
 * SI4 as register-only boolean logic on x0..x4.
 * NOTE(review): presumably the inverse of S4 for decryption; no invocation
 * is visible in this part of the file — confirm against the decrypt routine.
 *
 * Two halves (SI4_1 then SI4_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  "pxor RNOT, r" is a bitwise NOT
 * and needs RNOT == all-ones.
 */
#define SI4_1(x0, x1, x2, x3, x4) \
	pxor x3,		x2; \
	movdqa x0,		x4; \
	pand x1,		x0; \
	pxor x2,		x0; \
	por x3,			x2; \
	pxor RNOT,		x4; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pand x4,		x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x3,		x0; \
	pand x2,		x3; \
	pxor x3,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x1,		x4; \
	pxor x3,		x0;

/*
 * SI5 as register-only boolean logic on x0..x4.
 * NOTE(review): presumably the inverse of S5 for decryption; no invocation
 * is visible in this part of the file — confirm against the decrypt routine.
 *
 * Two halves (SI5_1 then SI5_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  "pxor RNOT, r" is a bitwise NOT
 * and needs RNOT == all-ones.
 */
#define SI5_1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x2,			x1; \
	pxor x4,		x2; \
	pxor x3,		x1; \
	pand x4,		x3; \
	pxor x3,		x2; \
	por x0,			x3; \
	pxor RNOT,		x0; \
	pxor x2,		x3; \
	por x0,			x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pand x0,		x4; \
	pxor x1,		x0; \
	pxor x3,		x1; \
	pand x2,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x4,		x2; \
	pxor x3,		x4;

/*
 * SI6 as register-only boolean logic on x0..x4.
 * NOTE(review): presumably the inverse of S6 for decryption; no invocation
 * is visible in this part of the file — confirm against the decrypt routine.
 *
 * Two halves (SI6_1 then SI6_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  "pxor RNOT, r" is a bitwise NOT
 * and needs RNOT == all-ones.
 */
#define SI6_1(x0, x1, x2, x3, x4) \
	pxor x2,		x0; \
	movdqa x0,		x4; \
	pand x3,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x1,		x3; \
	por x4,			x2; \
	pxor x3,		x2; \
	pand x0,		x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	pxor RNOT,		x0; \
	pxor x1,		x3; \
	pand x2,		x1; \
	pxor x0,		x4; \
	pxor x4,		x3; \
	pxor x2,		x4; \
	pxor x1,		x0; \
	pxor x0,		x2;

/*
 * SI7 as register-only boolean logic on x0..x4.
 * NOTE(review): presumably the inverse of S7 for decryption; no invocation
 * is visible in this part of the file — confirm against the decrypt routine.
 *
 * Two halves (SI7_1 then SI7_2), to be run in order on the same registers.
 * x4 is overwritten before it is read.  "pxor RNOT, r" is a bitwise NOT
 * and needs RNOT == all-ones.
 */
#define SI7_1(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x2,		x0; \
	por x4,			x2; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	por x3,			x1; \
	pxor x0,		x4; \
	pand x2,		x0; \
	pxor x1,		x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	pand x2,		x1; \
	pxor x2,		x3; \
	pxor x3,		x4; \
	pand x3,		x2; \
	por x0,			x3; \
	pxor x4,		x1; \
	pxor x4,		x3; \
	pand x0,		x4; \
	pxor x2,		x4;

/*
 * get_key(i, j, t) - load one round-key word and broadcast it.
 *
 * Reads the 32-bit word at byte offset (4*i + j) * 4 from CTX into the low
 * lane of XMM register t (movd), then copies that lane into all four
 * 32-bit lanes (pshufd with selector 0).
 *   i: round-key index, j: word within that round key (0..3 at the call
 *   sites in this file), t: destination XMM register.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K2(x0, x1, x2, x3, x4, i) - XOR round key i into both register sets.
 *
 * Loads the four words of round key i into RK0..RK3 and XORs them into
 * x0..x3 of set 1 (names suffixed "1") and set 2 (suffixed "2").
 * x4 is accepted for signature symmetry with LK2/KL2 but is not touched.
 * Clobbers RK0..RK3.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	pxor RK0,		x0 ## 1; \
	pxor RK1,		x1 ## 1; \
	pxor RK2,		x2 ## 1; \
	pxor RK3,		x3 ## 1; \
		pxor RK0,		x0 ## 2; \
		pxor RK1,		x1 ## 2; \
		pxor RK2,		x2 ## 2; \
		pxor RK3,		x3 ## 2;

/*
 * LK2(x0, x1, x2, x3, x4, i) - word-mixing step followed by XOR with
 * round key i, applied to both register sets.
 *
 * Per 32-bit lane, with rol() a left rotate (built from pslld/psrld/por,
 * SSE2 has no rotate) and << a plain left shift:
 *
 *	x0 = rol(x0, 13);	x2 = rol(x2, 3);
 *	x1 ^= x0 ^ x2;		x1 = rol(x1, 1);
 *	x3 ^= x2 ^ (x0 << 3);	x3 = rol(x3, 7);
 *	x0 ^= x1 ^ x3;		x2 ^= x3 ^ (x1 << 7);
 *	x1 ^= RK1;		x3 ^= RK3;
 *	x0 = rol(x0, 5);	x2 = rol(x2, 22);
 *	x0 ^= RK0;		x2 ^= RK2;
 *
 * x4 is scratch.  Set-1 instructions are at the outer indent, set-2
 * instructions one tab deeper; the two streams and the four get_key()
 * loads are interleaved, so x4 ## 1 stays live across set-2 code in places.
 * Clobbers x4 of both sets and RK0..RK3.  Do not reorder.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	/* set 1: x0 = rol(x0, 13); x1 ^= x0; x2 = rol(x2, 3); x1 ^= x2 */ \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $13,		x0 ## 1; \
	psrld $(32 - 13),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	pxor x0 ## 1,		x1 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	pslld $3,		x2 ## 1; \
	psrld $(32 - 3),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
	pxor x2 ## 1,		x1 ## 1; \
		/* set 2: same step */ \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $13,		x0 ## 2; \
		psrld $(32 - 13),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		pxor x0 ## 2,		x1 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		pslld $3,		x2 ## 2; \
		psrld $(32 - 3),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2; \
		pxor x2 ## 2,		x1 ## 2; \
	/* set 1: x1 = rol(x1, 1); x3 ^= x2 ^ (x0 << 3); x4 = copy of x3 */ \
	movdqa x1 ## 1,		x4 ## 1; \
	pslld $1,		x1 ## 1; \
	psrld $(32 - 1),	x4 ## 1; \
	por x4 ## 1,		x1 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $3,		x4 ## 1; \
	pxor x2 ## 1,		x3 ## 1; \
	pxor x4 ## 1,		x3 ## 1; \
	movdqa x3 ## 1,		x4 ## 1; \
	get_key(i, 1, RK1); \
		/* set 2: same step */ \
		movdqa x1 ## 2,		x4 ## 2; \
		pslld $1,		x1 ## 2; \
		psrld $(32 - 1),	x4 ## 2; \
		por x4 ## 2,		x1 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $3,		x4 ## 2; \
		pxor x2 ## 2,		x3 ## 2; \
		pxor x4 ## 2,		x3 ## 2; \
		movdqa x3 ## 2,		x4 ## 2; \
		get_key(i, 3, RK3); \
	/* set 1: x3 = rol(x3, 7); x0 ^= x1 ^ x3; x2 ^= x3 ^ (x1 << 7) */ \
	pslld $7,		x3 ## 1; \
	psrld $(32 - 7),	x4 ## 1; \
	por x4 ## 1,		x3 ## 1; \
	movdqa x1 ## 1,		x4 ## 1; \
	pslld $7,		x4 ## 1; \
	pxor x1 ## 1,		x0 ## 1; \
	pxor x3 ## 1,		x0 ## 1; \
	pxor x3 ## 1,		x2 ## 1; \
	pxor x4 ## 1,		x2 ## 1; \
	get_key(i, 0, RK0); \
		/* set 2: same step */ \
		pslld $7,		x3 ## 2; \
		psrld $(32 - 7),	x4 ## 2; \
		por x4 ## 2,		x3 ## 2; \
		movdqa x1 ## 2,		x4 ## 2; \
		pslld $7,		x4 ## 2; \
		pxor x1 ## 2,		x0 ## 2; \
		pxor x3 ## 2,		x0 ## 2; \
		pxor x3 ## 2,		x2 ## 2; \
		pxor x4 ## 2,		x2 ## 2; \
		get_key(i, 2, RK2); \
	/* set 1: key words 1/3, x0 = rol(x0, 5), x2 = rol(x2, 22), key words 0/2 */ \
	pxor RK1,		x1 ## 1; \
	pxor RK3,		x3 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $5,		x0 ## 1; \
	psrld $(32 - 5),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	pslld $22,		x2 ## 1; \
	psrld $(32 - 22),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
	pxor RK0,		x0 ## 1; \
	pxor RK2,		x2 ## 1; \
		/* set 2: same step */ \
		pxor RK1,		x1 ## 2; \
		pxor RK3,		x3 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $5,		x0 ## 2; \
		psrld $(32 - 5),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		pslld $22,		x2 ## 2; \
		psrld $(32 - 22),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2; \
		pxor RK0,		x0 ## 2; \
		pxor RK2,		x2 ## 2;

/*
 * KL2(x0, x1, x2, x3, x4, i) - XOR with the round key already held in
 * RK0..RK3, followed by the reverse of LK2's word-mixing step, applied to
 * both register sets.
 *
 * Per 32-bit lane, with ror() a right rotate (psrld/pslld/por) and << a
 * plain left shift:
 *
 *	x0 ^= RK0;  x2 ^= RK2;	x0 = ror(x0, 5);
 *	x3 ^= RK3;  x1 ^= RK1;	x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);	x0 ^= x3 ^ x1;
 *	x1 = ror(x1, 1);	x3 = ror(x3, 7);
 *	x1 ^= x0;		x3 ^= (x0 << 3);
 *	x0 = ror(x0, 13);
 *	x1 ^= x2;		x3 ^= x2;
 *	x2 = ror(x2, 3);
 *
 * This macro never calls get_key(): RK0..RK3 must be loaded beforehand
 * (SP() is the macro in this file that loads all four).  The parameter i
 * is accepted but unused.  x4 is scratch; set-2 instructions are indented
 * one tab deeper and interleaved with set 1.  Do not reorder.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	/* set 1: key XOR, x0 = ror(x0, 5), x2 = ror(x2, 22), x2 ^= x3 */ \
	pxor RK0,		x0 ## 1; \
	pxor RK2,		x2 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	psrld $5,		x0 ## 1; \
	pslld $(32 - 5),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	pxor RK3,		x3 ## 1; \
	pxor RK1,		x1 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	psrld $22,		x2 ## 1; \
	pslld $(32 - 22),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
	pxor x3 ## 1,		x2 ## 1; \
		/* set 2: same step */ \
		pxor RK0,		x0 ## 2; \
		pxor RK2,		x2 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		psrld $5,		x0 ## 2; \
		pslld $(32 - 5),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		pxor RK3,		x3 ## 2; \
		pxor RK1,		x1 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		psrld $22,		x2 ## 2; \
		pslld $(32 - 22),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2; \
		pxor x3 ## 2,		x2 ## 2; \
	/* set 1: x0 ^= x3 ^ x1; x2 ^= (x1 << 7); x1 = ror(x1, 1) */ \
	pxor x3 ## 1,		x0 ## 1; \
	movdqa x1 ## 1,		x4 ## 1; \
	pslld $7,		x4 ## 1; \
	pxor x1 ## 1,		x0 ## 1; \
	pxor x4 ## 1,		x2 ## 1; \
	movdqa x1 ## 1,		x4 ## 1; \
	psrld $1,		x1 ## 1; \
	pslld $(32 - 1),	x4 ## 1; \
	por x4 ## 1,		x1 ## 1; \
		/* set 2: same step */ \
		pxor x3 ## 2,		x0 ## 2; \
		movdqa x1 ## 2,		x4 ## 2; \
		pslld $7,		x4 ## 2; \
		pxor x1 ## 2,		x0 ## 2; \
		pxor x4 ## 2,		x2 ## 2; \
		movdqa x1 ## 2,		x4 ## 2; \
		psrld $1,		x1 ## 2; \
		pslld $(32 - 1),	x4 ## 2; \
		por x4 ## 2,		x1 ## 2; \
	/* set 1: x3 = ror(x3, 7); x1 ^= x0; x3 ^= (x0 << 3); x4 = copy of x0 */ \
	movdqa x3 ## 1,		x4 ## 1; \
	psrld $7,		x3 ## 1; \
	pslld $(32 - 7),	x4 ## 1; \
	por x4 ## 1,		x3 ## 1; \
	pxor x0 ## 1,		x1 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $3,		x4 ## 1; \
	pxor x4 ## 1,		x3 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
		/* set 2: same step */ \
		movdqa x3 ## 2,		x4 ## 2; \
		psrld $7,		x3 ## 2; \
		pslld $(32 - 7),	x4 ## 2; \
		por x4 ## 2,		x3 ## 2; \
		pxor x0 ## 2,		x1 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $3,		x4 ## 2; \
		pxor x4 ## 2,		x3 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
	/* set 1: x0 = ror(x0, 13); x1 ^= x2; x3 ^= x2; x2 = ror(x2, 3) */ \
	psrld $13,		x0 ## 1; \
	pslld $(32 - 13),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	pxor x2 ## 1,		x1 ## 1; \
	pxor x2 ## 1,		x3 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	psrld $3,		x2 ## 1; \
	pslld $(32 - 3),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
		/* set 2: same step */ \
		psrld $13,		x0 ## 2; \
		pslld $(32 - 13),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		pxor x2 ## 2,		x1 ## 2; \
		pxor x2 ## 2,		x3 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		psrld $3,		x2 ## 2; \
		pslld $(32 - 3),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2;

/*
 * S(SBOX, x0, x1, x2, x3, x4) - apply one S-box to both register sets.
 *
 * SBOX is a macro-name stem (e.g. S0); its two halves SBOX_1 and SBOX_2
 * are expanded back to back, first on set 1 (names suffixed "1"), then on
 * set 2 (suffixed "2").  No key material is loaded here.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * SP(SBOX, x0, x1, x2, x3, x4, i) - apply one S-box to both register sets
 * while preloading round key i.
 *
 * Same S-box work as S(), but the halves of the two sets are interleaved
 * (_1 on set 1, _1 on set 2, _2 on set 1, _2 on set 2) with a get_key()
 * load in front of each, so that RK0..RK3 hold round key i afterwards.
 * Load order is RK0, RK2, RK3, RK1.  Clobbers RK0..RK3.
 *
 * The last statement deliberately has no trailing line-continuation, so
 * the macro ends here regardless of what follows.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 3, RK3); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2) - in-place transpose of the
 * 4x4 matrix of 32-bit words held in x0..x3 (one row per register).
 *
 * With rows a, b, c, d on entry, x0..x3 end up as
 *	x0 = {a0, b0, c0, d0}, x1 = {a1, b1, c1, d1},
 *	x2 = {a2, b2, c2, d2}, x3 = {a3, b3, c3, d3}.
 * First the dword unpacks pair up rows a/b and c/d, then the qword
 * unpacks combine the pairs.
 *
 * Clobbers t1 and t2.  t0 is not used by this macro; it is only part of
 * the parameter list shared with read_blocks/write_blocks/xor_blocks.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0,		t2; \
	punpckldq x1,		x0; \
	punpckhdq x1,		t2; \
	movdqa x2,		t1; \
	punpckhdq x3,		x2; \
	punpckldq x3,		t1; \
	movdqa x0,		x1; \
	punpcklqdq t1,		x0; \
	punpckhqdq t1,		x1; \
	movdqa t2,		x3; \
	punpcklqdq x2,		t2; \
	punpckhqdq x2,		x3; \
	movdqa t2,		x2;

/*
 * read_blocks(in, x0, x1, x2, x3, t0, t1, t2) - load four 16-byte blocks.
 *
 * Loads 4 * 16 consecutive bytes from the address in register "in" with
 * unaligned loads (movdqu), one block per register, then transposes so
 * that x0 holds word 0 of all four blocks, x1 word 1, and so on.
 * Clobbers t1 and t2 (via transpose_4x4).
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in),	x0; \
	movdqu (1*4*4)(in),	x1; \
	movdqu (2*4*4)(in),	x2; \
	movdqu (3*4*4)(in),	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, x0, x1, x2, x3, t0, t1, t2) - store four 16-byte blocks.
 *
 * Transposes x0..x3 back to one-block-per-register layout and writes
 * 4 * 16 consecutive bytes to the address in register "out" with
 * unaligned stores (movdqu).  Clobbers x0..x3 (they are left transposed),
 * t1 and t2.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0,		(0*4*4)(out); \
	movdqu x1,		(1*4*4)(out); \
	movdqu x2,		(2*4*4)(out); \
	movdqu x3,		(3*4*4)(out);

/*
 * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) - XOR four 16-byte blocks
 * into memory.
 *
 * Transposes x0..x3 back to one-block-per-register layout, then for each
 * block: loads the 16 bytes currently at "out" into t0, XORs them into
 * the register and stores the result back to the same location
 * (all unaligned accesses).  Clobbers x0..x3, t0, t1 and t2.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out),	t0; \
	pxor t0,		x0; \
	movdqu x0,		(0*4*4)(out); \
	movdqu (1*4*4)(out),	t0; \
	pxor t0,		x1; \
	movdqu x1,		(1*4*4)(out); \
	movdqu (2*4*4)(out),	t0; \
	pxor t0,		x2; \
	movdqu x2,		(2*4*4)(out); \
	movdqu (3*4*4)(out),	t0; \
	pxor t0,		x3; \
	movdqu x3,		(3*4*4)(out);

SYM_FUNC_START(__serpent_enc_blk_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Encrypts eight 16-byte blocks: reads 8 * 16 bytes from src and
	 * either stores the result to dst or, when the low byte of %rcx is
	 * non-zero, XORs the result into the 8 * 16 bytes already at dst.
	 * All memory accesses to src/dst are unaligned-safe (movdqu).
	 *
	 * Clobbers: %rax, %xmm0-%xmm14, flags.  No stack is used.
	 */

	/* RNOT = all-ones; the S-box macros use it for bitwise NOT. */
	pcmpeqd RNOT, RNOT;

	/* Blocks 0-3 go to register set 1, blocks 4-7 (src + 64) to set 2. */
	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Round key 0 is XORed in first; then S0..S7 are applied four times
	 * over (32 S-box steps).  Each of the first 31 is followed by LK2
	 * with round keys 1..31, the last by a plain key XOR with round
	 * key 32.  The register names permute from line to line because each
	 * S-box leaves its outputs in different registers; the argument
	 * order of the following LK2/S pair tracks that.
	 */
						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	/* %rax = dst + 64: destination for the set-2 blocks (4-7). */
	leaq (4*4*4)(%rsi), %rax;

	/* Only the low byte of the bool argument is examined. */
	testb %cl, %cl;
	jnz .L__enc_xor8;

	/* Plain store of all eight blocks. */
	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;

.L__enc_xor8:
	/* XOR all eight blocks into the data already present at dst. */
	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk_8way)

/*
 * serpent_dec_blk_8way - unrolled decryption path of the 8-way SSE2 Serpent
 * implementation.
 *
 * Data is loaded from src into two register groups (RA1..RD1 and RA2..RD2),
 * pushed through a fixed chain of macro invocations, and stored to dst.
 * The chain starts with key index 32, applies the SI7..SI0 macros four times
 * in a row (32 steps in total) while the key index counts down, and ends
 * with key index 0.
 *
 * %rax is overwritten (used as a second data pointer).
 * NOTE(review): which xmm registers are modified depends on the macro bodies
 * defined earlier in the file; do not assume any of them is preserved.
 */
68762306a36Sopenharmony_ciSYM_FUNC_START(serpent_dec_blk_8way)
68862306a36Sopenharmony_ci	/* input:
68962306a36Sopenharmony_ci	 *	%rdi: ctx, CTX
69062306a36Sopenharmony_ci	 *	%rsi: dst
69162306a36Sopenharmony_ci	 *	%rdx: src
69262306a36Sopenharmony_ci	 */
69362306a36Sopenharmony_ci
	/*
	 * Comparing RNOT with itself for equality sets every bit, so RNOT
	 * becomes an all-ones constant.
	 * NOTE(review): presumably consumed by the S-box macros as a NOT
	 * mask; their bodies are outside this view.
	 */
69462306a36Sopenharmony_ci	pcmpeqd RNOT, RNOT;
69562306a36Sopenharmony_ci
	/*
	 * %rax = src + 64 bytes.  The first read_blocks is given src and the
	 * RA1..RD1 registers, the second is given src + 64 and RA2..RD2.
	 * NOTE(review): 4*4*4 presumably means 4 blocks x 4 words x 4 bytes,
	 * and RK0..RK2 look like scratch registers at this point; confirm
	 * against the read_blocks definition.
	 */
69662306a36Sopenharmony_ci	leaq (4*4*4)(%rdx), %rax;
69762306a36Sopenharmony_ci	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
69862306a36Sopenharmony_ci	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
69962306a36Sopenharmony_ci
	/*
	 * Unrolled round chain.  The register list handed to each SP() or S()
	 * is exactly the list the preceding K2() or KL2() was given, so the
	 * statement order and argument order must be kept as they are.
	 * NOTE(review): the unsuffixed names RA..RE presumably expand to both
	 * the "1" and the "2" register set inside the macros, and the last
	 * numeric argument is presumably the subkey index; confirm against
	 * the macro definitions.
	 */
70062306a36Sopenharmony_ci						 K2(RA, RB, RC, RD, RE, 32);
70162306a36Sopenharmony_ci	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
70262306a36Sopenharmony_ci	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
70362306a36Sopenharmony_ci	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
70462306a36Sopenharmony_ci	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
70562306a36Sopenharmony_ci	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
70662306a36Sopenharmony_ci	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
70762306a36Sopenharmony_ci	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
70862306a36Sopenharmony_ci	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
70962306a36Sopenharmony_ci	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
71062306a36Sopenharmony_ci	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
71162306a36Sopenharmony_ci	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
71262306a36Sopenharmony_ci	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
71362306a36Sopenharmony_ci	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
71462306a36Sopenharmony_ci	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
71562306a36Sopenharmony_ci	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
71662306a36Sopenharmony_ci	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
71762306a36Sopenharmony_ci	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
71862306a36Sopenharmony_ci	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
71962306a36Sopenharmony_ci	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
72062306a36Sopenharmony_ci	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
72162306a36Sopenharmony_ci	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
72262306a36Sopenharmony_ci	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
72362306a36Sopenharmony_ci	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
72462306a36Sopenharmony_ci	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
72562306a36Sopenharmony_ci	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
72662306a36Sopenharmony_ci	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
72762306a36Sopenharmony_ci	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
72862306a36Sopenharmony_ci	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
72962306a36Sopenharmony_ci	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
73062306a36Sopenharmony_ci	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
73162306a36Sopenharmony_ci	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	/* Final step: plain S() and K2() with key index 0 close the chain. */
73262306a36Sopenharmony_ci	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
73362306a36Sopenharmony_ci
	/*
	 * Store the result.  The registers written out are the first four
	 * named in the final K2() (RC, RD, RB, RE): the "1" set goes to dst
	 * and the "2" set to dst + 64 (%rax).
	 */
73462306a36Sopenharmony_ci	leaq (4*4*4)(%rsi), %rax;
73562306a36Sopenharmony_ci	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
73662306a36Sopenharmony_ci	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
73762306a36Sopenharmony_ci
73862306a36Sopenharmony_ci	RET;
73962306a36Sopenharmony_ciSYM_FUNC_END(serpent_dec_blk_8way)
740