/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 */

/*
 * NOTE(review): SYM_FUNC_START, SYM_FUNC_END and RET used further down are
 * not defined in this file; presumably they come from this header - confirm.
 */
#include <linux/linkage.h>

.file "serpent-sse2-i586-asm_32.S"
.text

/*
 * Byte offsets of the stack-passed arguments, relative to %esp as it is on
 * function entry.  Both entry points read them as arg_xxx(%esp) without
 * adjusting %esp first, so these offsets stay valid for the whole function.
 */
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/
/* Base address of the subkey words; loaded from arg_ctx(%esp). */
#define CTX %edx

/*
 * Five working registers.  After read_blocks, four of them each hold one
 * 32-bit word position of four blocks (one block per lane); the fifth is the
 * temporary handed to the S-box/K/LK/KL macros.  Which alias plays which role
 * rotates from round to round (see the call sequences in the functions).
 */
#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

/* Scratch registers used for broadcast subkey words. */
#define RT0 %xmm5
#define RT1 %xmm6

/* All-ones constant (set with pcmpeqd); "pxor RNOT, x" is a bitwise NOT. */
#define RNOT %xmm7

/*
 * get_key(i, j, t): load subkey word j (0..3) of round key i and broadcast it.
 *
 * Reads the 32-bit word at byte offset (4*i + j)*4 from CTX into the low
 * dword of t, then pshufd $0 replicates that dword into all four lanes so
 * the same key word is applied to every block.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K(x0, x1, x2, x3, x4, i): key mixing only.
 *
 *	x0 ^= k[i][0];  x1 ^= k[i][1];  x2 ^= k[i][2];  x3 ^= k[i][3];
 *
 * x4 is used as a key temporary (it ends up holding the broadcast of
 * k[i][3]); RT0 and RT1 are clobbered as well.
 */
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4,		x0; \
	pxor RT0,		x1; \
	pxor RT1,		x2; \
	get_key(i, 3, x4); \
	pxor x4,		x3;

/*
 * LK(x0, x1, x2, x3, x4, i): linear mixing step of an encryption round
 * followed by key mixing with round key i.
 *
 * Per 32-bit lane (rol = rotate left):
 *	x0 = rol(x0, 13);
 *	x2 = rol(x2, 3);
 *	x1 = rol(x1 ^ x0 ^ x2, 1);
 *	x3 = rol(x3 ^ x2 ^ (x0 << 3), 7);
 *	x0 = rol(x0 ^ x1 ^ x3, 5)          ^ k[i][0];
 *	x2 = rol(x2 ^ x3 ^ (x1 << 7), 22)  ^ k[i][2];
 *	x1 ^= k[i][1];
 *	x3 ^= k[i][3];
 *
 * Each rotate is built from a copy into x4, a left shift, a right shift by
 * (32 - n) and an OR.  The key words for x1/x3 are XORed in only after the
 * un-keyed x1/x3 have been consumed by the x0/x2 updates.
 *
 * Clobbers x4 and RT0.
 */
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0,		x4; \
	pslld $13,		x0; \
	psrld $(32 - 13),	x4; \
	por x4,			x0; \
	pxor x0,		x1; \
	movdqa x2,		x4; \
	pslld $3,		x2; \
	psrld $(32 - 3),	x4; \
	por x4,			x2; \
	pxor x2,		x1; \
	movdqa x1,		x4; \
	pslld $1,		x1; \
	psrld $(32 - 1),	x4; \
	por x4,			x1; \
	movdqa x0,		x4; \
	pslld $3,		x4; \
	pxor x2,		x3; \
	pxor x4,		x3; \
	movdqa x3,		x4; \
	pslld $7,		x3; \
	psrld $(32 - 7),	x4; \
	por x4,			x3; \
	movdqa x1,		x4; \
	pslld $7,		x4; \
	pxor x1,		x0; \
	pxor x3,		x0; \
	pxor x3,		x2; \
	pxor x4,		x2; \
	movdqa x0,		x4; \
	get_key(i, 1, RT0); \
	pxor RT0,		x1; \
	get_key(i, 3, RT0); \
	pxor RT0,		x3; \
	pslld $5,		x0; \
	psrld $(32 - 5),	x4; \
	por x4,			x0; \
	movdqa x2,		x4; \
	pslld $22,		x2; \
	psrld $(32 - 22),	x4; \
	por x4,			x2; \
	get_key(i, 0, RT0); \
	pxor RT0,		x0; \
	get_key(i, 2, RT0); \
	pxor RT0,		x2;

/*
 * KL(x0, x1, x2, x3, x4, i): key mixing with round key i followed by the
 * inverse of the linear mixing step performed by LK (decryption direction).
 *
 * Per 32-bit lane (ror = rotate right):
 *	x0..x3 ^= k[i][0..3];			(via K)
 *	x0 = ror(x0, 5);
 *	x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);
 *	x0 ^= x1 ^ x3;
 *	x1 = ror(x1, 1);
 *	x3 = ror(x3, 7);
 *	x1 ^= x0;   x3 ^= x0 << 3;		(x0 before its final rotate)
 *	x0 = ror(x0, 13);
 *	x1 ^= x2;   x3 ^= x2;			(x2 before its final rotate)
 *	x2 = ror(x2, 3);
 *
 * Clobbers x4, RT0 and RT1 (the latter two through K).
 */
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0,		x4; \
	psrld $5,		x0; \
	pslld $(32 - 5),	x4; \
	por x4,			x0; \
	movdqa x2,		x4; \
	psrld $22,		x2; \
	pslld $(32 - 22),	x4; \
	por x4,			x2; \
	pxor x3,		x2; \
	pxor x3,		x0; \
	movdqa x1,		x4; \
	pslld $7,		x4; \
	pxor x1,		x0; \
	pxor x4,		x2; \
	movdqa x1,		x4; \
	psrld $1,		x1; \
	pslld $(32 - 1),	x4; \
	por x4,			x1; \
	movdqa x3,		x4; \
	psrld $7,		x3; \
	pslld $(32 - 7),	x4; \
	por x4,			x3; \
	pxor x0,		x1; \
	movdqa x0,		x4; \
	pslld $3,		x4; \
	pxor x4,		x3; \
	movdqa x0,		x4; \
	psrld $13,		x0; \
	pslld $(32 - 13),	x4; \
	por x4,			x0; \
	pxor x2,		x1; \
	pxor x2,		x3; \
	movdqa x2,		x4; \
	psrld $3,		x2; \
	pslld $(32 - 3),	x4; \
	por x4,			x2;

/*
 * S0(x0, x1, x2, x3, x4): encryption S-box 0 in bitsliced form, i.e. computed
 * with pand/por/pxor on whole registers.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x2, x1, x3, x0), in that order; x4 carries no result.
 */
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	por x0,			x3; \
	pxor x4,		x0; \
	pxor x2,		x4; \
	pxor RNOT,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x4,		x1; \
	pxor x0,		x2; \
	pxor x3,		x0; \
	por x0,			x4; \
	pxor x2,		x0; \
	pand x1,		x2; \
	pxor x2,		x3; \
	pxor RNOT,		x1; \
	pxor x4,		x2; \
	pxor x2,		x1;

/*
 * S1(x0, x1, x2, x3, x4): encryption S-box 1 in bitsliced form.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x4, x2, x3, x0), in that order; x1 carries no result.
 */
#define S1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x1; \
	pxor x3,		x0; \
	pxor RNOT,		x3; \
	pand x1,		x4; \
	por x1,			x0; \
	pxor x2,		x3; \
	pxor x3,		x0; \
	pxor x3,		x1; \
	pxor x4,		x3; \
	por x4,			x1; \
	pxor x2,		x4; \
	pand x0,		x2; \
	pxor x1,		x2; \
	por x0,			x1; \
	pxor RNOT,		x0; \
	pxor x2,		x0; \
	pxor x1,		x4;

/*
 * S2(x0, x1, x2, x3, x4): encryption S-box 2 in bitsliced form.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x4, x1, x0, x3), in that order; x2 carries no result.
 */
#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT,		x3; \
	pxor x0,		x1; \
	movdqa x0,		x4; \
	pand x2,		x0; \
	pxor x3,		x0; \
	por x4,			x3; \
	pxor x1,		x2; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x2,		x0; \
	pand x3,		x2; \
	por x1,			x3; \
	pxor RNOT,		x0; \
	pxor x0,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	por x2,			x1;

/*
 * S3(x0, x1, x2, x3, x4): encryption S-box 3 in bitsliced form.
 * This one needs no NOT, so RNOT is not referenced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 * Out: the callers in this file take the result words from
 *      (x3, x4, x1, x0), in that order; x2 carries no result.
 */
#define S3(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x3,		x1; \
	por x0,			x3; \
	pand x0,		x4; \
	pxor x2,		x0; \
	pxor x1,		x2; \
	pand x3,		x1; \
	pxor x3,		x2; \
	por x4,			x0; \
	pxor x3,		x4; \
	pxor x0,		x1; \
	pand x3,		x0; \
	pand x4,		x3; \
	pxor x2,		x3; \
	por x1,			x4; \
	pand x1,		x2; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	pxor x2,		x3;

/*
 * S4(x0, x1, x2, x3, x4): encryption S-box 4 in bitsliced form.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x1, x2, x3, x4), in that order; x0 carries no result.
 */
#define S4(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x4,		x0; \
	pxor x2,		x3; \
	por x4,			x2; \
	pxor x1,		x0; \
	pxor x3,		x4; \
	por x0,			x2; \
	pxor x1,		x2; \
	pand x0,		x1; \
	pxor x4,		x1; \
	pand x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x4; \
	por x1,			x3; \
	pxor RNOT,		x1; \
	pxor x0,		x3;

/*
 * S5(x0, x1, x2, x3, x4): encryption S-box 5 in bitsliced form.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x4, x0, x1, x3), in that order; x2 carries no result.
 */
#define S5(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x0,			x1; \
	pxor x1,		x2; \
	pxor RNOT,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	pand x4,		x1; \
	por x3,			x4; \
	pxor x0,		x4; \
	pand x3,		x0; \
	pxor x3,		x1; \
	pxor x2,		x3; \
	pxor x1,		x0; \
	pand x4,		x2; \
	pxor x2,		x1; \
	pand x0,		x2; \
	pxor x2,		x3;

/*
 * S6(x0, x1, x2, x3, x4): encryption S-box 6 in bitsliced form.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x2, x4, x1, x3), in that order; x0 carries no result.
 */
#define S6(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x3; \
	pxor x2,		x1; \
	pxor x0,		x2; \
	pand x3,		x0; \
	por x3,			x1; \
	pxor RNOT,		x4; \
	pxor x1,		x0; \
	pxor x2,		x1; \
	pxor x4,		x3; \
	pxor x0,		x4; \
	pand x0,		x2; \
	pxor x1,		x4; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x3; \
	pxor x2,		x1;

/*
 * S7(x0, x1, x2, x3, x4): encryption S-box 7 in bitsliced form.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x4, x2, x3, x0), in that order; x1 carries no result.
 */
#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT,		x1; \
	movdqa x1,		x4; \
	pxor RNOT,		x0; \
	pand x2,		x1; \
	pxor x3,		x1; \
	por x4,			x3; \
	pxor x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	por x1,			x0; \
	pand x0,		x2; \
	pxor x4,		x0; \
	pxor x3,		x4; \
	pand x0,		x3; \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pxor x1,		x3; \
	por x0,			x4; \
	pxor x1,		x4;

/*
 * SI0(x0, x1, x2, x3, x4): S-box 0 of the decryption direction, bitsliced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x2, x4, x1, x0), in that order; x3 carries no result.
 */
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pxor x0,		x1; \
	por x1,			x3; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	pand x1,		x0; \
	pxor x2,		x0; \
	pand x3,		x2; \
	pxor x4,		x3; \
	pxor x3,		x2; \
	pxor x3,		x1; \
	pand x0,		x3; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pxor x3,		x4;

/*
 * SI1(x0, x1, x2, x3, x4): S-box 1 of the decryption direction, bitsliced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x4, x1, x2, x3), in that order; x0 carries no result.
 */
#define SI1(x0, x1, x2, x3, x4) \
	pxor x3,		x1; \
	movdqa x0,		x4; \
	pxor x2,		x0; \
	pxor RNOT,		x2; \
	por x1,			x4; \
	pxor x3,		x4; \
	pand x1,		x3; \
	pxor x2,		x1; \
	pand x4,		x2; \
	pxor x1,		x4; \
	por x3,			x1; \
	pxor x0,		x3; \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x4,		x2; \
	pxor x0,		x1; \
	pxor x1,		x4;

/*
 * SI2(x0, x1, x2, x3, x4): S-box 2 of the decryption direction, bitsliced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x1, x4, x3, x2), in that order; x0 carries no result.
 */
#define SI2(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x3,		x4; \
	pxor RNOT,		x3; \
	por x2,			x3; \
	pxor x4,		x2; \
	pxor x0,		x4; \
	pxor x1,		x3; \
	por x2,			x1; \
	pxor x0,		x2; \
	pxor x4,		x1; \
	por x3,			x4; \
	pxor x3,		x2; \
	pxor x2,		x4; \
	pand x1,		x2; \
	pxor x3,		x2; \
	pxor x4,		x3; \
	pxor x0,		x4;

/*
 * SI3(x0, x1, x2, x3, x4): S-box 3 of the decryption direction, bitsliced.
 * This one needs no NOT, so RNOT is not referenced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 * Out: the callers in this file take the result words from
 *      (x2, x0, x4, x3), in that order; x1 carries no result.
 */
#define SI3(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x1,		x4; \
	pand x2,		x1; \
	pxor x0,		x1; \
	por x4,			x0; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	por x1,			x3; \
	pxor x2,		x1; \
	pxor x3,		x1; \
	pxor x2,		x0; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x1; \
	pand x2,		x0; \
	pxor x3,		x4; \
	pxor x0,		x3; \
	pxor x1,		x0;

/*
 * SI4(x0, x1, x2, x3, x4): S-box 4 of the decryption direction, bitsliced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x0, x2, x4, x3), in that order; x1 carries no result.
 */
#define SI4(x0, x1, x2, x3, x4) \
	pxor x3,		x2; \
	movdqa x0,		x4; \
	pand x1,		x0; \
	pxor x2,		x0; \
	por x3,			x2; \
	pxor RNOT,		x4; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pand x4,		x2; \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x3,		x0; \
	pand x2,		x3; \
	pxor x3,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x1,		x4; \
	pxor x3,		x0;

/*
 * SI5(x0, x1, x2, x3, x4): S-box 5 of the decryption direction, bitsliced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x1, x4, x0, x2), in that order; x3 carries no result.
 */
#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x2,			x1; \
	pxor x4,		x2; \
	pxor x3,		x1; \
	pand x4,		x3; \
	pxor x3,		x2; \
	por x0,			x3; \
	pxor RNOT,		x0; \
	pxor x2,		x3; \
	por x0,			x2; \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pand x0,		x4; \
	pxor x1,		x0; \
	pxor x3,		x1; \
	pand x2,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x4,		x2; \
	pxor x3,		x4;

/*
 * SI6(x0, x1, x2, x3, x4): S-box 6 of the decryption direction, bitsliced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x2, x4, x3, x0), in that order; x1 carries no result.
 */
#define SI6(x0, x1, x2, x3, x4) \
	pxor x2,		x0; \
	movdqa x0,		x4; \
	pand x3,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x1,		x3; \
	por x4,			x2; \
	pxor x3,		x2; \
	pand x0,		x3; \
	pxor RNOT,		x0; \
	pxor x1,		x3; \
	pand x2,		x1; \
	pxor x0,		x4; \
	pxor x4,		x3; \
	pxor x2,		x4; \
	pxor x1,		x0; \
	pxor x0,		x2;

/*
 * SI7(x0, x1, x2, x3, x4): S-box 7 of the decryption direction, bitsliced.
 *
 * In:  x0..x3 hold the four state words; x4 is a temporary (overwritten).
 *      RNOT must be all-ones.
 * Out: the callers in this file take the result words from
 *      (x1, x3, x0, x4), in that order; x2 carries no result.
 */
#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x2,		x0; \
	por x4,			x2; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	por x3,			x1; \
	pxor x0,		x4; \
	pand x2,		x0; \
	pxor x1,		x0; \
	pand x2,		x1; \
	pxor x2,		x3; \
	pxor x3,		x4; \
	pand x3,		x2; \
	por x0,			x3; \
	pxor x4,		x1; \
	pxor x4,		x3; \
	pand x0,		x4; \
	pxor x2,		x4;

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): in-place transpose of the 4x4
 * matrix of 32-bit words whose rows are x0..x3.
 *
 * With rows a, b, c, d on entry:
 *	x0 = [a0 b0 c0 d0]   x1 = [a1 b1 c1 d1]
 *	x2 = [a2 b2 c2 d2]   x3 = [a3 b3 c3 d3]   on exit.
 *
 * Dwords are interleaved pairwise first (punpck{l,h}dq), then the 64-bit
 * halves are combined (punpck{l,h}qdq).
 *
 * Clobbers t1 and t2.  t0 is accepted but not used by this macro.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0,		t2; \
	punpckldq x1,		x0; \
	punpckhdq x1,		t2; \
	movdqa x2,		t1; \
	punpckhdq x3,		x2; \
	punpckldq x3,		t1; \
	movdqa x0,		x1; \
	punpcklqdq t1,		x0; \
	punpckhqdq t1,		x1; \
	movdqa t2,		x3; \
	punpcklqdq x2,		t2; \
	punpckhqdq x2,		x3; \
	movdqa t2,		x2;

/*
 * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive 16-byte
 * blocks from the address in register "in" and transpose them, so that
 * afterwards xN holds 32-bit word N of all four blocks (one block per lane).
 *
 * Uses unaligned loads (movdqu), so "in" needs no particular alignment.
 * Clobbers t1 and t2 (through transpose_4x4).
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in),	x0; \
	movdqu (1*4*4)(in),	x1; \
	movdqu (2*4*4)(in),	x2; \
	movdqu (3*4*4)(in),	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 back from
 * word-sliced to block order and store the four 16-byte blocks at the address
 * in register "out", overwriting what was there.
 *
 * Uses unaligned stores (movdqu).  Clobbers t1 and t2 (through
 * transpose_4x4); x0..x3 are left holding the transposed data.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/*
 * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but each
 * 16-byte block is XORed with the data already present at "out" before being
 * stored back:  out[n] ^= block n.
 *
 * Uses unaligned loads/stores (movdqu).  Clobbers t0 (holds the loaded
 * destination data) as well as t1 and t2 (through transpose_4x4).
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out),	t0; \
	pxor t0,		x0; \
	movdqu x0,		(0*4*4)(out); \
	movdqu (1*4*4)(out),	t0; \
	pxor t0,		x1; \
	movdqu x1,		(1*4*4)(out); \
	movdqu (2*4*4)(out),	t0; \
	pxor t0,		x2; \
	movdqu x2,		(2*4*4)(out); \
	movdqu (3*4*4)(out),	t0; \
	pxor t0,		x3; \
	movdqu x3,		(3*4*4)(out);

/*
 * __serpent_enc_blk_4way: encrypt four 16-byte blocks in parallel.
 *
 * Reads 64 bytes from src, runs the rounds below on all four blocks at once
 * (one block per 32-bit lane), then either stores the 64-byte result to dst
 * or XORs it into the 64 bytes already at dst, depending on the xor argument.
 *
 * Subkeys are read from ctx as 32-bit words: round key i occupies words
 * 4*i .. 4*i+3, for i = 0..32.
 *
 * Writes %eax, %edx, %xmm0-%xmm7 and the flags; %esp is not modified.
 */
SYM_FUNC_START(__serpent_enc_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */

	/* RNOT = all-ones; the S-box macros use it for bitwise NOT. */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * Initial key mixing with key 0, then 32 rounds.  Round r applies
	 * S-box (r mod 8) and then LK with key r+1; the last round uses plain
	 * key mixing (K with key 32) in place of LK.  The register arguments
	 * are permuted from line to line to follow where each S-box leaves
	 * its results, so no register-to-register moves are needed.
	 */
					 K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);

	/* Result words are now in RA, RB, RC, RD (in that order). */
	movl arg_dst(%esp), %eax;

	/* Only the low byte of the xor argument is tested. */
	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	/* xor == 0: plain store of the four blocks. */
	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;

.L__enc_xor4:
	/* xor != 0: dst ^= result. */
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;
SYM_FUNC_END(__serpent_enc_blk_4way)

/*
 * serpent_dec_blk_4way: decrypt four 16-byte blocks in parallel.
 *
 * Reads 64 bytes from src, runs the rounds below on all four blocks at once
 * (one block per 32-bit lane) and stores the 64-byte result to dst.  Unlike
 * the encryption entry point there is no xor-into-destination mode.
 *
 * Subkeys are read from ctx as 32-bit words: round key i occupies words
 * 4*i .. 4*i+3, for i = 32 down to 0.
 *
 * Writes %eax, %edx, %xmm0-%xmm7; %esp is not modified.
 */
SYM_FUNC_START(serpent_dec_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */

	/* RNOT = all-ones; the S-box macros use it for bitwise NOT. */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * Mirror image of the encryption schedule: key mixing with key 32,
	 * then S-boxes SI7 down to SI0 (four passes), each followed by KL with
	 * the next lower key; the final SI0 is followed by plain key mixing
	 * with key 0.  Register arguments are permuted from line to line to
	 * follow where each S-box leaves its results.
	 */
					 K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);

	/* Result words are now in RC, RD, RB, RE (in that order). */
	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	RET;
SYM_FUNC_END(serpent_dec_blk_4way)