162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Blowfish Cipher Algorithm (x86_64)
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci
862306a36Sopenharmony_ci#include <linux/linkage.h>
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci.file "blowfish-x86_64-asm.S"
1162306a36Sopenharmony_ci.text
1262306a36Sopenharmony_ci
/*
 * Byte offsets into the crypto context addressed through CTX:
 * (16 + 2) 32-bit round-key words at p, directly followed by the four
 * lookup tables s0..s3 of 256 32-bit entries each.
 */
#define BF_PARRAY_BYTES	((16 + 2) * 4)
#define BF_SBOX_BYTES	(256 * 4)

#define p	0
#define s0	(p + BF_PARRAY_BYTES)
#define s1	(s0 + BF_SBOX_BYTES)
#define s2	(s1 + BF_SBOX_BYTES)
#define s3	(s2 + BF_SBOX_BYTES)
1962306a36Sopenharmony_ci
/*
 * Register allocation.
 *
 * NOTE: RIO and RT1 are both %rsi.  F() and F4() overwrite RT1, so the I/O
 * pointer is only usable before the first round and has to be reloaded
 * before the result is stored.  RT0 is %rdi, the register the ctx argument
 * arrives in, which is why the entry points copy it to CTX first.
 */
#define CTX	%r12	/* base register for the p / s0..s3 offsets */
#define RIO	%rsi	/* pointer used by the read/write/xor block macros */
#define RKEY	%r10	/* preloaded round-key pair of the 4-way code */

/* Block registers: 64-bit, 32-bit, low-byte and second-byte views. */
#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/* Temporaries: 64-bit and 32-bit views. */
#define RT0	%rdi
#define RT0d	%edi

#define RT1	%rsi
#define RT1d	%esi

#define RT2	%r8
#define RT2d	%r8d

#define RT3	%r9
#define RT3d	%r9d
5562306a36Sopenharmony_ci
5662306a36Sopenharmony_ci/***********************************************************************
5762306a36Sopenharmony_ci * 1-way blowfish
5862306a36Sopenharmony_ci ***********************************************************************/
/*
 * F(): one Feistel round on the block held in RX0.
 *
 * With a..d being the bytes of the low 32 bits of RX0 (a = most
 * significant), computes in 32-bit arithmetic
 *
 *	f = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 *
 * then swaps the two 32-bit halves of RX0 and xors f into the new low half.
 *
 * Clobbers RT0, RT1 and RT2.  RT1 is the same register as RIO, so RIO must
 * be reloaded before it is used again.
 */

/* RT0d = s0[a] + s1[b]; RX0 is rotated back to its original value. */
#define F_s0_s1() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d;

/* RT0d = (RT0d ^ s2[c]) + s3[d]; the halves of RX0 are swapped. */
#define F_s2_s3() \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d;

#define F() \
	F_s0_s1() \
	F_s2_s3() \
	xorq RT0,		RX0;
7262306a36Sopenharmony_ci
/*
 * add_roundkey_enc(n): xor the 64-bit value at p + 4 * n into RX0, i.e.
 * round-key word n into the low half and word n + 1 into the high half.
 */
#define add_roundkey_enc(n) \
	xorq (p + 4 * (n))(CTX),	RX0;

/*
 * round_enc(n): add round-key words n and n + 1, then run two F() rounds.
 * Clobbers whatever F() clobbers.
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(); \
	F();
8162306a36Sopenharmony_ci
/*
 * add_roundkey_dec(n): load round-key words n - 1 and n as one 64-bit value,
 * swap its halves and xor it into RX0, i.e. word n goes into the low half
 * and word n - 1 into the high half.  Clobbers RT0.
 *
 * The argument is parenthesised so that an expression argument cannot change
 * the offset arithmetic (same form as preload_roundkey_dec()).
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * round_dec(n): add round-key words n and n - 1, then run two F() rounds.
 * Clobbers whatever add_roundkey_dec() and F() clobber.
 *
 * The last line deliberately has no continuation backslash: the macro must
 * not depend on a blank line following it.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
9262306a36Sopenharmony_ci
/*
 * read_block(): load the 8 bytes at (RIO) into RX0 so that bytes 0..3 form
 * a big-endian word in the low half and bytes 4..7 a big-endian word in the
 * high half.
 */
#define read_block() \
	movq (RIO),	RX0; \
	rorq $32,	RX0; \
	bswapq		RX0;

/*
 * write_block(): store RX0 to the 8 bytes at (RIO), high half first, each
 * half as a big-endian word.  RX0 is left byte-swapped.
 */
#define write_block() \
	bswapq		RX0; \
	movq RX0,	(RIO);
10162306a36Sopenharmony_ci
SYM_FUNC_START(blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Runs the eight round_enc() steps plus the final key addition on
	 * the 8-byte block at src and stores the result at dst.
	 */

	/* F() clobbers %rsi (RT1 is RIO): keep dst in %r10 until the store. */
	movq %rsi, %r10;
	movq %rdx, RIO;

	/* CTX is callee-saved %r12: park the caller's value in %r11. */
	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r10, RIO;
	write_block();

	movq %r11, %r12;
	RET;
SYM_FUNC_END(blowfish_enc_blk)
13262306a36Sopenharmony_ci
SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Runs the eight round_dec() steps plus the final key addition on
	 * the 8-byte block at src and stores the result at dst.
	 */

	/* F() clobbers %rsi (RT1 is RIO): keep dst in %r10 until the store. */
	movq %rsi, %r10;
	movq %rdx, RIO;

	/* CTX is callee-saved %r12: park the caller's value in %r11. */
	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	/* CTX is not needed any more; give %r12 back before the store. */
	movq %r11, %r12;

	movq %r10, RIO;
	write_block();

	RET;
SYM_FUNC_END(blowfish_dec_blk)
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_ci/**********************************************************************
16662306a36Sopenharmony_ci  4-way blowfish, four blocks parallel
16762306a36Sopenharmony_ci **********************************************************************/
16862306a36Sopenharmony_ci
/*
 * F4(x): the F() round for the 4-way code, applied to block register x
 * (one of RX0..RX3; the byte views are formed by pasting "bl"/"bh" onto
 * the macro name, so x must be passed as that name).
 *
 * With a..d being the bytes of the low 32 bits of x (a = most significant),
 * computes ((s0[a] + s1[b]) ^ s2[c]) + s3[d] in 32-bit arithmetic, swaps the
 * halves of x (two 16-bit rotates) and xors the result into the new low
 * half.  Clobbers RT0..RT3; RT1 is the same register as RIO.
 *
 * Slower than F() when used alone (1-way), but faster when used in
 * parallel (4-way); tested on AMD Phenom II & Intel Xeon E7330.  The
 * instruction order is kept as tuned.
 */
#define F4(x) \
	movzbl x ## bh,			RT1d; \
	movzbl x ## bl,			RT3d; \
	rorq $16,			x; \
	movzbl x ## bh,			RT0d; \
	movzbl x ## bl,			RT2d; \
	rorq $16,			x; \
	movl s0(CTX,RT0,4),		RT0d; \
	addl s1(CTX,RT2,4),		RT0d; \
	xorl s2(CTX,RT1,4),		RT0d; \
	addl s3(CTX,RT3,4),		RT0d; \
	xorq RT0,			x;
18462306a36Sopenharmony_ci
/*
 * 4-way round-key handling.  The key pair for the next step is always
 * loaded into RKEY one step ahead: each add_roundkey_*4(n) first xors the
 * pair already sitting in RKEY into all four blocks and then preloads the
 * pair for the following step.  The caller preloads the first pair and
 * applies the last one with add_preloaded_roundkey4().
 */

/* xor the preloaded key pair in RKEY into RX0..RX3. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* One F4() round on each of the four blocks. */
#define F4_all_blocks() \
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = round-key words n (low half) and n + 1 (high half). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Key addition followed by two F4() rounds on every block. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4_all_blocks(); \
	F4_all_blocks();

/* RKEY = round-key words n (low half) and n - 1 (high half). */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Key addition followed by two F4() rounds on every block. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4_all_blocks(); \
	F4_all_blocks();
23162306a36Sopenharmony_ci
/*
 * Load the 8 bytes at offset(RIO) into x: bytes 0..3 become a big-endian
 * word in the low half, bytes 4..7 a big-endian word in the high half.
 */
#define read_block_at(offset, x) \
	movq offset(RIO),	x; \
	rorq $32,		x; \
	bswapq			x;

/*
 * Store x to the 8 bytes at offset(RIO), high half first, each half as a
 * big-endian word.  x is left byte-swapped.
 */
#define write_block_at(offset, x) \
	bswapq			x; \
	movq x,			offset(RIO);

/* Load four consecutive 8-byte blocks at RIO into RX0..RX3. */
#define read_block4() \
	read_block_at(0,  RX0); \
	read_block_at(8,  RX1); \
	read_block_at(16, RX2); \
	read_block_at(24, RX3);

/* Store RX0..RX3 as four consecutive 8-byte blocks at RIO. */
#define write_block4() \
	write_block_at(0,  RX0); \
	write_block_at(8,  RX1); \
	write_block_at(16, RX2); \
	write_block_at(24, RX3);
26162306a36Sopenharmony_ci
/*
 * Load the 8 bytes at offset(RIO) into tmp, byte-swap them and xor them
 * into x.  Because write_block4() byte-swaps x again before storing, the
 * stored block ends up xored with the raw bytes read here.
 */
#define xor_block_at(offset, tmp, x) \
	movq offset(RIO),	tmp; \
	bswapq			tmp; \
	xorq tmp,		x;

/*
 * xor_block4(): xor the first three 8-byte blocks at RIO into RX1..RX3
 * (block i into RX(i+1)); RX0 is not touched.  Clobbers RT0, RT2 and RT3.
 */
#define xor_block4() \
	xor_block_at(0,  RT0, RX1); \
	xor_block_at(8,  RT2, RX2); \
	xor_block_at(16, RT3, RX3);
27462306a36Sopenharmony_ci
SYM_FUNC_START(blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Processes the four consecutive 8-byte blocks at src in parallel
	 * and stores the four results at dst.
	 */

	/* CTX (%r12) and RX1 (%rbx) are callee-saved. */
	pushq %r12;
	pushq %rbx;

	/* F4() clobbers %rsi (RT1 is RIO): keep dst in %r11 until the store. */
	movq %rsi, %r11;
	movq %rdx, RIO;
	movq %rdi, CTX;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	/* round_enc4(14) left round-key words 16 and 17 in RKEY. */
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(blowfish_enc_blk_4way)
30962306a36Sopenharmony_ci
SYM_FUNC_START(__blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: cbc (bool)
	 *
	 * Processes the four consecutive 8-byte blocks at src in parallel
	 * and stores the four results at dst.  If cbc is true, result blocks
	 * 1..3 are additionally xored with source blocks 0..2 before being
	 * stored; result block 0 is stored as is.
	 */

	/* CTX (%r12) and RX1 (%rbx) are callee-saved. */
	pushq %r12;
	pushq %rbx;
	/* %rcx and %rdx become RX2/RX3: keep cbc and src on the stack. */
	pushq %rcx;
	pushq %rdx;

	movq %rdi, CTX;
	/* F4() clobbers %rsi (RT1 is RIO): keep dst in %r11 until the store. */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	/* round_dec4(3) left round-key words 1 and 0 in RKEY. */
	add_preloaded_roundkey4();

	popq RIO;		/* src, for xor_block4() */
	popq %r12;		/* cbc; CTX is not used past this point */

	/*
	 * Only the low 8 bits of a bool argument are defined by the calling
	 * convention, so test just the low byte instead of all 64 bits.
	 */
	testb %r12b, %r12b;
	jz .L_no_cbc_xor;

	xor_block4();

.L_no_cbc_xor:
	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)
355