162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/***************************************************************************
362306a36Sopenharmony_ci*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
462306a36Sopenharmony_ci*                                                                         *
562306a36Sopenharmony_ci***************************************************************************/
662306a36Sopenharmony_ci
762306a36Sopenharmony_ci.file "twofish-x86_64-asm.S"
862306a36Sopenharmony_ci.text
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include <linux/linkage.h>
1162306a36Sopenharmony_ci#include <asm/asm-offsets.h>
1262306a36Sopenharmony_ci
/*
 * Byte offsets handed to the whitening macros as their third argument.
 * Only a_offset and c_offset are referenced in this file; each is used
 * with a 64-bit register, so one XOR covers a pair of 32-bit words.
 */
#define a_offset		0
#define b_offset		4
#define c_offset		8
#define d_offset		12
1762306a36Sopenharmony_ci
/*
 * Layout of the crypto context, as byte offsets from its start.
 * Four lookup tables of 256 words (1024 bytes) each come first,
 * followed by the whitening keys and then the round keys.
 */

#define s0		0	/* S0 table, 256 words */
#define s1		1024	/* S1 table */
#define s2		2048	/* S2 table */
#define s3		3072	/* S3 table */
#define w		4096	/* 8 whitening key words */
#define k		4128	/* key words 1-32 */
2662306a36Sopenharmony_ci
/*
 * Register aliases.  The round macros build the sub-register names by
 * token pasting (Rn ## D, Rn ## B, Rn ## H), so every alias needs a
 * 64-bit, 32-bit (D), low-byte (B) and high-byte (H) variant.
 */

/* R0: the %rax family */
#define R0		%rax
#define R0D		%eax
#define R0B		%al
#define R0H		%ah

/* R1: the %rbx family */
#define R1		%rbx
#define R1D		%ebx
#define R1B		%bl
#define R1H		%bh

/* R2: the %rcx family */
#define R2		%rcx
#define R2D		%ecx
#define R2B		%cl
#define R2H		%ch

/* R3: the %rdx family */
#define R3		%rdx
#define R3D		%edx
#define R3B		%dl
#define R3H		%dh
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_ci
/*
 * input_whitening(dst, ctx, disp)
 * XOR the whitening key stored at ctx + w + disp into register dst.
 * The operand width is taken from dst.
 */
#define input_whitening(dst,ctx,disp)\
	xor	w+disp(ctx),	dst;
5362306a36Sopenharmony_ci
/*
 * output_whitening(dst, ctx, disp)
 * XOR the whitening key stored at ctx + w + 16 + disp into register dst,
 * i.e. the key group that lies 16 bytes after the one used by
 * input_whitening.  The operand width is taken from dst.
 */
#define output_whitening(dst,ctx,disp)\
	xor	w+16+disp(ctx),	dst;
5762306a36Sopenharmony_ci
5862306a36Sopenharmony_ci
/*
 * encrypt_round(a, b, c, d, kofs) - one encryption round
 *
 * On entry:
 *   a     register alias holding a, rotated by 16
 *   b     register alias holding b
 *   c     register alias holding c
 *   d     register alias holding d, already rotated left by 1
 *   kofs  byte offset of this round's two key words inside k
 *   %r11  context address
 *
 * %r9d gathers four table lookups indexed by the bytes of a, %r8d four
 * lookups indexed by the bytes of b; both chains are interleaved to
 * increase performance.  Afterwards %r9d += %r8d, %r8d += %r9d, the two
 * round key words are added, %r9d is XORed into c and %r8d into d.
 *
 * On exit:
 *   a  has been rotated right by 16
 *   b  has been rotated right by 16 + 15 = 31, i.e. left by 1
 *   c  has been rotated left by 15 after the XOR
 *   d  keeps the XOR result unrotated
 *
 * Clobbers %rdi, %r8d, %r9d and the flags.
 */
#define encrypt_round(a,b,c,d,kofs)\
	movzbl	b ## B,			%edi;\
	movl	s1(%r11,%rdi,4),	%r8d;\
	movzbl	a ## B,			%edi;\
	movl	s2(%r11,%rdi,4),	%r9d;\
	movzbl	b ## H,			%edi;\
	rorl	$16,			b ## D;\
	xorl	s2(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	rorl	$16,			a ## D;\
	xorl	s3(%r11,%rdi,4),	%r9d;\
	movzbl	b ## B,			%edi;\
	xorl	s3(%r11,%rdi,4),	%r8d;\
	movzbl	a ## B,			%edi;\
	xorl	s0(%r11,%rdi,4),	%r9d;\
	movzbl	b ## H,			%edi;\
	rorl	$15,			b ## D;\
	xorl	s0(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	xorl	s1(%r11,%rdi,4),	%r9d;\
	addl	%r8d,			%r9d;\
	addl	%r9d,			%r8d;\
	addl	k+kofs(%r11),		%r9d;\
	xorl	%r9d,			c ## D;\
	roll	$15,			c ## D;\
	addl	k+4+kofs(%r11),		%r8d;\
	xorl	%r8d,			d ## D;
9362306a36Sopenharmony_ci
/*
 * encrypt_last_round(a, b, c, d, kofs) - final encryption round
 *
 * On entry:
 *   a     register alias holding a, rotated by 16
 *   b     register alias holding b
 *   c     register alias holding c
 *   d     register alias holding d, already rotated left by 1
 *   kofs  byte offset of this round's two key words inside k
 *   %r11  context address
 *
 * Same lookup and mixing sequence as encrypt_round, with the a and b
 * chains interleaved to increase performance.  In addition a and b are
 * prepared for the output whitening: %r10 is loaded with b in its upper
 * 32 bits and a (after a has been rotated right by 16) in its lower
 * 32 bits.
 *
 * On exit:
 *   %r10  b:a packed as described above
 *   c     has been rotated right by 1 after the XOR
 *   d     keeps the XOR result unrotated
 *   b     is left rotated right by 16; its value was copied to %r10
 *         before the rotation
 *
 * Clobbers %rdi, %r8d, %r9d, %r10 and the flags.
 */
#define encrypt_last_round(a,b,c,d,kofs)\
	movl	b ## D,			%r10d;\
	shlq	$32,			%r10;\
	movzbl	b ## B,			%edi;\
	movl	s1(%r11,%rdi,4),	%r8d;\
	movzbl	a ## B,			%edi;\
	movl	s2(%r11,%rdi,4),	%r9d;\
	movzbl	b ## H,			%edi;\
	rorl	$16,			b ## D;\
	xorl	s2(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	rorl	$16,			a ## D;\
	xorl	s3(%r11,%rdi,4),	%r9d;\
	movzbl	b ## B,			%edi;\
	xorl	s3(%r11,%rdi,4),	%r8d;\
	movzbl	a ## B,			%edi;\
	xorl	s0(%r11,%rdi,4),	%r9d;\
	xorq	a,			%r10;\
	movzbl	b ## H,			%edi;\
	xorl	s0(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	xorl	s1(%r11,%rdi,4),	%r9d;\
	addl	%r8d,			%r9d;\
	addl	%r9d,			%r8d;\
	addl	k+kofs(%r11),		%r9d;\
	xorl	%r9d,			c ## D;\
	rorl	$1,			c ## D;\
	addl	k+4+kofs(%r11),		%r8d;\
	xorl	%r8d,			d ## D;
13162306a36Sopenharmony_ci
/*
 * decrypt_round(a, b, c, d, kofs) - one decryption round
 *
 * On entry:
 *   a     register alias holding a
 *   b     register alias holding b, rotated by 16
 *   c     register alias holding c, already rotated left by 1
 *   d     register alias holding d
 *   kofs  byte offset of this round's two key words inside k
 *   %r11  context address
 *
 * %r9d gathers four table lookups indexed by the bytes of a, %r8d four
 * lookups indexed by the bytes of b; both chains are interleaved to
 * increase performance.  Afterwards %r9d += %r8d, %r8d += %r9d, the two
 * round key words are added, %r9d is XORed into c and %r8d into d.
 *
 * On exit:
 *   a  has been rotated right by 16 + 15 = 31, i.e. left by 1
 *   b  has been rotated right by 16
 *   c  keeps the XOR result unrotated
 *   d  has been rotated left by 15 after the XOR
 *
 * Clobbers %rdi, %r8d, %r9d and the flags.
 */
#define decrypt_round(a,b,c,d,kofs)\
	movzbl	a ## B,			%edi;\
	movl	s0(%r11,%rdi,4),	%r9d;\
	movzbl	b ## B,			%edi;\
	movl	s3(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	rorl	$16,			a ## D;\
	xorl	s1(%r11,%rdi,4),	%r9d;\
	movzbl	b ## H,			%edi;\
	rorl	$16,			b ## D;\
	xorl	s0(%r11,%rdi,4),	%r8d;\
	movzbl	a ## B,			%edi;\
	xorl	s2(%r11,%rdi,4),	%r9d;\
	movzbl	b ## B,			%edi;\
	xorl	s1(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	rorl	$15,			a ## D;\
	xorl	s3(%r11,%rdi,4),	%r9d;\
	movzbl	b ## H,			%edi;\
	xorl	s2(%r11,%rdi,4),	%r8d;\
	addl	%r8d,			%r9d;\
	addl	%r9d,			%r8d;\
	addl	k+kofs(%r11),		%r9d;\
	xorl	%r9d,			c ## D;\
	addl	k+4+kofs(%r11),		%r8d;\
	xorl	%r8d,			d ## D;\
	roll	$15,			d ## D;
16662306a36Sopenharmony_ci
/*
 * decrypt_last_round(a, b, c, d, kofs) - final decryption round
 *
 * On entry:
 *   a     register alias holding a
 *   b     register alias holding b, in the same rotated-by-16 form that
 *         decrypt_round expects (the preceding decrypt_round leaves it
 *         that way)
 *   c     register alias holding c, already rotated left by 1
 *   d     register alias holding d
 *   kofs  byte offset of this round's two key words inside k
 *   %r11  context address
 *
 * Same lookup and mixing sequence as decrypt_round, with the a and b
 * chains interleaved to increase performance.  In addition a and b are
 * prepared for the output whitening: once b has been rotated right by
 * 16, %r10 is loaded with b in its upper 32 bits and a (before a is
 * rotated) in its lower 32 bits.
 *
 * On exit:
 *   %r10  b:a packed as described above
 *   c     keeps the XOR result unrotated
 *   d     has been rotated right by 1 after the XOR
 *   a     is left rotated right by 16; its value was copied to %r10
 *         before the rotation
 *
 * Clobbers %rdi, %r8d, %r9d, %r10 and the flags.
 */
#define decrypt_last_round(a,b,c,d,kofs)\
	movzbl	a ## B,			%edi;\
	movl	s0(%r11,%rdi,4),	%r9d;\
	movzbl	b ## B,			%edi;\
	movl	s3(%r11,%rdi,4),	%r8d;\
	movzbl	b ## H,			%edi;\
	rorl	$16,			b ## D;\
	xorl	s0(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	movl	b ## D,			%r10d;\
	shlq	$32,			%r10;\
	xorq	a,			%r10;\
	rorl	$16,			a ## D;\
	xorl	s1(%r11,%rdi,4),	%r9d;\
	movzbl	b ## B,			%edi;\
	xorl	s1(%r11,%rdi,4),	%r8d;\
	movzbl	a ## B,			%edi;\
	xorl	s2(%r11,%rdi,4),	%r9d;\
	movzbl	b ## H,			%edi;\
	xorl	s2(%r11,%rdi,4),	%r8d;\
	movzbl	a ## H,			%edi;\
	xorl	s3(%r11,%rdi,4),	%r9d;\
	addl	%r8d,			%r9d;\
	addl	%r9d,			%r8d;\
	addl	k+kofs(%r11),		%r9d;\
	xorl	%r9d,			c ## D;\
	addl	k+4+kofs(%r11),		%r8d;\
	xorl	%r8d,			d ## D;\
	rorl	$1,			d ## D;
20462306a36Sopenharmony_ci
/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * In:   %rdi  ctx address
 *       %rsi  output address
 *       %rdx  input address
 * Out:  16 bytes stored at (%rsi); %eax = 1
 *
 * %rbx (R1) is pushed on entry and popped before returning.
 * %rax, %rcx, %rdx, %rdi, %r8-%r11 and the flags are overwritten.
 */
SYM_FUNC_START(twofish_enc_blk)
	pushq	R1

	/*
	 * The ctx address is moved to %r11 to free %rdi, a register that
	 * needs no REX prefix, as the target of the movzx operations that
	 * read the 8-bit high registers.
	 */
	movq	%rdi,	%r11

	/* fetch the block as two quadwords and whiten both halves */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)

	/* split into words: R0 = a rol 16, R1 = b, R2 = c, R3 = d rol 1 */
	movl	R1D,	R0D
	roll	$16,	R0D
	shrq	$32,	R1
	movl	R3D,	R2D
	shrq	$32,	R3
	roll	$1,	R3D

	/* 16 rounds; the register pairs trade places on every round */
	encrypt_round(R0,R1,R2,R3,0*8)
	encrypt_round(R2,R3,R0,R1,1*8)
	encrypt_round(R0,R1,R2,R3,2*8)
	encrypt_round(R2,R3,R0,R1,3*8)
	encrypt_round(R0,R1,R2,R3,4*8)
	encrypt_round(R2,R3,R0,R1,5*8)
	encrypt_round(R0,R1,R2,R3,6*8)
	encrypt_round(R2,R3,R0,R1,7*8)
	encrypt_round(R0,R1,R2,R3,8*8)
	encrypt_round(R2,R3,R0,R1,9*8)
	encrypt_round(R0,R1,R2,R3,10*8)
	encrypt_round(R2,R3,R0,R1,11*8)
	encrypt_round(R0,R1,R2,R3,12*8)
	encrypt_round(R2,R3,R0,R1,13*8)
	encrypt_round(R0,R1,R2,R3,14*8)
	encrypt_last_round(R2,R3,R0,R1,15*8)

	/* first output quadword: the pair the last round packed into %r10 */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output quadword: R1 in the upper half, R0 in the lower */
	shlq	$32,	R1
	xorq	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,	%eax
	RET
SYM_FUNC_END(twofish_enc_blk)
25762306a36Sopenharmony_ci
/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * In:   %rdi  ctx address
 *       %rsi  output address
 *       %rdx  input address
 * Out:  16 bytes stored at (%rsi); %eax = 1
 *
 * %rbx (R1) is pushed on entry and popped before returning.
 * %rax, %rcx, %rdx, %rdi, %r8-%r11 and the flags are overwritten.
 */
SYM_FUNC_START(twofish_dec_blk)
	pushq	R1

	/*
	 * The ctx address is moved to %r11 to free %rdi, a register that
	 * needs no REX prefix, as the target of the movzx operations that
	 * read the 8-bit high registers.
	 */
	movq	%rdi,	%r11

	/*
	 * Fetch the block as two quadwords.  Decryption starts by applying
	 * the output whitening keys.
	 */
	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)

	/* split into words: R0 = a, R1 = b rol 16, R2 = c rol 1, R3 = d */
	movl	R1D,	R0D
	shrq	$32,	R1
	roll	$16,	R1D
	movl	R3D,	R2D
	shrq	$32,	R3
	roll	$1,	R2D

	/* 16 rounds with the round keys taken in descending order */
	decrypt_round(R0,R1,R2,R3,15*8)
	decrypt_round(R2,R3,R0,R1,14*8)
	decrypt_round(R0,R1,R2,R3,13*8)
	decrypt_round(R2,R3,R0,R1,12*8)
	decrypt_round(R0,R1,R2,R3,11*8)
	decrypt_round(R2,R3,R0,R1,10*8)
	decrypt_round(R0,R1,R2,R3,9*8)
	decrypt_round(R2,R3,R0,R1,8*8)
	decrypt_round(R0,R1,R2,R3,7*8)
	decrypt_round(R2,R3,R0,R1,6*8)
	decrypt_round(R0,R1,R2,R3,5*8)
	decrypt_round(R2,R3,R0,R1,4*8)
	decrypt_round(R0,R1,R2,R3,3*8)
	decrypt_round(R2,R3,R0,R1,2*8)
	decrypt_round(R0,R1,R2,R3,1*8)
	decrypt_last_round(R2,R3,R0,R1,0*8)

	/* first output quadword: the pair the last round packed into %r10 */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output quadword: R1 in the upper half, R0 in the lower */
	shlq	$32,	R1
	xorq	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,	%eax
	RET
SYM_FUNC_END(twofish_dec_blk)
309