/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

/*
 * Twofish single-block encryption and decryption for x86-64.
 * GNU as, AT&T syntax; the file is run through the C preprocessor
 * (it relies on #include and on function-like #define macros).
 */
.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/*
 * Byte offsets of the four 32-bit words a, b, c and d inside a 16-byte
 * block.  They are meant to be passed as the "offset" argument of the
 * whitening macros.  The functions in this file whiten 64 bits at a time,
 * so they only pass a_offset and c_offset.
 */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/*
 * Structure of the crypto context struct, given as byte offsets from its
 * start.  The offsets are hard-coded here, so they have to stay in sync
 * with the C definition of the context (not visible in this file).
 *
 * Note: the round macros address the S0 array with a bare (%r11,%rdi,4),
 * i.e. they rely on s0 being 0 instead of using the s0 symbol.
 */

#define s0	0	/* S0 array, 256 32-bit words (as are S1..S3) */
#define s1	1024	/* S1 array */
#define s2	2048	/* S2 array */
#define s3	3072	/* S3 array */
#define w	4096	/* 8 whitening keys (32-bit words) */
#define k	4128	/* round keys 1-32 (32-bit words) */

/*
 * Register aliases to allow macro substitution.
 *
 * The round macros receive an alias stem (R0..R3) and paste a suffix onto
 * it:
 *	(none)	64-bit register
 *	D	32-bit register
 *	B	low byte (bits 0-7)
 *	H	second byte (bits 8-15)
 * All four stems map to registers that have such a second-byte form
 * (%ah, %bh, %ch, %dh).
 */

#define R0     %rax
#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1     %rbx
#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2     %rcx
#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3     %rdx
#define R3D    %edx
#define R3B    %dl
#define R3H    %dh


/*
 * performs input whitening: XORs src with the whitening key found at byte
 * "offset" of the w array in the context.
 *
 * src     register to whiten; the functions in this file pass 64-bit
 *         registers, so two 32-bit key words are applied at once
 * context register holding the context address
 * offset  byte offset into w (a_offset or c_offset in this file)
 *
 * Modifies src and the flags.
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/*
 * performs output whitening: XORs src with the whitening key found at byte
 * 16 + "offset" of the w array in the context, i.e. it uses the second
 * group of four 32-bit whitening words.
 *
 * src     register to whiten; the functions in this file pass 64-bit
 *         registers, so two 32-bit key words are applied at once
 * context register holding the context address
 * offset  byte offset (a_offset or c_offset in this file)
 *
 * Modifies src and the flags.
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;


/*
 * One encryption round.
 *
 * a, b, c and d are register alias stems (R0..R3); B, H and D are pasted
 * onto them to select the low byte, the second byte and the 32-bit
 * register.  round is the byte offset, relative to k, of the two 32-bit
 * round keys used by this round (K0 at k+round, K1 at k+round+4).
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 *
 * %r11 must hold the context address.  %r9d collects the XOR of the four
 * table lookups indexed by the bytes of a, %r8d the XOR of the four
 * lookups indexed by the bytes of b.  With Ta and Tb being those two
 * values the round computes
 *	c = rol(c ^ (Ta + Tb + K0), 15)
 *	d = d ^ (Ta + 2 x Tb + K1)
 * On exit a has been rotated right by 16 bits and b by 31 (16 + 15) bits,
 * so that (c, d, a, b) can be handed to the next round as its
 * (a, b, c, d) without further fix-ups.
 *
 * Clobbers %rdi, %r8, %r9 and the flags.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;

/*
 * Last encryption round.
 *
 * a, b, c and d are register alias stems (R0..R3); B, H and D are pasted
 * onto them to select the low byte, the second byte and the 32-bit
 * register.  round is the byte offset, relative to k, of the two 32-bit
 * round keys used by this round (K0 at k+round, K1 at k+round+4).
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * %r11 must hold the context address.  With Ta and Tb being the XOR of
 * the table lookups for a (in %r9d) and for b (in %r8d) the round computes
 *	c = ror(c ^ (Ta + Tb + K0), 1)
 *	d = d ^ (Ta + 2 x Tb + K1)
 * In addition %r10 is built for the output whitening: its upper half is b
 * as it was on entry, its lower half is a after the rotate right by 16.
 * The lower half is merged with a 64-bit xor of a, which is correct
 * because the preceding 32-bit rotate leaves the upper half of a zero.
 * On exit a and b have both been rotated right by 16 bits.
 *
 * Clobbers %rdi, %r8, %r9, %r10 and the flags.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * One decryption round.
 *
 * a, b, c and d are register alias stems (R0..R3); B, H and D are pasted
 * onto them to select the low byte, the second byte and the 32-bit
 * register.  round is the byte offset, relative to k, of the two 32-bit
 * round keys used by this round (K0 at k+round, K1 at k+round+4).
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 *
 * %r11 must hold the context address.  %r9d collects the XOR of the four
 * table lookups indexed by the bytes of a, %r8d the XOR of the four
 * lookups indexed by the bytes of b.  With Ta and Tb being those two
 * values the round computes
 *	c = c ^ (Ta + Tb + K0)
 *	d = rol(d ^ (Ta + 2 x Tb + K1), 15)
 * On exit a has been rotated right by 31 (16 + 15) bits and b by 16 bits,
 * so that (c, d, a, b) can be handed to the next round as its
 * (a, b, c, d) without further fix-ups.
 *
 * Clobbers %rdi, %r8, %r9 and the flags.
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * Last decryption round.
 *
 * a, b, c and d are register alias stems (R0..R3); B, H and D are pasted
 * onto them to select the low byte, the second byte and the 32-bit
 * register.  round is the byte offset, relative to k, of the two 32-bit
 * round keys used by this round (K0 at k+round, K1 at k+round+4).
 *
 * a input register containing a
 * b input register containing b (rotated 16, as in decrypt_round: the
 *   same tables are used in the same order and b is only saved for the
 *   output after it has been rotated right by 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * %r11 must hold the context address.  With Ta and Tb being the XOR of
 * the table lookups for a (in %r9d) and for b (in %r8d) the round computes
 *	c = c ^ (Ta + Tb + K0)
 *	d = ror(d ^ (Ta + 2 x Tb + K1), 1)
 * In addition %r10 is built for the final whitening: its upper half is b
 * after the rotate right by 16, its lower half is a as it was on entry.
 * The lower half is merged with a 64-bit xor of a, so the upper half of a
 * has to be zero on entry.
 * On exit a and b have both been rotated right by 16 bits.
 *
 * Clobbers %rdi, %r8, %r9, %r10 and the flags.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * In:	%rdi = ctx address
 *	%rsi = output address (16 bytes are written)
 *	%rdx = input address (16 bytes are read)
 * Out:	%eax = 1
 *
 * %rbx (R1) is saved on the stack and restored before returning.
 * %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11 and the flags are clobbered.
 */
SYM_FUNC_START(twofish_enc_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two quadwords and apply the input whitening */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	/*
	 * split into the four words in the form encrypt_round expects:
	 * R0 = a (rotated 16), R1 = b, R2 = c, R3 = d (rol $1)
	 */
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D

	/* 16 rounds; the (a,b) and (c,d) register pairs swap every round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);


	/* first output quadword: %r10 was packed by encrypt_last_round */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output quadword: R1 in the upper half, R0 in the lower */
	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)

/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * In:	%rdi = ctx address
 *	%rsi = output address (16 bytes are written)
 *	%rdx = input address (16 bytes are read)
 * Out:	%eax = 1
 *
 * %rbx (R1) is saved on the stack and restored before returning.
 * %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11 and the flags are clobbered.
 */
SYM_FUNC_START(twofish_dec_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/*
	 * load the block as two quadwords and undo the output whitening
	 * (decryption runs the cipher backwards, so it starts with the
	 * output whitening keys and ends with the input whitening keys)
	 */
	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	/*
	 * split into the four words in the form decrypt_round expects:
	 * R0 = a, R1 = b (rotated 16), R2 = c (rol $1), R3 = d
	 */
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	/* 16 rounds with the round keys taken in reverse order */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* first output quadword: %r10 was packed by decrypt_last_round */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output quadword: R1 in the upper half, R0 in the lower */
	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)