1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /***************************************************************************
3 *   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
4 *                                                                         *
5 ***************************************************************************/
6 
7 .file "twofish-x86_64-asm.S"
8 .text
9 
10 #include <linux/linkage.h>
11 #include <asm/asm-offsets.h>
12 
/*
 * Byte offsets of the four 32-bit block words a, b, c and d.
 * They are added to the whitening-key base by input_whitening() and
 * output_whitening().  The code in this file only passes a_offset and
 * c_offset, because whitening is applied with 64-bit operands that cover
 * two adjacent words at a time.
 */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12
17 
/*
 * Layout of the crypto context structure, as byte offsets from its start.
 * Every entry is a 32-bit word, so each 256-entry S-box table occupies
 * 1024 bytes and the 8 whitening words occupy 32 bytes.
 */

#define s0	0	/* S0 array, 256 words (s1..s3 have the same size) */
#define s1	1024	/* S1 array */
#define s2	2048	/* S2 array */
#define s3	3072	/* S3 array */
#define w	4096	/* 8 whitening keys (words) */
#define k	4128	/* round keys 1-32 (words), two consumed per round */
26 
/*
 * Register aliases used for macro substitution.
 *
 * The round macros receive an alias name (R0..R3) and paste a suffix onto
 * it to select the view they need:
 *   <none>  64-bit register
 *   D       low 32 bits
 *   B       bits 0-7
 *   H       bits 8-15
 * The four registers chosen are the ones that have an addressable
 * high-byte (H) form, which the table lookups rely on.
 */

#define R0     %rax
#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1     %rbx
#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2     %rcx
#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3     %rdx
#define R3D    %edx
#define R3B    %dl
#define R3H    %dh
48 
49 
/*
 * input_whitening(src, context, offset)
 *
 * XORs the whitening key material stored at w + offset inside the context
 * into src.  The operand width follows src; the callers in this file pass
 * 64-bit registers, so two adjacent 32-bit whitening words are applied by
 * a single instruction.
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;
53 
/*
 * output_whitening(src, context, offset)
 *
 * Performs output whitening: XORs the key material stored at
 * w + 16 + offset inside the context into src, i.e. it uses the second
 * half of the whitening array (the first 16 bytes belong to
 * input_whitening()).  The operand width follows src; the callers in this
 * file pass 64-bit registers.
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;
57 
58 
/*
 * encrypt_round(a, b, c, d, round) - one encryption round
 *
 * a      register alias (R0..R3) containing a, rotated by 16
 * b      register alias containing b
 * c      register alias containing c
 * d      register alias containing d (already rol $1)
 * round  byte offset, relative to k, of the two subkey words of this round
 *
 * %r11 must hold the context address.  %edi serves as the table index,
 * %r8d collects the four S-box lookups indexed by the bytes of b and %r9d
 * the four indexed by the bytes of a.  Operations on a and b are
 * interleaved to increase performance.  Each high-byte index is read
 * before the rotate that follows it, so the instruction order matters.
 *
 * The two lookup results are then mixed (%r9d += %r8d; %r8d += %r9d), the
 * subkeys are added, and the sums are XORed into c and d.
 *
 * State on exit, matching what the next round expects when it is invoked
 * with the register pairs swapped (c,d,a,b):
 *   a  rotated back by 16 (plain value)            -> next round's c
 *   b  rotated by 16 + 15 to the right (= rol $1)  -> next round's d
 *   c  (c ^ (%r9d + k[round])) rol $15, which is ror $1 combined with
 *      the 16-bit pre-rotation                     -> next round's a
 *   d  d ^ (%r8d + k[round + 4])                   -> next round's b
 *
 * Clobbers %rdi, %r8d, %r9d and the flags.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;
93 
/*
 * encrypt_last_round(a, b, c, d, round) - final encryption round
 *
 * a      register alias (R0..R3) containing a, rotated by 16
 * b      register alias containing b
 * c      register alias containing c
 * d      register alias containing d (already rol $1)
 * round  byte offset, relative to k, of the two subkey words of this round
 *
 * Same lookup and mixing sequence as encrypt_round(), with %r11 holding
 * the context address and the operations on a and b interleaved to
 * increase performance.  The differences are that a and b are prepared
 * for the output whitening during the round, and that no pre-rotation
 * for a following round is applied:
 *
 *   %r10  (b << 32) | a, built from b as it was on entry and from a
 *         after it has been rotated back by 16.  The 64-bit XOR of a is
 *         safe because the preceding 32-bit rotate leaves the upper half
 *         of the register zero.
 *   c     (c ^ (%r9d + k[round])) ror $1
 *   d     d ^ (%r8d + k[round + 4])
 *   b     is left rotated by 16 and must not be used afterwards
 *
 * Clobbers %rdi, %r8d, %r9d, %r10 and the flags.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D
131 
/*
 * decrypt_round(a, b, c, d, round) - one decryption round
 *
 * a      register alias (R0..R3) containing a
 * b      register alias containing b, rotated by 16
 * c      register alias containing c (already rol $1)
 * d      register alias containing d
 * round  byte offset, relative to k, of the two subkey words of this round
 *
 * %r11 must hold the context address.  %edi serves as the table index,
 * %r9d collects the four S-box lookups indexed by the bytes of a and %r8d
 * the four indexed by the bytes of b.  Operations on a and b are
 * interleaved to increase performance.  Each high-byte index is read
 * before the rotate that follows it, so the instruction order matters.
 *
 * The two lookup results are then mixed (%r9d += %r8d; %r8d += %r9d), the
 * subkeys are added, and the sums are XORed into c and d.
 *
 * State on exit, matching what the next round expects when it is invoked
 * with the register pairs swapped (c,d,a,b):
 *   a  rotated by 16 + 15 to the right (= rol $1)  -> next round's c
 *   b  rotated back by 16 (plain value)            -> next round's d
 *   c  c ^ (%r9d + k[round])                       -> next round's a
 *   d  (d ^ (%r8d + k[round + 4])) rol $15, which is ror $1 combined
 *      with the 16-bit pre-rotation                -> next round's b
 *
 * Clobbers %rdi, %r8d, %r9d and the flags.
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;
166 
/*
 * decrypt_last_round(a, b, c, d, round) - final decryption round
 *
 * a      register alias (R0..R3) containing a
 * b      register alias containing b, rotated by 16 (the table lookups
 *        index b exactly as decrypt_round() does)
 * c      register alias containing c (already rol $1)
 * d      register alias containing d
 * round  byte offset, relative to k, of the two subkey words of this round
 *
 * Same lookup and mixing sequence as decrypt_round(), with %r11 holding
 * the context address and the operations on a and b interleaved to
 * increase performance.  The differences are that a and b are prepared
 * for the final whitening during the round, and that no pre-rotation for
 * a following round is applied:
 *
 *   %r10  (b << 32) | a, taken after b has been rotated back by 16 and
 *         before a is rotated.  The 64-bit XOR of a relies on the upper
 *         half of that register being zero, which holds when it was last
 *         written by a 32-bit operation, as the round macros do.
 *   c     c ^ (%r9d + k[round])
 *   d     (d ^ (%r8d + k[round + 4])) ror $1
 *   a     is left rotated by 16 and must not be used afterwards
 *
 * Clobbers %rdi, %r8d, %r9d, %r10 and the flags.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;
204 
/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * In:   %rdi = ctx address
 *       %rsi = output address
 *       %rdx = input address
 * Out:  %eax = 1
 *       16 bytes written to the output address
 * Uses: %rax, %rcx, %rdx, %rdi, %r8-%r11 and the flags as scratch.
 *       %rbx (R1) is pushed on entry and popped before returning.
 */
SYM_FUNC_START(twofish_enc_blk)
	pushq	R1

	/*
	 * The ctx address is moved to %r11 to free %rdi: a non-REX register
	 * is needed as the target of the 8-bit high-register (%ah..%dh)
	 * moves in the round macros.
	 */
	mov	%rdi,		%r11

	/*
	 * Load the block as two 64-bit halves.  R3 is %rdx, the input
	 * pointer, so it must be overwritten last.
	 */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)

	/*
	 * Split into the four words in the form encrypt_round() expects:
	 * R0D = a rotated by 16, R1D = b, R2D = c, R3D = d rotated left by 1.
	 */
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D

	/* 16 rounds; the register pairs swap roles after every round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	/* first output half: the last round left (R3 << 32) | R2 in %r10 */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output half: pack (R1 << 32) | R0 */
	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)
257 
/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * In:   %rdi = ctx address
 *       %rsi = output address
 *       %rdx = input address
 * Out:  %eax = 1
 *       16 bytes written to the output address
 * Uses: %rax, %rcx, %rdx, %rdi, %r8-%r11 and the flags as scratch.
 *       %rbx (R1) is pushed on entry and popped before returning.
 */
SYM_FUNC_START(twofish_dec_blk)
	pushq	R1

	/*
	 * The ctx address is moved to %r11 to free %rdi: a non-REX register
	 * is needed as the target of the 8-bit high-register (%ah..%dh)
	 * moves in the round macros.
	 */
	mov	%rdi,		%r11

	/*
	 * Load the block as two 64-bit halves.  R3 is %rdx, the input
	 * pointer, so it must be overwritten last.  Decryption starts by
	 * undoing the output whitening.
	 */
	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)

	/*
	 * Split into the four words in the form decrypt_round() expects:
	 * R0D = a, R1D = b rotated by 16, R2D = c rotated left by 1, R3D = d.
	 */
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	/*
	 * 16 rounds with the subkeys taken in reverse order; the register
	 * pairs swap roles after every round.
	 */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* first output half: the last round left (R3 << 32) | R2 in %r10 */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output half: pack (R1 << 32) | R0 */
	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)
309