/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

.file "twofish-i586-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/* return address at 0 */

#define in_blk    12  /* input byte array address parameter */
#define out_blk   8   /* output byte array address parameter */
#define ctx       4   /* Twofish context structure */
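
/*
 * cdecl stack layout: the return address sits at 0(%esp), so the three
 * arguments live at 4, 8 and 12.  The function bodies below read them as
 * ctx+16(%esp) etc. because four registers are pushed before the first
 * access.
 */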

#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context */

#define s0	0	/* S0 array, 256 words */
#define s1	1024	/* S1 array, 256 words */
#define s2	2048	/* S2 array, 256 words */
#define s3	3072	/* S3 array, 256 words */
#define w	4096	/* 8 whitening keys (words) */
#define k	4128	/* round subkeys 1-32 (words) */
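
/*
 * Layout arithmetic: each S-box array holds 256 32-bit words (1024 bytes),
 * putting s1/s2/s3 at 1024, 2048 and 3072; the eight whitening words follow
 * at 4096 and the 32 round subkeys at 4128.  The "round" argument of the
 * round macros below is a byte offset into k: round r passes r*8, which
 * selects the subkey pair at k+round and k+4+round.
 */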

/* define a few register aliases to allow macro substitution */

#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3D    %edx
#define R3B    %dl
#define R3H    %dh
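
/*
 * The round macros receive the R0..R3 stems and pick the access size by
 * pasting a suffix with the preprocessor's ## operator: ## D gives the
 * 32-bit register, ## B its low byte and ## H its high byte, letting each
 * word be taken apart byte by byte for the S-box lookups.
 */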


/* performs input whitening */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;
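
/*
 * The whitening key block is split in two: input whitening uses the four
 * words at w+0..w+12, output whitening the four at w+16..w+28, with the
 * offset argument selecting the word for each quarter of the block.
 */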

/*
 * a: input register containing a (rotated 16)
 * b: input register containing b
 * c: input register containing c
 * d: input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 */
#define encrypt_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;
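
/*
 * Sketch of what one round does: the lookups indexed by a's bytes
 * accumulate g(a) in %esi, those indexed by b's bytes accumulate
 * g(b <<< 8) in d (the tables hold the key-dependent S-boxes already
 * multiplied through the MDS matrix); the two adds form the
 * pseudo-Hadamard transform, the subkey pair at k+round is added, and
 * the results are folded into c and d.  The $15 rotations are $16 and
 * $1 rotations combined: they leave b rotated left 1 and c rotated 16,
 * exactly the orientation the next round expects of its d and a inputs.
 */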

/*
 * a: input register containing a (rotated 16)
 * b: input register containing b
 * c: input register containing c
 * d: input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 */
#define encrypt_last_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;
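
/*
 * The only differences from encrypt_round: b takes a second ror $16 (a
 * full 32 bits in total) and c a plain ror $1, so every word leaves the
 * last round in its natural orientation, ready for output whitening.
 */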

/*
 * a: input register containing a
 * b: input register containing b (rotated 16)
 * c: input register containing c (already rol $1)
 * d: input register containing d
 * operations on a and b are interleaved to increase performance
 */
#define decrypt_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	rol	$15,		d ## D;
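
/*
 * The inverse round: here c accumulates g(a) and %esi accumulates
 * g(b <<< 8), and after the pseudo-Hadamard transform and key addition
 * c becomes (c <<< 1) ^ F0 and d becomes (d ^ F1) >>> 1, undoing one
 * encryption round.  The same $15 rotation trick keeps a and d
 * pre-rotated for their roles as the next round's c and b inputs.
 */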

/*
 * a: input register containing a
 * b: input register containing b (rotated 16)
 * c: input register containing c (already rol $1)
 * d: input register containing d
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 */
#define decrypt_last_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	ror	$1,		d ## D;

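/*
 * Both entry points follow the cdecl layout described at the top of the
 * file.  The C side is expected to declare them along the lines of
 *
 *	asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src);
 *
 * (a sketch, not the authoritative prototype; see the C glue code).  The
 * constant 1 left in %eax on return is not consumed by such callers.
 */
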
SYM_FUNC_START(twofish_enc_blk)
	push	%ebp			/* save registers according to calling convention */
	push    %ebx
	push    %esi
	push    %edi

	mov	ctx + 16(%esp),	%ebp	/* repurpose the frame pointer: %ebp now
					 * holds the ctx address */
	mov     in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	ror	$16,	%eax
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$1,	%edx
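	/*
	 * At this point a (%eax) is rotated 16 and d (%edx) is rol $1,
	 * exactly the input orientation encrypt_round documents above.
	 */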

	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,1*8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

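	/*
	 * The undo of the final swap is folded into the stores: the words
	 * currently in a/b (%eax/%ebx) are whitened with the c/d keys and
	 * written to the c/d slots of the output block, and vice versa.
	 */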
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax	/* return 1 */
	RET
SYM_FUNC_END(twofish_enc_blk)

SYM_FUNC_START(twofish_dec_blk)
	push	%ebp			/* save registers according to calling convention */
	push    %ebx
	push    %esi
	push    %edi

	mov	ctx + 16(%esp),	%ebp	/* repurpose the frame pointer: %ebp now
					 * holds the ctx address */
	mov     in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	ror	$16,	%ebx
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	rol	$1,	%ecx
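	/*
	 * Decryption peels off the output whitening first; b (%ebx) is now
	 * rotated 16 and c (%ecx) rol $1, matching the input orientation
	 * decrypt_round documents above.
	 */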

	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

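	/* the same folded undo-swap as at the end of twofish_enc_blk */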
	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax	/* return 1 */
	RET
SYM_FUNC_END(twofish_dec_blk)