1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Twofish Cipher 3-way parallel algorithm (x86_64)
4  *
5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6  */
7 
8 #include <linux/linkage.h>
9 
10 .file "twofish-x86_64-asm-3way.S"
11 .text
12 
/*
 * Byte offsets into the crypto context addressed through CTX.
 *
 * s0..s3 lie 1024 bytes apart and are indexed with a byte value scaled
 * by 4 (see do16bit_ror).  w is XORed into the data on input and output
 * (inpack3/outunpack3); k is added at the end of every round
 * (enc_round_end/dec_round_end).
 */
#define s0	0x0000
#define s1	0x0400
#define s2	0x0800
#define s3	0x0c00
#define w	0x1000
#define k	0x1020
20 
21 /**********************************************************************
22   3-way twofish
23  **********************************************************************/
/* Pointer registers */
#define CTX	%rdi		/* crypto context base */
#define RIO	%rdx		/* I/O pointer: src on entry, dst before output */

/*
 * Per-lane working registers, each with its 32-bit view, low byte (bl)
 * and second byte (bh).  The byte views are needed by do16bit_ror.
 */
#define RAB0	%rax
#define RAB0d	%eax
#define RAB0bl	%al
#define RAB0bh	%ah

#define RAB1	%rbx
#define RAB1d	%ebx
#define RAB1bl	%bl
#define RAB1bh	%bh

#define RAB2	%rcx
#define RAB2d	%ecx
#define RAB2bl	%cl
#define RAB2bh	%ch

/*
 * Stack slots holding the other half of each lane during the rounds.
 * The layout matches what push_cd() leaves on the stack.
 */
#define CD0	0x0(%rsp)
#define CD1	0x8(%rsp)
#define CD2	0x10(%rsp)

/* used only before/after all rounds */
#define RCD0	%r8
#define RCD1	%r9
#define RCD2	%r10

/* used only during rounds (same registers as RCD0..RCD2) */
#define RX0	%r8
#define RX0d	%r8d
#define RX1	%r9
#define RX1d	%r9d
#define RX2	%r10
#define RX2d	%r10d

#define RY0	%r11
#define RY0d	%r11d
#define RY1	%r12
#define RY1d	%r12d
#define RY2	%r13
#define RY2d	%r13d

/*
 * Scratch registers.  RT0 is the same register as RIO, so the I/O
 * pointer does not survive the rounds.
 */
#define RT0	%rdx
#define RT0d	%edx
#define RT1	%rsi
#define RT1d	%esi
#define RT1bl	%sil
76 
/*
 * do16bit_ror: use the two lowest bytes of `src` as indices into two
 * context tables and combine the 32-bit entries into `out`, rotating
 * `src` right by `nbits` along the way.
 *
 *   nbits            - rotate count applied to src (64-bit rotate)
 *   insn_lo, insn_hi - mnemonic stems (an "l" suffix is appended) applied
 *                      for the low-byte and second-byte lookups
 *   tbl_lo, tbl_hi   - table offsets inside the context
 *   idx_hi, idx_lo   - registers receiving the second / low byte index
 *   src              - register-alias name; its bl/bh/64-bit views are used
 *   out              - register-alias name; its 32-bit view is the target
 *
 * idx_lo may name the same register as out: it is consumed as an index
 * by the first lookup, which is also the first write to out.
 */
#define do16bit_ror(nbits, insn_lo, insn_hi, tbl_lo, tbl_hi, idx_hi, idx_lo, src, out) \
	movzbl src ## bl,			idx_lo ## d; \
	movzbl src ## bh,			idx_hi ## d; \
	rorq $(nbits),				src; \
	insn_lo ## l tbl_lo(CTX, idx_lo, 4),	out ## d; \
	insn_hi ## l tbl_hi(CTX, idx_hi, 4),	out ## d;
83 
/*
 * swap_ab_with_cd: exchange `reg` with `slot`, going through `scratch`.
 * The previous content of `scratch` is lost.
 */
#define swap_ab_with_cd(reg, slot, scratch) \
	movq slot,	scratch;	/* scratch = old slot */ \
	movq reg,	slot;		/* slot    = old reg  */ \
	movq scratch,	reg;		/* reg     = old slot */
88 
/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 */
#define g1g2_3(regs, slots, xt0, xt1, xt2, xt3, yt0, yt1, yt2, yt3, xo, yo) \
	/* \
	 * Pass 1 (all three lanes): the first lookup of each pair is a \
	 * mov, so xo<N>/yo<N> are initialised here.  The output register \
	 * doubles as the low-byte index temporary. \
	 */ \
	do16bit_ror(32, mov, xor, xt0, xt1, RT0, xo ## 0, regs ## 0, xo ## 0); \
	do16bit_ror(48, mov, xor, yt1, yt2, RT0, yo ## 0, regs ## 0, yo ## 0); \
	\
	do16bit_ror(32, mov, xor, xt0, xt1, RT0, xo ## 1, regs ## 1, xo ## 1); \
	do16bit_ror(48, mov, xor, yt1, yt2, RT0, yo ## 1, regs ## 1, yo ## 1); \
	\
	do16bit_ror(32, mov, xor, xt0, xt1, RT0, xo ## 2, regs ## 2, xo ## 2); \
	do16bit_ror(48, mov, xor, yt1, yt2, RT0, yo ## 2, regs ## 2, yo ## 2); \
	\
	/* \
	 * Pass 2: xor in the remaining lookups.  Per lane the rotates add \
	 * up to 32+48+32+16 = 128 bits, so regs<N> holds its original \
	 * value again when it is exchanged with its stack slot. \
	 */ \
	do16bit_ror(32, xor, xor, xt2, xt3, RT0, RT1, regs ## 0, xo ## 0); \
	do16bit_ror(16, xor, xor, yt3, yt0, RT0, RT1, regs ## 0, yo ## 0); \
	swap_ab_with_cd(regs ## 0, slots ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, xt2, xt3, RT0, RT1, regs ## 1, xo ## 1); \
	do16bit_ror(16, xor, xor, yt3, yt0, RT0, RT1, regs ## 1, yo ## 1); \
	swap_ab_with_cd(regs ## 1, slots ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, xt2, xt3, RT0, RT1, regs ## 2, xo ## 2); \
	do16bit_ror(16, xor, xor, yt3, yt0, RT0, RT1, regs ## 2, yo ## 2); \
	swap_ab_with_cd(regs ## 2, slots ## 2, RT0);
116 
/*
 * enc_round_end: mix the two 32-bit values fx/fy and the context words
 * k[2*rnd], k[2*rnd+1] into the 64-bit register `blk`.
 *
 *   new low  half = ror32((fx + fy   + k[2*rnd])   ^ low half,  1)
 *   new high half = rol32(high half, 1) ^ (fx + 2*fy + k[2*rnd+1])
 *
 * fx and fy are clobbered.  The final orq relies on the upper 32 bits of
 * fx being zero, which the preceding 32-bit operations guarantee.
 */
#define enc_round_end(blk, fx, fy, rnd) \
	addl fy ## d,				fx ## d; \
	addl fx ## d,				fy ## d; \
	addl k+4*(2*(rnd))(CTX),		fx ## d; \
	xorl blk ## d,				fx ## d; \
	addl k+4*(2*(rnd)+1)(CTX),		fy ## d; \
	shrq $32,				blk;		/* high half -> low */ \
	roll $1,				blk ## d; \
	xorl fy ## d,				blk ## d; \
	shlq $32,				blk;		/* back to high half */ \
	rorl $1,				fx ## d; \
	orq fx,					blk;
129 
/*
 * dec_round_end: decryption counterpart of enc_round_end; the roles of
 * fx and fy are exchanged relative to the two halves of `blk`.
 *
 *   new low  half = ror32((fx + 2*fy + k[2*rnd+1]) ^ low half, 1)
 *   new high half = rol32(high half, 1) ^ (fx + fy + k[2*rnd])
 *
 * fx and fy are clobbered.  The final orq relies on the upper 32 bits of
 * fy being zero, which the preceding 32-bit operations guarantee.
 */
#define dec_round_end(blk, fx, fy, rnd) \
	addl fy ## d,				fx ## d; \
	addl fx ## d,				fy ## d; \
	addl k+4*(2*(rnd))(CTX),		fx ## d; \
	addl k+4*(2*(rnd)+1)(CTX),		fy ## d; \
	xorl blk ## d,				fy ## d; \
	shrq $32,				blk;		/* high half -> low */ \
	roll $1,				blk ## d; \
	xorl fx ## d,				blk ## d; \
	shlq $32,				blk;		/* back to high half */ \
	rorl $1,				fy ## d; \
	orq fy,					blk;
142 
/*
 * encrypt_round3: one encryption round on all three lanes.
 * g1g2_3 leaves its results in RX<N>/RY<N> and exchanges regs<N> with
 * slots<N>; enc_round_end then folds RX<N>/RY<N> and the round-`rnd`
 * k words into regs<N>.
 */
#define encrypt_round3(regs, slots, rnd) \
	g1g2_3(regs, slots, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(regs ## 0, RX0, RY0, rnd); \
	enc_round_end(regs ## 1, RX1, RY1, rnd); \
	enc_round_end(regs ## 2, RX2, RY2, rnd);
149 
/*
 * decrypt_round3: one decryption round on all three lanes.
 * Compared with encrypt_round3, g1g2_3 is called with the table offsets
 * rotated and with RY/RX exchanged as output prefixes; dec_round_end
 * then folds RX<N>/RY<N> and the round-`rnd` k words into regs<N>.
 */
#define decrypt_round3(regs, slots, rnd) \
	g1g2_3(regs, slots, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(regs ## 0, RX0, RY0, rnd); \
	dec_round_end(regs ## 1, RX1, RY1, rnd); \
	dec_round_end(regs ## 2, RX2, RY2, rnd);
156 
/*
 * encrypt_cycle3: two consecutive encryption rounds, numbered 2*n and
 * 2*n+1, on all three lanes.
 *
 *   ab - register-name prefix of the working registers
 *   cd - name prefix of the stack slots holding the other halves
 *   n  - cycle number
 *
 * n is parenthesized before scaling so that an expression argument
 * (e.g. "i+1") is doubled as a whole.
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, ((n)*2)); \
	encrypt_round3(ab, cd, ((n)*2)+1);
160 
/*
 * decrypt_cycle3: two consecutive decryption rounds on all three lanes,
 * in descending order: round 2*n+1 first, then round 2*n.
 *
 *   ba - register-name prefix of the working registers
 *   dc - name prefix of the stack slots holding the other halves
 *   n  - cycle number
 *
 * n is parenthesized before scaling so that an expression argument
 * (e.g. "i+1") is doubled as a whole.
 */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));
164 
/*
 * push_cd: spill RCD2, RCD1, RCD0 (in that order) so that afterwards
 * RCD0 sits at 0x0(%rsp), RCD1 at 0x8(%rsp) and RCD2 at 0x10(%rsp),
 * i.e. in the CD0/CD1/CD2 slots.
 */
#define push_cd() \
	pushq RCD2; \
	pushq RCD1; \
	pushq RCD0;
169 
/*
 * pop_cd: reload RCD0, RCD1, RCD2 from the three stack slots (lowest
 * address first) and release them; the inverse of push_cd().
 */
#define pop_cd() \
	popq RCD0; \
	popq RCD1; \
	popq RCD2;
174 
/*
 * inpack3: load one 64-bit half from each of three consecutive 16-byte
 * blocks and XOR it with 64 bits taken from the w area of the context.
 *
 *   in - register holding the input pointer
 *   n  - 32-bit word index of the half inside a block
 *   xy - register-name prefix; xy0, xy1, xy2 receive lanes 0, 1, 2
 *   m  - 32-bit word index into the w area (words m and m+1 are used)
 *
 * Both index arguments are parenthesized before scaling so that an
 * expression argument is multiplied as a whole.
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;
184 
/*
 * outunpack3: XOR each lane register with 64 bits taken from the w area
 * of the context, then write it to the matching half of three
 * consecutive 16-byte blocks using instruction `op`.
 *
 *   op  - mnemonic stem; "q" is appended (callers in this file pass mov
 *         to store, or xor to combine with what is already at out)
 *   out - register holding the output pointer
 *   n   - 32-bit word index of the half inside a block
 *   xy  - register-name prefix; xy0, xy1, xy2 are lanes 0, 1, 2
 *   m   - 32-bit word index into the w area (words m and m+1 are used)
 *
 * The lane registers are modified by the XOR.  Both index arguments are
 * parenthesized before scaling so that an expression argument is
 * multiplied as a whole.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);
194 
/*
 * inpack_enc3: load the three input blocks for encryption from RIO.
 * Block words 0-1 go to RAB<N> XORed with w[0..1]; block words 2-3 go
 * to RCD<N> XORed with w[2..3].
 */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);
198 
/*
 * outunpack_enc3: write the three encrypted blocks through RIO using
 * instruction stem `insn` (mov or xor in this file).
 * RAB<N> is XORed with w[6..7] and goes to block words 2-3; RCD<N> is
 * XORed with w[4..5] and goes to block words 0-1.
 */
#define outunpack_enc3(insn) \
	outunpack3(insn, RIO, 2, RAB, 6); \
	outunpack3(insn, RIO, 0, RCD, 4);
202 
/*
 * inpack_dec3: load the three input blocks for decryption from RIO.
 * Block words 0-1 are XORed with w[4..5] into RAB<N>, block words 2-3
 * with w[6..7] into RCD<N>; each register then has its two 32-bit
 * halves exchanged (rotate by 32).
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,	RAB0; \
	rorq $32,	RAB1; \
	rorq $32,	RAB2; \
	\
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,	RCD0; \
	rorq $32,	RCD1; \
	rorq $32,	RCD2;
212 
/*
 * outunpack_dec3: store the three decrypted blocks through RIO.
 * Each register first has its two 32-bit halves exchanged (rotate by
 * 32); RCD<N> is then XORed with w[0..1] and stored to block words 0-1,
 * RAB<N> is XORed with w[2..3] and stored to block words 2-3.
 */
#define outunpack_dec3() \
	rorq $32,	RCD0; \
	rorq $32,	RCD1; \
	rorq $32,	RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	\
	rorq $32,	RAB0; \
	rorq $32,	RAB1; \
	rorq $32,	RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);
222 
SYM_FUNC_START(__twofish_enc_blk_3way)
	/*
	 * Encrypt three 16-byte blocks in parallel.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */

	/* saved because they serve as RY2, RY1 and RAB1 below */
	pushq %r13
	pushq %r12
	pushq %rbx

	/*
	 * %rcx doubles as RAB2 and %rsi as RT1, so both arguments are
	 * parked on the stack until the rounds are done.
	 */
	pushq %rcx			/* bool xor */
	pushq %rsi			/* dst */

	inpack_enc3()

	/* CD halves live in stack slots CD0..CD2 while the rounds run */
	push_cd()
	encrypt_cycle3(RAB, CD, 0)
	encrypt_cycle3(RAB, CD, 1)
	encrypt_cycle3(RAB, CD, 2)
	encrypt_cycle3(RAB, CD, 3)
	encrypt_cycle3(RAB, CD, 4)
	encrypt_cycle3(RAB, CD, 5)
	encrypt_cycle3(RAB, CD, 6)
	encrypt_cycle3(RAB, CD, 7)
	pop_cd()

	/* src is no longer needed: RIO now points at dst */
	popq RIO			/* dst */
	popq RT1			/* bool xor */

	/* only the low byte of the flag is examined */
	testb RT1bl, RT1bl
	jnz .L__enc_xor3

	/* flag clear: overwrite dst with the result */
	outunpack_enc3(mov)

	popq %rbx
	popq %r12
	popq %r13
	RET

.L__enc_xor3:
	/* flag set: xor the result into what dst already holds */
	outunpack_enc3(xor)

	popq %rbx
	popq %r12
	popq %r13
	RET
SYM_FUNC_END(__twofish_enc_blk_3way)
271 
SYM_FUNC_START(twofish_dec_blk_3way)
	/*
	 * Decrypt three 16-byte blocks in parallel.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */

	/* saved because they serve as RY2, RY1 and RAB1 below */
	pushq %r13
	pushq %r12
	pushq %rbx

	/* %rsi doubles as RT1, so dst is parked on the stack */
	pushq %rsi			/* dst */

	inpack_dec3()

	/* CD halves live in stack slots CD0..CD2 while the rounds run */
	push_cd()
	/* cycles run from 7 down to 0, the reverse of encryption */
	decrypt_cycle3(RAB, CD, 7)
	decrypt_cycle3(RAB, CD, 6)
	decrypt_cycle3(RAB, CD, 5)
	decrypt_cycle3(RAB, CD, 4)
	decrypt_cycle3(RAB, CD, 3)
	decrypt_cycle3(RAB, CD, 2)
	decrypt_cycle3(RAB, CD, 1)
	decrypt_cycle3(RAB, CD, 0)
	pop_cd()

	/* src is no longer needed: RIO now points at dst */
	popq RIO			/* dst */

	outunpack_dec3()

	popq %rbx
	popq %r12
	popq %r13
	RET
SYM_FUNC_END(twofish_dec_blk_3way)
306