/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)
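/*
 * These offsets correspond to struct cast6_ctx: 48 32-bit masking keys
 * Km (12 quad-rounds x 4 rounds) at offset 0, followed by the 5-bit
 * rotation keys Kr, stored one per byte, at offset 12*4*4 = 192.
 */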

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %r15

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX  %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


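/*
 * lookup_32bit(): compute one 32-bit s-box combination from a value
 * held in a general-purpose register.  The four bytes index
 * cast_s1..cast_s4 and the partial results are folded together with
 * op1/op2/op3 (xorl/subl/addl, chosen per round type).  interleave_op
 * lets the caller hide the 'shrq $16' for the next lookup inside this
 * dependency chain.
 */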
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	leaq		s1(%rip),      RID2;     \
	movl		(RID2,RID1,4), dst ## d; \
	movzbl		src ## bl,     RID2d;    \
	leaq		s2(%rip),      RID1;     \
	op1		(RID1,RID2,4), dst ## d; \
	shrq $16,	src;                     \
	movzbl		src ## bh,     RID1d;    \
	leaq		s3(%rip),      RID2;     \
	op2		(RID2,RID1,4), dst ## d; \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	leaq		s4(%rip),      RID1;     \
	op3		(RID1,RID2,4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;

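/*
 * F_head: combine the round input with the masking key (op0 is
 * vpaddd/vpxor/vpsubd for f1/f2/f3) and rotate each 32-bit word left
 * by the round's rotation key (left shift by RKRF or'd with right
 * shift by RKRR = 32 - RKRF), then move the 128-bit result into two
 * GPRs for the byte-wise s-box lookups.  F_tail runs lookup_32bit on
 * all four 32-bit words and reassembles the result.
 */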
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;                 \
	vpslld	RKRF,	x,    RTMP;              \
	vpsrld	RKRR,	x,    x;                 \
	vpor	RTMP,	x,    x;                 \
	\
	vmovq		x,    gi1;               \
	vpextrq $1,	x,    gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

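/*
 * F_2 evaluates one CAST-256 round function on both 4-block groups:
 * a ^= f(b).  F1_2/F2_2/F3_2 select the op combinations for the
 * specification's f1 (I = (Km + D) <<< Kr; ((S1 ^ S2) - S3) + S4),
 * f2 (I = (Km ^ D) <<< Kr; ((S1 - S2) + S3) ^ S4) and
 * f3 (I = (Km - D) <<< Kr; ((S1 + S2) ^ S3) - S4).
 */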
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

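/*
 * get_round_keys: broadcast masking key Km[nn] to all lanes of RKM
 * and derive the variable-shift counts for the rotation: RKRF is the
 * low five bits of the next rotation-key byte, RKRR = 32 - RKRF.
 * vpsrldq then advances RKR to the following rotation-key byte.
 */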
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;

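/*
 * Q is the forward quad-round (C ^= f1(D); B ^= f2(C); A ^= f3(B);
 * D ^= f1(A)); QBAR is the inverse quad-round, applying the same four
 * rounds in reverse order.
 */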
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb		mask(%rip),            RKR, RKR;

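/*
 * preload_rkr: load the 16 rotation-key bytes for quad-rounds
 * 4n..4n+3 and optionally reorder them for QBAR/decryption with the
 * shuffle masks below.  The extra 16 added to each rotation (XOR with
 * .L16_mask toggles bit 4, i.e. adds 16 mod 32) makes the %bh/%bl
 * extraction order used by lookup_32bit line up with the s-box byte
 * order of the specification.
 */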
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask(%rip),          RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

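/*
 * inpack_blocks byte-swaps each 32-bit word into the cipher's
 * big-endian order and transposes the 4x4 dword matrix so that each
 * xmm register holds the same word (A, B, C or D) of four consecutive
 * blocks; outunpack_blocks reverses this.
 */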
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0,	x0; \
	vpshufb rmask, x1,	x1; \
	vpshufb rmask, x2,	x2; \
	vpshufb rmask, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,		x0, x0;       \
	vpshufb rmask,		x1, x1;       \
	vpshufb rmask,		x2, x2;       \
	vpshufb rmask,		x3, x3;

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

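	/* 12 quad-rounds: six forward (Q) followed by six inverse (QBAR) */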
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

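	/* same key schedule walked in reverse: encryption's QBAR
	 * quad-rounds are inverted here as Q, and vice versa
	 */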
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
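	/* CBC decrypt: ECB-decrypt eight blocks, then store_cbc_8way
	 * (from glue_helper-asm-avx.S) xors each block but the first
	 * with the preceding ciphertext block kept behind %r12; the IV
	 * xor for the first block is left to the C glue code.
	 */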
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)