/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

/* Shared CAST s-boxes, provided by the common CAST code elsewhere in the tree. */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * structure of crypto context:
 *   km: 32-bit masking subkeys, 12 quad-rounds * 4 keys each, at offset 0
 *   kr: rotation subkeys (consumed one byte at a time below), at offset 192
 */
#define km 0
#define kr (12*4*4)

/* s-boxes */
#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

/*
 * ctx pointer lives in %r15 (callee-saved) because the lookup_32bit
 * scratch registers below overlap the argument registers (%rdi/%rsi).
 */
#define CTX %r15

/* First group of four 128-bit blocks (one 32-bit word of each block per reg). */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

/* Second group of four blocks, processed interleaved with the first. */
#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8		/* round-function intermediate */

#define RKM  %xmm9		/* broadcast 32-bit masking key for the round */
#define RKR  %xmm10		/* queue of rotation-key bytes (shifted down per round) */
#define RKRF %xmm11		/* left-rotate count = low 5 bits of rotation key */
#define RKRR %xmm12		/* right-shift count = 32 - RKRF */
#define R32  %xmm13		/* constant 32 (from .L32_mask) */
#define R1ST %xmm14		/* constant 0x1f mask (from .Lfirst_mask) */

#define RTMP %xmm15		/* scratch */

/* S-box index registers — note these alias %rdi/%rsi (the C arguments). */
#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

/* GPRs holding the two 64-bit halves extracted from each vector register. */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx		/* callee-saved: saved/restored in the blk8 helpers */
#define RGI4bl %bl
#define RGI4bh %bh

/* Accumulators for the four scalar s-box lookups per 32-bit word. */
#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


/*
 * Combine four s-box lookups for one 32-bit word held in the low half of
 * 'src': dst = s1[byte1] op1 s2[byte0]; then after shifting src right 16,
 * dst = dst op2 s3[byte1] op3 s4[byte0].  Clobbers RID1/RID2.
 * interleave_op(il_reg) lets the caller slip in an extra instruction
 * (shr_next or dummy) between the address generation and the last lookups.
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

/* Advance 'reg' to its next 32-bit word (used as an interleave_op). */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * Vector part of the CAST6 round function: x = rol(b op0 km, kr), then
 * extract the eight 32-bit results into two GPRs for the s-box phase.
 * The rotate is synthesized as (x << RKRF) | (x >> (32 - RKRF)).
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a, RKM, x;                 \
	vpslld	RKRF, x, RTMP;             \
	vpsrld	RKRR, x, x;                \
	vpor	RTMP, x, x;                \
	\
	vmovq		x, gi1;            \
	vpextrq $1,	x, gi2;

/*
 * Scalar part: run lookup_32bit on all four 32-bit words in gi1/gi2 and
 * reassemble the results back into vector register x.  Clobbers
 * RID1/RID2, RFS1-RFS3, and gi1/gi2.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

/* Apply one round function to both block groups: a ^= F(b). */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX, a1;                   \
	vpxor		a2, RTMP, a2;

/* The three CAST round-function flavours (add/xor/sub mixing orders). */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * Load the round's subkeys: broadcast masking key km[nn]; take the next
 * rotation byte from RKR, mask it to 5 bits (R1ST = 0x1f) for the left
 * count, derive the right count as 32 - left, then consume the byte.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM; \
	vpand		R1ST, RKR, RKRF;         \
	vpsubq		RKRF, R32, RKRR;         \
	vpsrldq $1,	RKR, RKR;

/* Forward quad-round: D^=F1(C), C^=F2(B), B^=F3(A), A^=F1(D). */
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

/* Inverse quad-round: same operations as Q(n) in reverse order. */
#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb		mask, RKR, RKR;

/*
 * Preload 16 rotation-key bytes (kr[n*16..]) into RKR, optionally
 * reordering them (do_mask = shuffle) to match the Q/QBAR sequence
 * the caller is about to execute.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask, RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

/* 4x4 32-bit matrix transpose across x0..x3 (t0-t2 are scratch). */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* Byte-swap each word (rmask) then transpose blocks into A/B/C/D form. */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* Inverse of inpack_blocks: transpose back, then byte-swap each word. */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3;

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/* GF(2^128) multiplication constant for XTS tweak computation. */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * vpshufb patterns reordering the 16 preloaded rotation-key bytes so they
 * come off RKR in the order consumed by the Q/QBAR sequence that follows
 * each preload_rkr() below.
 */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16
.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * Clobbers %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10 and xmm8-15;
	 * %rbx/%r15 are preserved here.
	 */

	pushq %r15;			/* CTX */
	pushq %rbx;			/* RGI4 */

	movq %rdi, CTX;			/* free %rdi for lookup_32bit (RID1) */

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* 12 quad-rounds: 6 forward (Q), 6 inverse (QBAR). */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	/* remaining code only touches vector registers */
	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;	/* RKM reused as byteswap mask */

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 * Same clobbers as __cast6_enc_blk8; rounds run in reverse order.
	 */

	pushq %r15;			/* CTX */
	pushq %rbx;			/* RGI4 */

	movq %rdi, CTX;			/* free %rdi for lookup_32bit (RID1) */

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* encryption sequence reversed: QBAR rounds become Q and vice versa */
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	/* remaining code only touches vector registers */
	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;	/* RKM reused as byteswap mask */
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 * ECB-encrypt 8 blocks: src may equal dst.
	 */
	FRAME_BEGIN
	pushq %r15;			/* clobbered as CTX by the blk8 helper's caller side */

	movq %rdi, CTX;
	movq %rsi, %r11;		/* dst; %rsi is clobbered by lookup_32bit */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 * ECB-decrypt 8 blocks: src may equal dst.
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;		/* dst; %rsi is clobbered by lookup_32bit */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 * CBC-decrypt 8 blocks; src kept in %r12 for the post-decrypt XOR.
	 */
	FRAME_BEGIN
	pushq %r12;			/* callee-saved, holds src across the call */
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;		/* dst */
	movq %rdx, %r12;		/* src (previous ciphertext blocks) */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)

SYM_FUNC_START(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 * CTR mode: encrypt 8 counter blocks, then XOR with src into dst.
	 */
	FRAME_BEGIN
	pushq %r12;			/* callee-saved, holds src across the call */
	pushq %r15

	movq %rdi, CTX;
	movq %rsi, %r11;		/* dst */
	movq %rdx, %r12;		/* src */

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ctr_8way)

SYM_FUNC_START(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX
	movq %rsi, %r11;		/* dst */

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_xts_enc_8way)

SYM_FUNC_START(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX
	movq %rsi, %r11;		/* dst */

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_xts_dec_8way)