/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"

/* S-box tables, defined outside this file */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * structure of crypto context (byte offsets from CTX)
 *	km: 32-bit masking keys; round n reads the dword at km + 4*n
 *	kr: 16 bytes of rotation keys, consumed one byte per round
 *	rr: one byte; when it is non-zero, rounds 12..15 are skipped
 */
#define km	0
#define kr	(16*4)
#define rr	((16*4)+16)

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
/* context pointer; saved/restored by every function that loads it */
#define CTX %r15

/*
 * Data registers. After inpack_blocks() RLn holds four 32-bit left
 * halves and RRn the four matching right halves.
 */
#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

/* scratch / round function result */
#define RX %xmm8

/*
 * Round key registers:
 *	RKM:  km[n] broadcast to all lanes (also reused as a vpshufb mask
 *	      outside of the rounds)
 *	RKR:  remaining rotation bytes, shifted down one byte per round
 *	RKRF: left shift count of the current round  (RKR & R1ST)
 *	RKRR: right shift count of the current round (R32 - RKRF)
 */
#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12

/* constants: R32 is loaded from .L32_mask, R1ST from .Lfirst_mask */
#define R32  %xmm13
#define R1ST %xmm14

/* scratch; also receives the second round function result in F_2() */
#define RTMP %xmm15

/* table index / table base scratch for lookup_32bit() */
#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

/*
 * 64-bit halves of the vector being looked up. lookup_32bit() pastes
 * "bl"/"bh" onto these names, so each needs a matching low/high byte
 * alias below.
 */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* accumulators for the s-box lookup results */
#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


/*
 * lookup_32bit: combine four s-box entries selected by the low 32 bits
 * of src into the 32-bit register dst ## d:
 *
 *	dst  = s1[bits 15..8 of src]
 *	dst  = dst op1 s2[bits  7..0 of src]
 *	src >>= 16
 *	dst  = dst op2 s3[bits 15..8 of src]
 *	dst  = dst op3 s4[bits  7..0 of src]
 *
 * interleave_op(il_reg) is expanded between the s3 and s4 steps.
 * Clobbers RID1, RID2 and flags; src is left shifted right by 16 (plus
 * whatever interleave_op does to it).
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,       RID1d;    \
	leaq		s1(%rip),        RID2;     \
	movl		(RID2,RID1,4),   dst ## d; \
	movzbl		src ## bl,       RID2d;    \
	leaq		s2(%rip),        RID1;     \
	op1		(RID1,RID2,4),   dst ## d; \
	shrq $16,	src;                       \
	movzbl		src ## bh,     RID1d;      \
	leaq		s3(%rip),        RID2;     \
	op2		(RID2,RID1,4),   dst ## d; \
	movzbl		src ## bl,     RID2d;      \
	interleave_op(il_reg);			   \
	leaq		s4(%rip),        RID1;     \
	op3		(RID1,RID2,4),   dst ## d;

/* interleave_op that expands to nothing */
#define dummy(d) /* do nothing */

/* interleave_op that moves the next 16 bits of reg into position */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * F_head: x = (RKM op0 a), each 32-bit lane rotated left by RKRF
 * (RKRR must hold 32 - RKRF). The low and high 64 bits of x are then
 * copied to gi1 and gi2. Clobbers RTMP.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;                 \
	vpslld	RKRF,	x,    RTMP;              \
	vpsrld	RKRR,	x,    x;                 \
	vpor	RTMP,	x,    x;                 \
	\
	vmovq		x,    gi1;               \
	vpextrq $1,	x,    gi2;

/*
 * F_tail: run lookup_32bit() on all four 32-bit words held in gi1:gi2
 * and reassemble the results into x (gi1 -> low 64 bits, gi2 -> high
 * 64 bits). The first two lookups use shr_next so that the upper word
 * of gi1/gi2 is in place for the second pair of lookups.
 * Clobbers gi1, gi2, RFS1..RFS3, RID1, RID2 and flags; a is unused.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

/*
 * F_2: apply the round function to two register pairs:
 *	a1 ^= F(b1), a2 ^= F(b2)
 * Both F_head() calls use RX as their temporary; that is safe because
 * each one has already moved its result to general purpose registers
 * (RGI1/RGI2 and RGI3/RGI4) before the next one runs.
 * Clobbers RX, RTMP and everything F_head()/F_tail() clobber.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;

/* the three round function types; they differ only in the operators */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

/* dispatch to F1_2/F2_2/F3_2 according to f (1, 2 or 3) */
#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);

/*
 * round: one cipher round over all four register pairs.
 *	l, r: register name prefixes (RL/RR); l ## i ^= F(r ## i)
 *	n:    index of the masking key to broadcast into RKM
 *	f:    round function type (1, 2 or 3)
 * The rotation amount is taken from the lowest byte of RKR, which is
 * then shifted out, so rounds must be issued in the order the bytes
 * were arranged by enc_preload_rkr()/dec_preload_rkr().
 */
#define round(l, r, n, f) \
	vbroadcastss 	(km+(4*n))(CTX), RKM;        \
	vpand		R1ST,            RKR,  RKRF; \
	vpsubq		RKRF,            R32,  RKRR; \
	vpsrldq $1,	RKR,             RKR;        \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);

/* load the 16 rotation bytes into RKR, first round in the lowest byte */
#define enc_preload_rkr() \
	vbroadcastss	.L16_mask(%rip),          RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX),                  RKR, RKR;

/* as enc_preload_rkr(), then reverse the bytes so round 15 comes first */
#define dec_preload_rkr() \
	vbroadcastss	.L16_mask(%rip),          RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX),                  RKR, RKR; \
	vpshufb		.Lbswap128_mask(%rip),    RKR, RKR;
/*
 * transpose_2x4: regroup the 32-bit lanes of x0 and x1 (t0, t1 scratch):
 *	x0 = { x0[0], x1[0], x0[2], x1[2] }
 *	x1 = { x0[1], x1[1], x0[3], x1[3] }
 */
#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t1; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1;

/*
 * inpack_blocks: byte-shuffle x0/x1 with rmask, then split the four
 * 64-bit blocks they hold into first halves (x0) and second halves (x1).
 */
#define inpack_blocks(x0, x1, t0, t1, rmask) \
	vpshufb rmask, 	x0,	x0; \
	vpshufb rmask, 	x1,	x1; \
	\
	transpose_2x4(x0, x1, t0, t1)

/*
 * outunpack_blocks: inverse of inpack_blocks. Lane pairs are emitted as
 * (x0 lane, x1 lane); blocks 1 and 2 end up in x0, blocks 3 and 4 in x1.
 */
#define outunpack_blocks(x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb rmask,	x0, x0;           \
	vpshufb rmask,	x1, x1;

/* vpshufb mask: reverse the bytes of each 32-bit lane */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* vpshufb mask: reverse all 16 bytes */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* vpshufb mask: byte-reversed low 64 bits copied into both halves */
.section	.rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
.align 16
.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

/* 16 in every byte; xor-ed into the rotation keys */
.section	.rodata.cst4.16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16
/* the value 32; loaded into R32 */
.section	.rodata.cst4.32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0
/* the value 0x1f; loaded into R1ST to isolate the current rotation */
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
	/* input:
	 *	%rdi: ctx
	 *	RL1: blocks 1 and 2
	 *	RR1: blocks 3 and 4
	 *	RL2: blocks 5 and 6
	 *	RR2: blocks 7 and 8
	 *	RL3: blocks 9 and 10
	 *	RR3: blocks 11 and 12
	 *	RL4: blocks 13 and 14
	 *	RR4: blocks 15 and 16
	 * output (note that the RR/RL order is swapped relative to input):
	 *	RR1: encrypted blocks 1 and 2
	 *	RL1: encrypted blocks 3 and 4
	 *	RR2: encrypted blocks 5 and 6
	 *	RL2: encrypted blocks 7 and 8
	 *	RR3: encrypted blocks 9 and 10
	 *	RL3: encrypted blocks 11 and 12
	 *	RR4: encrypted blocks 13 and 14
	 *	RL4: encrypted blocks 15 and 16
	 * clobbers:
	 *	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm8-%xmm15, flags
	 *	(%rbx and %r15 are used but saved and restored here)
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;
	enc_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

	round(RL, RR, 0, 1);
	round(RR, RL, 1, 2);
	round(RL, RR, 2, 3);
	round(RR, RL, 3, 1);
	round(RL, RR, 4, 2);
	round(RR, RL, 5, 3);
	round(RL, RR, 6, 1);
	round(RR, RL, 7, 2);
	round(RL, RR, 8, 3);
	round(RR, RL, 9, 1);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);

	/* ctx->rr != 0: stop after 12 rounds */
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_enc;

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);

.L__skip_enc:
	popq %rbx;
	popq %r15;

	/* RKM was overwritten by the rounds; reload the shuffle mask */
	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	RET;
SYM_FUNC_END(__cast5_enc_blk16)

SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
	/* input:
	 *	%rdi: ctx
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 * output (note that the RR/RL order is swapped relative to input):
	 *	RR1: decrypted blocks 1 and 2
	 *	RL1: decrypted blocks 3 and 4
	 *	RR2: decrypted blocks 5 and 6
	 *	RL2: decrypted blocks 7 and 8
	 *	RR3: decrypted blocks 9 and 10
	 *	RL3: decrypted blocks 11 and 12
	 *	RR4: decrypted blocks 13 and 14
	 *	RL4: decrypted blocks 15 and 16
	 * clobbers:
	 *	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm8-%xmm15, flags
	 *	(%rbx and %r15 are used but saved and restored here)
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;
	dec_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

	/* ctx->rr != 0: only 12 rounds were used, start at round 11 */
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_dec;

	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

.L__dec_tail:
	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	round(RL, RR, 9, 1);
	round(RR, RL, 8, 3);
	round(RL, RR, 7, 2);
	round(RR, RL, 6, 1);
	round(RL, RR, 5, 3);
	round(RR, RL, 4, 2);
	round(RL, RR, 3, 1);
	round(RR, RL, 2, 3);
	round(RL, RR, 1, 2);
	round(RR, RL, 0, 1);

	/* RKM was overwritten by the rounds; reload the shuffle mask */
	vmovdqa .Lbswap_mask(%rip), RKM;
	popq %rbx;
	popq %r15;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	RET;

.L__skip_dec:
	/* drop the rotation bytes of the four skipped rounds (15..12) */
	vpsrldq $4, RKR, RKR;
	jmp .L__dec_tail;
SYM_FUNC_END(__cast5_dec_blk16)

SYM_FUNC_START(cast5_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Encrypts the 128 bytes (16 blocks) at src into dst.
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is clobbered by __cast5_enc_blk16; keep dst in %r11 */
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_enc_blk16;

	/* results come back with RRn ahead of RLn */
	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ecb_enc_16way)

SYM_FUNC_START(cast5_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts the 128 bytes (16 blocks) at src into dst.
	 */

	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is clobbered by __cast5_dec_blk16; keep dst in %r11 */
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_dec_blk16;

	/* results come back with RRn ahead of RLn */
	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ecb_dec_16way)

SYM_FUNC_START(cast5_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts 16 blocks and xors blocks 2..16 with the preceding
	 * ciphertext block. The first block is written without any xor;
	 * NOTE(review): presumably the caller applies the IV to it.
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi/%rdx are clobbered by __cast5_dec_blk16; keep dst and src */
	movq %rsi, %r11;
	movq %rdx, %r12;

	vmovdqu (0*16)(%rdx), RL1;
	vmovdqu (1*16)(%rdx), RR1;
	vmovdqu (2*16)(%rdx), RL2;
	vmovdqu (3*16)(%rdx), RR2;
	vmovdqu (4*16)(%rdx), RL3;
	vmovdqu (5*16)(%rdx), RR3;
	vmovdqu (6*16)(%rdx), RL4;
	vmovdqu (7*16)(%rdx), RR4;

	call __cast5_dec_blk16;

	/* xor with src */
	vmovq (%r12), RX;
	/* RX = { 0, first ciphertext block }: only block 2 of RR1 is xored */
	vpshufd $0x4f, RX, RX;
	vpxor RX, RR1, RR1;
	/* the "+8" selects the ciphertext one block before each output */
	vpxor 0*16+8(%r12), RL1, RL1;
	vpxor 1*16+8(%r12), RR2, RR2;
	vpxor 2*16+8(%r12), RL2, RL2;
	vpxor 3*16+8(%r12), RR3, RR3;
	vpxor 4*16+8(%r12), RL3, RL3;
	vpxor 5*16+8(%r12), RR4, RR4;
	vpxor 6*16+8(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_cbc_dec_16way)

SYM_FUNC_START(cast5_ctr_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (big endian, 64bit)
	 *
	 * Encrypts the counters iv .. iv+15, xors the result with the 128
	 * bytes at src, stores that to dst and writes iv+16 back to (%rcx).
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi/%rdx are clobbered by __cast5_enc_blk16; keep dst and src */
	movq %rsi, %r11;
	movq %rdx, %r12;

	vpcmpeqd RTMP, RTMP, RTMP;
	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */

	vpcmpeqd RKR, RKR, RKR;
	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
	vmovdqa .Lbswap_iv_mask(%rip), R1ST;
	vmovdqa .Lbswap128_mask(%rip), RKM;

	/* load IV and byteswap */
	vmovq (%rcx), RX;
	vpshufb R1ST, RX, RX;

	/* construct IVs */
	vpsubq RTMP, RX, RX;  /* le: IV1, IV0 */
	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

	/* store last IV */
	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
	vmovq RX, (%rcx);

	call __cast5_enc_blk16;

	/* dst = src ^ iv */
	vpxor (0*16)(%r12), RR1, RR1;
	vpxor (1*16)(%r12), RL1, RL1;
	vpxor (2*16)(%r12), RR2, RR2;
	vpxor (3*16)(%r12), RL2, RL2;
	vpxor (4*16)(%r12), RR3, RR3;
	vpxor (5*16)(%r12), RL3, RL3;
	vpxor (6*16)(%r12), RR4, RR4;
	vpxor (7*16)(%r12), RL4, RL4;
	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ctr_16way)