/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

/* s-box tables defined outside this file; read below as 32-bit entries */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * structure of crypto context
 *
 * km: 32-bit masking keys, read one per round at km + 4*round.
 * kr: rotation keys, read 16 bytes at a time at kr + 16*n and consumed
 *     one byte per round.
 */
#define km	0
#define kr	(12*4*4)

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

/*
 * Context pointer. %rdi cannot be used for this because the s-box lookups
 * use %rdi as an index register.
 */
#define CTX %r15

/*
 * Block state: two groups (suffix 1 and 2) of four registers A..D.
 * After inpack_blocks() each register holds one 32-bit word position of
 * four blocks.
 */
#define RA1  %xmm0
#define RB1  %xmm1
#define RC1  %xmm2
#define RD1  %xmm3

#define RA2  %xmm4
#define RB2  %xmm5
#define RC2  %xmm6
#define RD2  %xmm7

/* round-function result before it is xored into the state; also a temp */
#define RX   %xmm8

/*
 * Per-round key material (see preload_rkr / get_round_keys).
 * RKM and RKRF double as byte-swap mask / transpose temporary outside the
 * rounds.
 */
#define RKM  %xmm9	/* masking key broadcast to all four lanes */
#define RKR  %xmm10	/* pending rotation-key bytes, current round lowest */
#define RKRF %xmm11	/* left-shift count of the current rotation */
#define RKRR %xmm12	/* 32 - RKRF: the matching right-shift count */
#define R32  %xmm13	/* constant 32 (loaded from .L32_mask) */
#define R1ST %xmm14	/* constant 0x1f (loaded from .Lfirst_mask) */

#define RTMP %xmm15

/* s-box index registers */
#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

/* general purpose copies of the vector halves fed to the s-box lookups */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* s-box lookup accumulators */
#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


/*
 * lookup_32bit: combine four s-box entries selected by the low 32 bits of
 * src into the 32-bit register dst:
 *
 *   dst = ((s1[byte1] op1 s2[byte0]) op2 s3[byte3]) op3 s4[byte2]
 *
 * src is shifted right by 16 as a side effect; interleave_op(il_reg) is
 * expanded between the loads so the caller can slot in one more
 * instruction (shr_next to expose the next 32-bit word, or dummy).
 * Clobbers RID1, RID2 and flags.
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

/* no-op interleave_op / do_mask argument */
#define dummy(d) /* do nothing */

/* interleave_op that completes the shift of reg to the next 32-bit word */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * F_head: first half of the round function for four lanes.
 *   x   = RKM op0 a                  (op0 is vpaddd, vpxor or vpsubd)
 *   x   = rotate-left(x, RKRF)       (built from shift-left | shift-right)
 *   gi1 = low 64 bits of x, gi2 = high 64 bits of x
 * Clobbers RTMP.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;                 \
	vpslld	RKRF,	x,    RTMP;              \
	vpsrld	RKRR,	x,    x;                 \
	vpor	RTMP,	x,    x;                 \
	\
	vmovq		x,    gi1;               \
	vpextrq $1,	x,    gi2;

/*
 * F_tail: second half of the round function. Runs lookup_32bit on each of
 * the four 32-bit words held in gi1/gi2 and packs the four results back
 * into x in the same lane order. Parameter a is not used.
 * Clobbers gi1, gi2, RFS1-RFS3, RID1, RID2 and flags.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

/*
 * F_2: apply one round function to both register groups:
 *   a1 ^= F(b1);  a2 ^= F(b2)
 * Both F_head expansions go through RX; the second may overwrite it
 * because the first result has already been copied to RGI1/RGI2.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;

/* the three round-function variants, differing only in operator choice */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

/* qop: out ^= F<f>(in) on both groups, e.g. qop(RD, RC, 1) -> RC ^= F1(RD) */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * get_round_keys: set up key material for round nn.
 *   RKM  = masking key nn broadcast to every lane
 *   RKRF = RKR & R1ST
 *   RKRR = R32 - RKRF
 * then drop the consumed rotation byte so the next one becomes lowest.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;

/* Q: forward quad-round n, using round keys 4n+0 .. 4n+3 in that order */
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

/* QBAR: the same four steps as Q in reverse order (keys 4n+3 .. 4n+0) */
#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

/* do_mask argument for preload_rkr: reorder the rotation-key bytes */
#define shuffle(mask) \
	vpshufb		mask,            RKR, RKR;

/*
 * preload_rkr: load the n-th group of 16 rotation-key bytes into RKR and
 * order them with do_mask(mask) so that get_round_keys can consume them
 * from the lowest byte upwards.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);

/*
 * transpose_4x4: transpose the 4x4 matrix of 32-bit words held in
 * x0..x3 in place. t0, t1 and t2 are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* inpack_blocks: byte-shuffle each register with rmask, then transpose */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0,	x0; \
	vpshufb rmask, x1,	x1; \
	vpshufb rmask, x2,	x2; \
	vpshufb rmask, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* outunpack_blocks: inverse of inpack_blocks (transpose, then shuffle) */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0, x0;       \
	vpshufb rmask,	x1, x1;       \
	vpshufb rmask,	x2, x2;       \
	vpshufb rmask,	x3, x3;

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
/* reverses the bytes of each 32-bit word */
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* reverses all 16 bytes */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * Rotation-key orderings for preload_rkr. Each name lists the four
 * quad-rounds that consume the 16 bytes, four bytes apiece, starting with
 * the lowest byte.
 */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* xored into every rotation-key byte by preload_rkr */
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

/* value loaded into R32 */
.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

/* value loaded into R1ST: keeps the low five bits of a rotation key */
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

/*
 * __cast6_enc_blk8: encrypt the eight blocks held in RA1..RD2 in place.
 *
 * Runs twelve quad-rounds: Q(0)..Q(5) followed by QBAR(6)..QBAR(11),
 * reloading the rotation keys every four quad-rounds.
 *
 * Preserves %rbx and %r15 (pushed/popped here). The round macros
 * overwrite %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm8-%xmm15 and
 * flags, so callers must keep live pointers elsewhere.
 */
.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi is reused as an s-box index, so keep ctx in CTX */
	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	/* RKM held round keys; reload the byte-swap mask for the unpack */
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

/*
 * __cast6_dec_blk8: decrypt the eight blocks held in RA1..RD2 in place.
 *
 * Walks the quad-rounds from 11 down to 0: Q(11)..Q(6) followed by
 * QBAR(5)..QBAR(0), with rotation keys loaded in the matching order.
 *
 * Register preservation and clobbers are the same as __cast6_enc_blk8.
 */
.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi is reused as an s-box index, so keep ctx in CTX */
	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	/* RKM held round keys; reload the byte-swap mask for the unpack */
	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

/*
 * cast6_ecb_enc_8way: load blocks from src, encrypt them with
 * __cast6_enc_blk8 and store them to dst. load_8way/store_8way come from
 * glue_helper-asm-avx.S.
 */
SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is overwritten by the round macros; keep dst in %r11 */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

/*
 * cast6_ecb_dec_8way: load blocks from src, decrypt them with
 * __cast6_dec_blk8 and store them to dst.
 */
SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is overwritten by the round macros; keep dst in %r11 */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

/*
 * cast6_cbc_dec_8way: decrypt blocks from src with __cast6_dec_blk8, then
 * hand src and dst to store_cbc_8way (glue_helper-asm-avx.S) to finish the
 * CBC step and write the result.
 */
SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi and %rdx are overwritten by the round macros; keep copies */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)

/*
 * cast6_ctr_8way: build the counter blocks with load_ctr_8way, encrypt
 * them with __cast6_enc_blk8, then let store_ctr_8way combine the result
 * with src and write dst (both helpers are in glue_helper-asm-avx.S).
 */
SYM_FUNC_START(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi and %rdx are overwritten by the round macros; keep copies */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ctr_8way)

/*
 * cast6_xts_enc_8way: XTS-encrypt blocks from src to dst. The tweaks are
 * parked in dst by load_xts_8way and consumed again by store_xts_8way.
 */
SYM_FUNC_START(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is overwritten by the round macros; keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_xts_enc_8way)

/*
 * cast6_xts_dec_8way: XTS-decrypt blocks from src to dst; same flow as
 * cast6_xts_enc_8way with __cast6_dec_blk8 as the block function.
 */
SYM_FUNC_START(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is overwritten by the round macros; keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_xts_dec_8way)