/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

/*
 * 16-byte constant holding the byte indices 15..0.  It is handed to
 * load_ctr_8way() by twofish_ctr_8way.
 * NOTE(review): presumably a byte-shuffle control that reverses a 128-bit
 * value; the consumer lives in glue_helper-asm-avx.S - confirm there.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * 16-byte constant handed to load_xts_8way() by the two XTS entry points.
 * NOTE(review): by its name, the mask used for the GF(2^128) tweak
 * multiplication (0x87 in the low quadword, 1 in the high quadword);
 * the consumer lives in glue_helper-asm-avx.S - confirm there.
 */
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

/*
 * structure of crypto context
 *
 * Byte offsets from the context pointer (CTX):
 *   s0..s3  four lookup tables, 1024 bytes apart; lookup_32bit() indexes
 *           them with a byte value scaled by 4 (32-bit entries)
 *   w       key words XORed into the blocks: 16 bytes at w before the
 *           rounds and 16 bytes at w+4*4 after them (reversed when
 *           decrypting)
 *   k       32-bit round keys; round n reads words 2n and 2n+1
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/
/* Cipher context pointer: first argument of every function in this file. */
#define CTX %rdi

/*
 * Block state.  Suffix 1 is the first group of four blocks, suffix 2 the
 * second group; after inpack_blocks() each register holds one 32-bit word
 * position (A, B, C or D) of four different blocks.
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* Outputs of round_head_2() for group 1 (RX0/RY0) and group 2 (RX1/RY1). */
#define RX0 %xmm8
#define RY0 %xmm9

#define RX1 %xmm10
#define RY1 %xmm11

/* Key material: the two broadcast round keys, or the XORed-in key words. */
#define RK1 %xmm12
#define RK2 %xmm13

/* Vector scratch: RT is used by the round tails, RR by rotate_1l(). */
#define RT %xmm14
#define RR %xmm15

/*
 * Table-index scratch registers used by lookup_32bit().
 * RID2 aliases %rsi (the dst argument), which is why every public entry
 * point copies %rsi to %r11 before calling the 8-block core.
 */
#define RID1  %r13
#define RID1d %r13d
#define RID2  %rsi
#define RID2d %esi

/*
 * General-purpose inputs to G().  Only %rax, %rbx, %rcx and %rdx are used
 * because lookup_32bit() needs both the low-byte (..bl) and the
 * second-byte (..bh) sub-register of its source.
 * RGI1 aliases %rdx (src argument) and RGI2 aliases %rcx (iv argument).
 */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* Accumulators for the table lookups performed by G(). */
#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d


/*
 * lookup_32bit - XOR four table entries selected by the low four bytes
 * of src.
 *
 *   t0..t3         table offsets inside CTX; byte i of src indexes table ti
 *   src            64-bit GPR alias (RGI1..RGI4); shifted right by 16 bits
 *                  here, so its previous contents are consumed
 *   dst            GPR alias (RGS1..RGS3); the 32-bit result is written to
 *                  dst ## d, which also clears the upper half of dst
 *   interleave_op  macro expanded as interleave_op(il_reg) after the last
 *                  byte has been extracted: shr_next to move src on to its
 *                  next 32-bit word, or dummy for nothing
 *
 * Clobbers RID1, RID2 and the flags.
 */
#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
	movzbl src ## bl, RID1d; \
	movzbl src ## bh, RID2d; \
	shrq $16, src; \
	movl t0(CTX, RID1, 4), dst ## d; \
	movl t1(CTX, RID2, 4), RID2d; \
	movzbl src ## bl, RID1d; \
	xorl RID2d, dst ## d; \
	movzbl src ## bh, RID2d; \
	interleave_op(il_reg); \
	xorl t2(CTX, RID1, 4), dst ## d; \
	xorl t3(CTX, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

/* Advance reg to its next 16 bits (used as lookup_32bit's interleave_op). */
#define shr_next(reg) \
	shrq $16, reg;

/*
 * G - run lookup_32bit() on both 32-bit halves of gi1 and of gi2.
 *
 * On exit:
 *   RGS2 = (result for high word of gi1) << 32 | result for low word of gi1
 *   RGS3 = (result for high word of gi2) << 32 | result for low word of gi2
 * gi1 and gi2 are consumed; RGS1 is scratch.  The parameter x is not
 * referenced by the expansion.
 */
#define G(gi1, gi2, x, t0, t1, t2, t3) \
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
	\
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
	shlq $32, RGS2; \
	orq RGS1, RGS2; \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
	shlq $32, RGS1; \
	orq RGS1, RGS3;

/*
 * round_head_2 - table lookups of one round for both block groups.
 *
 * Expects RGI1/RGI2 to already hold the two quadwords of a ## 1 (see
 * preload_rgi()).  Produces
 *   x1 = G(a ## 1) and x2 = G(a ## 2)  with table order s0, s1, s2, s3
 *   y1 = G(b ## 1) and y2 = G(b ## 2)  with table order s1, s2, s3, s0
 * The vector-to-GPR moves for the next G() are interleaved with storing
 * the previous result, so the statement order matters.
 */
#define round_head_2(a, b, x1, y1, x2, y2) \
	vmovq b ## 1, RGI3; \
	vpextrq $1, b ## 1, RGI4; \
	\
	G(RGI1, RGI2, x1, s0, s1, s2, s3); \
	vmovq a ## 2, RGI1; \
	vpextrq $1, a ## 2, RGI2; \
	vmovq RGS2, x1; \
	vpinsrq $1, RGS3, x1, x1; \
	\
	G(RGI3, RGI4, y1, s1, s2, s3, s0); \
	vmovq b ## 2, RGI3; \
	vpextrq $1, b ## 2, RGI4; \
	vmovq RGS2, y1; \
	vpinsrq $1, RGS3, y1, y1; \
	\
	G(RGI1, RGI2, x2, s0, s1, s2, s3); \
	vmovq RGS2, x2; \
	vpinsrq $1, RGS3, x2, x2; \
	\
	G(RGI3, RGI4, y2, s1, s2, s3, s0); \
	vmovq RGS2, y2; \
	vpinsrq $1, RGS3, y2, y2;

/*
 * encround_tail - finish one encryption round for one block group.
 *
 * Per 32-bit lane:
 *   x += y;
 *   c  = ror32(c ^ (x + RK1), 1);
 *   y += x;
 *   d ^= y + RK2;
 * prerotate(b) is expanded in between; the callers pass rotate_1l so that
 * b is already rotated left by one when it takes the role of d in the
 * following round, or dummy in the final round.
 * Clobbers RT (and RR when prerotate is rotate_1l); a is not referenced.
 */
#define encround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd x, y, x; \
	vpaddd x, RK1, RT; \
	prerotate(b); \
	vpxor RT, c, c; \
	vpaddd y, x, y; \
	vpaddd y, RK2, y; \
	vpsrld $1, c, RT; \
	vpslld $(32 - 1), c, c; \
	vpor c, RT, c; \
	vpxor d, y, d;

/*
 * decround_tail - finish one decryption round for one block group.
 *
 * Per 32-bit lane:
 *   x += y;
 *   c ^= x + RK1;
 *   y += x;
 *   d  = ror32(d ^ (y + RK2), 1);
 * prerotate(a) is expanded in between; the callers pass rotate_1l so that
 * a is already rotated left by one when it takes the role of c in the
 * following round, or dummy in the final round.
 * Clobbers RT and y (and RR when prerotate is rotate_1l); b is not
 * referenced.
 */
#define decround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd x, y, x; \
	vpaddd x, RK1, RT; \
	prerotate(a); \
	vpxor RT, c, c; \
	vpaddd y, x, y; \
	vpaddd y, RK2, y; \
	vpxor d, y, d; \
	vpsrld $1, d, y; \
	vpslld $(32 - 1), d, d; \
	vpor d, y, d;

/* Rotate every 32-bit lane of x left by one bit.  Clobbers RR. */
#define rotate_1l(x) \
	vpslld $1, x, RR; \
	vpsrld $(32 - 1), x, x; \
	vpor x, RR, x;

/* Copy the low/high quadword of c into RGI1/RGI2 for the next G(). */
#define preload_rgi(c) \
	vmovq c, RGI1; \
	vpextrq $1, c, RGI2;

/*
 * encrypt_round - one encryption round over both block groups.
 *
 *   n          round number; 32-bit key words 2n and 2n+1 at offset k are
 *              broadcast into RK1 and RK2
 *   a,b,c,d    register-name prefixes (RA, RB, RC, RD in some order); the
 *              group suffix 1/2 is pasted on here
 *   preload    expanded as preload(c ## 1) between the two tails, so the
 *              next round (where c acts as a) finds RGI1/RGI2 loaded
 *   prerotate  forwarded to encround_tail()
 */
#define encrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	round_head_2(a, b, RX0, RY0, RX1, RY1); \
	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1); \
	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/* decrypt_round - as encrypt_round(), but using decround_tail(). */
#define decrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	round_head_2(a, b, RX0, RY0, RX1, RY1); \
	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1); \
	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/*
 * encrypt_cycle - rounds 2n and 2n+1; the second round swaps the roles of
 * the (A, B) and (C, D) register pairs.
 */
#define encrypt_cycle(n) \
	encrypt_round((2*(n)), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*(n)) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);

/*
 * encrypt_cycle_last - as encrypt_cycle(), but the second round neither
 * preloads RGI1/RGI2 nor pre-rotates, because no round follows.
 */
#define encrypt_cycle_last(n) \
	encrypt_round((2*(n)), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*(n)) + 1), RC, RD, RA, RB, dummy, dummy);

/*
 * decrypt_cycle - rounds 2n+1 and 2n, in that (reverse) order; the second
 * round swaps the roles of the (C, D) and (A, B) register pairs.
 */
#define decrypt_cycle(n) \
	decrypt_round(((2*(n)) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*(n)), RA, RB, RC, RD, preload_rgi, rotate_1l);

/*
 * decrypt_cycle_last - as decrypt_cycle(), but the second round neither
 * preloads RGI1/RGI2 nor pre-rotates, because no round follows.
 */
#define decrypt_cycle_last(n) \
	decrypt_round(((2*(n)) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*(n)), RA, RB, RC, RD, dummy, dummy);

/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit words held in x0..x3.
 * On exit xi holds word i of each of the four input registers.
 * t0, t1 and t2 are scratch.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

/* XOR wkey into four blocks, then transpose them into word-sliced form. */
#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	vpxor x0, wkey, x0; \
	vpxor x1, wkey, x1; \
	vpxor x2, wkey, x2; \
	vpxor x3, wkey, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* Transpose four registers back into block form, then XOR wkey into each. */
#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor x0, wkey, x0; \
	vpxor x1, wkey, x1; \
	vpxor x2, wkey, x2; \
	vpxor x3, wkey, x3;

.align 8
SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 *
	 * Overwrites %rax, %rdx, %rsi, %r8-%r10, %xmm8-%xmm15 and the flags
	 * (they are the scratch aliases defined at the top of this file).
	 * %r13 and %rbx are used as scratch too and are saved/restored here.
	 */

	/* key words XORed into the input blocks */
	vmovdqu w(CTX), RK1;

	pushq %r13;
	pushq %rbx;
	/*
	 * NOTE(review): %rcx (RGI2) is preserved as well, although no caller
	 * visible in this file reads it after the call and the decryption
	 * core does not save it - confirm before removing.
	 */
	pushq %rcx;

	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	/* the first round expects RGI1/RGI2 loaded and D pre-rotated */
	preload_rgi(RA1);
	rotate_1l(RD1);
	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
	rotate_1l(RD2);

	/* 8 cycles of 2 rounds each */
	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle_last(7);

	/* key words XORed into the output blocks */
	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;
	popq %r13;

	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 *
	 * Overwrites %rax, %rcx, %rdx, %rsi, %r8-%r10, %xmm8-%xmm15 and the
	 * flags (they are the scratch aliases defined at the top of this
	 * file).  %r13 and %rbx are used as scratch too and are
	 * saved/restored here.
	 */

	/* key words XORed into the input blocks */
	vmovdqu (w+4*4)(CTX), RK1;

	pushq %r13;
	pushq %rbx;

	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	/* the first round expects RGI1/RGI2 loaded and A pre-rotated */
	preload_rgi(RC1);
	rotate_1l(RA1);
	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
	rotate_1l(RA2);

	/* the same 8 cycles as encryption, walked backwards */
	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle_last(0);

	/* key words XORed into the output blocks */
	vmovdqu (w)(CTX), RK1;

	popq %rbx;
	popq %r13;

	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_dec_blk8)

SYM_FUNC_START(twofish_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* %rsi is overwritten by the core (RID2), so keep dst in %r11 */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __twofish_enc_blk8;

	/* the core returns the blocks in C, D, A, B register order */
	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_enc_8way)

SYM_FUNC_START(twofish_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* %rsi is overwritten by the core (RID2), so keep dst in %r11 */
	movq %rsi, %r11;

	/* the decryption core takes its input in C, D, A, B register order */
	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_dec_8way)

SYM_FUNC_START(twofish_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* %r12 is callee-saved; it carries src across the call below */
	pushq %r12;

	/* the core overwrites %rsi (RID2) and %rdx (RGI1) */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	/* store_cbc_8way() is given src again in addition to dst */
	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	FRAME_END
	RET;
SYM_FUNC_END(twofish_cbc_dec_8way)

SYM_FUNC_START(twofish_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	/* %r12 is callee-saved; it carries src across the call below */
	pushq %r12;

	/* the core overwrites %rsi (RID2) and %rdx (RGI1) */
	movq %rsi, %r11;
	movq %rdx, %r12;

	/* build the eight input blocks from the iv at (%rcx) */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX0, RX1, RY0);

	call __twofish_enc_blk8;

	/* store_ctr_8way() is given src (%r12) and dst (%r11) */
	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	popq %r12;

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ctr_8way)

SYM_FUNC_START(twofish_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* %rsi is overwritten by the core (RID2), so keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_xts_enc_8way)

SYM_FUNC_START(twofish_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* %rsi is overwritten by the core (RID2), so keep dst in %r11 */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_xts_dec_8way)