/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "twofish-x86_64-asm-3way.S"
.text

/*
 * Syntax: AT&T (GAS); the file is run through the C preprocessor first, so
 * every register and offset below is a cpp macro, not an assembler symbol.
 */

/*
 * structure of crypto context
 *
 * Byte offsets relative to CTX.
 *   s0..s3: lookup tables, indexed by one byte with a scale of 4 (see
 *           do16bit_ror), hence 256 * 4 = 1024 bytes apart.
 *   w:      words XORed into the data 64 bits at a time before and after
 *           the rounds (see inpack3 / outunpack3).
 *   k:      two 32-bit words read per round (see enc_round_end /
 *           dec_round_end).
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  3-way twofish
 **********************************************************************/

/* context pointer (first argument) and the input/output pointer */
#define CTX %rdi
#define RIO %rdx

/*
 * One 64-bit "ab" value per block, three blocks processed in parallel.
 * These must be %rax/%rbx/%rcx because do16bit_ror needs both the low
 * byte (bl suffix) and the second byte (bh suffix) of the register.
 */
#define RAB0 %rax
#define RAB1 %rbx
#define RAB2 %rcx

/* 32-bit views of RAB0..2 */
#define RAB0d %eax
#define RAB1d %ebx
#define RAB2d %ecx

/* bits 8..15 of RAB0..2 */
#define RAB0bh %ah
#define RAB1bh %bh
#define RAB2bh %ch

/* bits 0..7 of RAB0..2 */
#define RAB0bl %al
#define RAB1bl %bl
#define RAB2bl %cl

/*
 * Stack slots holding the "cd" value of each block while the rounds run.
 * Only valid between push_cd() and pop_cd(), and only while nothing else
 * has been pushed on top of them.
 */
#define CD0 0x0(%rsp)
#define CD1 0x8(%rsp)
#define CD2 0x10(%rsp)
/*
 * used only before/after all rounds
 * (same physical registers as RX0..2 below)
 */
#define RCD0 %r8
#define RCD1 %r9
#define RCD2 %r10

/* used only during rounds */
#define RX0 %r8
#define RX1 %r9
#define RX2 %r10

#define RX0d %r8d
#define RX1d %r9d
#define RX2d %r10d

#define RY0 %r11
#define RY1 %r12
#define RY2 %r13

#define RY0d %r11d
#define RY1d %r12d
#define RY2d %r13d

/*
 * Scratch registers.  RT0 is the same physical register as RIO (%rdx) and
 * RT1 is %rsi, so both incoming pointers are destroyed once the rounds
 * start.
 */
#define RT0 %rdx
#define RT1 %rsi

#define RT0d %edx
#define RT1d %esi

#define RT1bl %sil

/*
 * Look up the two low bytes of 'ab' in tables T0 and T1 and fold the
 * 32-bit entries into 'dst', rotating 'ab' right by 'rot' bits so that
 * the next invocation sees the next pair of bytes.
 *
 *   tmp2 <- bits 0..7  of ab   (index into T0)
 *   tmp1 <- bits 8..15 of ab   (index into T1)
 *   ab   <- ab ror rot
 *   dst  <- dst op1 T0[tmp2]   (op1 is mov or xor)
 *   dst  <- dst op2 T1[tmp1]
 *
 * tmp2 may name the same register as dst: with op1 == mov the index is
 * consumed by the address computation of the very instruction that
 * overwrites it.  Clobbers tmp1, tmp2 and flags.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $(rot),			ab; \
	op1##l T0(CTX, tmp2, 4),	dst ## d; \
	op2##l T1(CTX, tmp1, 4),	dst ## d;

/*
 * Exchange register 'ab' with memory operand 'cd' using 'tmp' as scratch.
 */
#define swap_ab_with_cd(ab, cd, tmp)	\
	movq cd, tmp;			\
	movq ab, cd;			\
	movq tmp, ab;

/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * For each of the three blocks, x and y are built from four table lookups
 * each.  The rotate counts applied to ab are 32 + 48 + 32 + 16 = 128, a
 * multiple of 64, so ab is back to its original value before it is swapped
 * with the cd stack slot.  On exit the ab registers hold the former cd
 * values and the cd slots hold the former ab values.
 *
 * Clobbers RT0, RT1 and flags.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	swap_ab_with_cd(ab ## 2, cd ## 2, RT0);

/*
 * Finish encryption round n for one block.  'ab' is the register that
 * g1g2_3 has just loaded with the block's other half.
 *
 *   x += y;  y += x;
 *   x += k[2n];  y += k[2n + 1];
 *   x  = ror32(x ^ low32(ab), 1);
 *   hi = rol32(high32(ab), 1) ^ y;
 *   ab = (hi << 32) | x;
 *
 * The final orq relies on the 32-bit operations having left the upper
 * half of x zero.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	xorl ab ## d,			x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	shrq $32,			ab; \
	roll $1,			ab ## d; \
	xorl y ## d,			ab ## d; \
	shlq $32,			ab; \
	rorl $1,			x ## d; \
	orq x,				ab;

/*
 * Finish decryption round n for one block.  Same shape as enc_round_end
 * with the roles of x and y exchanged in the last steps:
 *
 *   x += y;  y += x;
 *   x += k[2n];  y += k[2n + 1];
 *   y  = ror32(y ^ low32(ba), 1);
 *   hi = rol32(high32(ba), 1) ^ x;
 *   ba = (hi << 32) | y;
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	xorl ba ## d,			y ## d; \
	shrq $32,			ba; \
	roll $1,			ba ## d; \
	xorl x ## d,			ba ## d; \
	shlq $32,			ba; \
	rorl $1,			y ## d; \
	orq y,				ba;

/* One encryption round (round index n) over all three blocks. */
#define encrypt_round3(ab, cd, n) \
	g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(ab ## 0, RX0, RY0, n); \
	enc_round_end(ab ## 1, RX1, RY1, n); \
	enc_round_end(ab ## 2, RX2, RY2, n);

/*
 * One decryption round (round index n) over all three blocks.  g1g2_3 is
 * invoked with its x/y outputs exchanged and the table arguments rotated
 * accordingly.
 */
#define decrypt_round3(ba, dc, n) \
	g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(ba ## 0, RX0, RY0, n); \
	dec_round_end(ba ## 1, RX1, RY1, n); \
	dec_round_end(ba ## 2, RX2, RY2, n);

/*
 * Encryption cycle n = rounds 2n and 2n + 1.  The argument is
 * parenthesised so that an expression passed as n expands correctly.
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);
/*
 * Decryption cycle n = rounds 2n + 1 and 2n, in that order.  The argument
 * is parenthesised so that an expression passed as n expands correctly.
 */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));

/*
 * Spill RCD0..2 so that RCD0 ends up at 0(%rsp), i.e. in slot CD0, RCD1 in
 * CD1 and RCD2 in CD2.  This frees %r8-%r10 for use as RX0..2.
 */
#define push_cd()	\
	pushq RCD2;	\
	pushq RCD1;	\
	pushq RCD0;

/* Reload RCD0..2 from the CD0..2 slots and release the stack space. */
#define pop_cd()	\
	popq RCD0;	\
	popq RCD1;	\
	popq RCD2;

/*
 * Load one 64-bit half of each of the three blocks and XOR it with
 * context words w[m], w[m + 1].
 *
 *   in: pointer register; blocks are 16 bytes apart
 *   n:  offset of the half inside a block, in 32-bit words (0 or 2)
 *   xy: register name prefix, expands to xy0, xy1, xy2
 *   m:  index of the first of the two w words to apply
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;

/*
 * XOR one 64-bit half of each block with w[m], w[m + 1] and write it out.
 *
 *   op:  mov to overwrite the destination, xor to XOR into it
 *   out: pointer register; blocks are 16 bytes apart
 *   n:   offset of the half inside a block, in 32-bit words (0 or 2)
 *   xy:  register name prefix, expands to xy0, xy1, xy2
 *   m:   index of the first of the two w words to apply
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);

/* Encryption input: AB = in[0..7] ^ w[0..1], CD = in[8..15] ^ w[2..3]. */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/*
 * Encryption output, halves exchanged relative to the input:
 * out[8..15] = AB ^ w[6..7], out[0..7] = CD ^ w[4..5].
 */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);

/*
 * Decryption input: AB = in[0..7] ^ w[4..5], CD = in[8..15] ^ w[6..7],
 * each then rotated by 32 bits to exchange its two 32-bit words.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;

/*
 * Decryption output: undo the 32-bit word exchange, then
 * out[0..7] = CD ^ w[0..1], out[8..15] = AB ^ w[2..3].
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);

/*
 * Encrypt three consecutive 16-byte blocks (48 bytes read from src, 48
 * bytes written to dst).
 *
 * Saved and restored: %rbx, %r12, %r13.
 * Clobbered: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 */
SYM_FUNC_START(__twofish_enc_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/*
	 * %rcx doubles as RAB2 and %rsi as RT1, so both arguments must be
	 * parked on the stack before any data is loaded.
	 */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	inpack_enc3();

	/* CD0..2 are addressable only between push_cd() and pop_cd() */
	push_cd();
	encrypt_cycle3(RAB, CD, 0);
	encrypt_cycle3(RAB, CD, 1);
	encrypt_cycle3(RAB, CD, 2);
	encrypt_cycle3(RAB, CD, 3);
	encrypt_cycle3(RAB, CD, 4);
	encrypt_cycle3(RAB, CD, 5);
	encrypt_cycle3(RAB, CD, 6);
	encrypt_cycle3(RAB, CD, 7);
	pop_cd();

	/* src is no longer needed: RIO now carries dst */
	popq RIO; /* dst */
	popq RT1; /* bool xor */

	/* only the low byte of the flag is examined */
	testb RT1bl, RT1bl;
	jnz .L__enc_xor3;

	/* plain store of the result */
	outunpack_enc3(mov);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;

.L__enc_xor3:
	/* XOR the result into the existing contents of dst */
	outunpack_enc3(xor);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(__twofish_enc_blk_3way)

/*
 * Decrypt three consecutive 16-byte blocks (48 bytes read from src, 48
 * bytes written to dst).  Cycles run from 7 down to 0.
 *
 * Saved and restored: %rbx, %r12, %r13.
 * Clobbered: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 */
SYM_FUNC_START(twofish_dec_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rsi doubles as RT1 during the rounds */
	pushq %rsi; /* dst */

	inpack_dec3();

	/* CD0..2 are addressable only between push_cd() and pop_cd() */
	push_cd();
	decrypt_cycle3(RAB, CD, 7);
	decrypt_cycle3(RAB, CD, 6);
	decrypt_cycle3(RAB, CD, 5);
	decrypt_cycle3(RAB, CD, 4);
	decrypt_cycle3(RAB, CD, 3);
	decrypt_cycle3(RAB, CD, 2);
	decrypt_cycle3(RAB, CD, 1);
	decrypt_cycle3(RAB, CD, 0);
	pop_cd();

	/* src is no longer needed: RIO now carries dst */
	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(twofish_dec_blk_3way)