/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Structure of crypto context: byte offsets from the context pointer.
 *
 * The P-array comes first and occupies (16 + 2) entries of 4 bytes. It is
 * followed by four S-boxes of 256 four-byte entries each.
 * NOTE(review): these offsets presumably mirror the C-side context struct,
 * which is not visible in this file; confirm against its definition.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * Register macros.
 *
 * CTX is %r12, so every function that loads it has to preserve the
 * previous %r12 value.
 *
 * RIO (the current input/output pointer) and RT1 (a temporary) are both
 * %rsi: any macro that writes RT1 destroys RIO, which therefore has to be
 * reloaded before it is used again.
 */
#define CTX %r12
#define RIO %rsi

/* 64-bit cipher state, one register per block processed in parallel. */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

/* Low 32 bits of the state registers. */
#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

/* Bits 0-7 of the state registers. */
#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

/* Bits 8-15 of the state registers. */
#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* Temporaries (RT1 is the same register as RIO, see above). */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9
/* Low 32 bits of the temporaries; a write here zero-extends to 64 bits. */
#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/*
 * Round-key register used by the 4-way macros. The 1-way functions below
 * use %r10 directly to keep the destination pointer instead.
 */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on the 64-bit state in RX0.
 *
 * With a, b, c, d being the four bytes of the low 32 bits of RX0 (a is the
 * most significant), this computes, in 32-bit arithmetic,
 *	((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * then swaps the two 32-bit halves of RX0 and XORs the value into the new
 * low half. RT0d is only written with 32-bit operations, so the upper half
 * of RT0 is zero and the final 64-bit XOR leaves the new high half alone.
 *
 * Clobbers: RT0, RT1 (same register as RIO), RT2, flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * XOR two consecutive P-array entries into RX0 with a single 8-byte load:
 * entry n goes into the low half, entry n + 1 into the high half.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Key addition for entries n and n + 1 followed by two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Decryption counterpart of add_roundkey_enc(): loads entries n - 1 and n,
 * swaps them with the 32-bit rotate, so that entry n is XORed into the low
 * half of RX0 and entry n - 1 into the high half. Clobbers RT0.
 *
 * The argument is parenthesized so that an expression argument expands
 * with the intended precedence.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * Key addition for entries n and n - 1 followed by two F() steps.
 * The last line deliberately carries no line continuation.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load the 8-byte block at (RIO) into RX0. After the rotate and byte swap
 * the first four input bytes, read as a big-endian word, are in the low
 * half of RX0 and the last four input bytes are in the high half.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * Store RX0 to (RIO): the high half is written first, then the low half,
 * each in big-endian byte order.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Same byte layout as write_block(), but XORed into the 8 bytes at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);

/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output
 *
 * The caller's %r12 is parked in %r11 instead of on the stack, and dst is
 * parked in %r10 because F() overwrites %rsi. %rcx is not written by any
 * 1-way macro, so the xor flag is still intact when it is tested.
 *
 * Clobbers: %rax, %rdi, %rsi, %r8, %r10, %r11, flags.
 */
SYM_FUNC_START(__blowfish_enc_blk)
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* CTX is no longer needed: give the caller its %r12 back. */
	movq %r11, %r12;

	movq %r10, RIO;
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	RET;
.L__enc_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)
/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * Same register handling as the 1-way encryption routine: the caller's
 * %r12 is parked in %r11 and dst in %r10 while the rounds run. The P-array
 * is consumed from entry 17 down to entry 0.
 *
 * Clobbers: %rax, %rdi, %rsi, %r8, %r10, %r11, flags.
 */
SYM_FUNC_START(blowfish_dec_blk)
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/*
 * F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x is one of RX0..RX3; the byte views are formed by token pasting
 * (x ## bh, x ## bl). All four bytes of the low half of x are extracted
 * first; the two 16-bit rotates add up to a swap of the 32-bit halves.
 * The value ((s0[a] + s1[b]) ^ s2[c]) + s3[d] is then XORed into the new
 * low half (a is the most significant byte of the old low half).
 *
 * Clobbers: RT0, RT1 (same register as RIO), RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the round-key pair currently held in RKEY into all four states. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = P-array entries n (low half) and n + 1 (high half). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the key pair already in RKEY, then fetch the pair for the next
 * double round. n is the index of the pair being applied.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Key addition plus two F4() steps for each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = P-array entries n (low half) and n - 1 (high half). */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * Apply the key pair already in RKEY, then fetch the pair for the next
 * double round of decryption. n is the index of the pair being applied.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Key addition plus two F4() steps for each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, using the
 * same rotate and byte swap as the 1-way block load.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Byte-swap RX0..RX3 and store them as four consecutive blocks at (RIO). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* Byte-swap RX0..RX3 and XOR them into the four blocks at (RIO). */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored before
 * returning. %rcx is pushed as well because it doubles as RX2; the xor
 * flag is later popped into %r12, which is free once the rounds are done.
 * dst is kept in %r11 because F4() overwrites %rsi.
 *
 * Clobbers: %rax, %rcx, %rdx, %rdi, %rsi, %r8, %r9, %r10, %r11, flags.
 */
SYM_FUNC_START(__blowfish_enc_blk_4way)
	pushq %r12;
	pushq %rbx;
	pushq %rcx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* Pops the saved %rcx (the xor flag), not the caller's %r12. */
	popq %r12;
	movq %r11, RIO;

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)

/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored before
 * returning. dst is kept in %r11 because F4() overwrites %rsi.
 *
 * Clobbers: %rax, %rcx, %rdx, %rdi, %rsi, %r8, %r9, %r10, %r11, flags.
 */
SYM_FUNC_START(blowfish_dec_blk_4way)
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)