/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets relative to the context pointer: the `p` array of
 * 16 + 2 four-byte entries comes first, followed by the four lookup
 * tables s0..s3 of 256 four-byte entries each.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * CTX is the base register for every p/s0..s3 access.  RIO is the pointer
 * the block read/write macros go through; note that it is the same
 * register as RT1 (%rsi), so any code that uses RT1 destroys RIO.
 */
#define CTX %r12
#define RIO %rsi

/*
 * Block state registers.  They have to be %rax/%rbx/%rcx/%rdx because the
 * round functions address the second-lowest byte through the legacy
 * high-byte registers (%ah, %bh, %ch, %dh).
 */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

/* 32-bit views of the block state registers */
#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

/* lowest byte (bits 0..7) of the block state registers */
#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

/* second-lowest byte (bits 8..15) of the block state registers */
#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* temporaries; RT0 is %rdi and RT1 is %rsi (the same register as RIO) */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9
/* 32-bit views of the temporaries RT0..RT3 */
#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/* holds the round key that the 4-way code loads ahead of its use */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one round step on the 64-bit state in RX0.
 *
 * With b3..b0 being the bytes of the low 32 bits of RX0 (b3 most
 * significant), computes
 *	f = ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0]	(32-bit arithmetic)
 * then swaps the two 32-bit halves of RX0 and XORs f into the new low
 * half.  The rorq/rolq $16 pair only brings b3/b2 into %ah/%al
 * temporarily.
 *
 * Clobbers RT0, RT1 (and therefore RIO), RT2 and the flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * XOR the eight bytes at p[n] into RX0: the low half of RX0 receives
 * p[n] and the high half p[n+1].
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX), 	RX0;

/* Key addition with p[n]/p[n+1] followed by two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Decryption counterpart of add_roundkey_enc(): loads p[n-1]/p[n] and
 * swaps the two words, so the low half of RX0 receives p[n] and the high
 * half p[n-1].  Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Key addition with p[n]/p[n-1] followed by two F() steps. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load one 8-byte block from (RIO) into RX0.  After the half swap and the
 * 64-bit byte swap, the low half of RX0 holds input bytes 0..3 and the
 * high half input bytes 4..7, each read as a big-endian 32-bit word.
 */
#define read_block() \
	movq (RIO), 		RX0; \
	rorq $32, 		RX0; \
	bswapq 			RX0;

/* Byte-swap RX0 and store it as one 8-byte block at (RIO). */
#define write_block() \
	bswapq 			RX0; \
	movq RX0, 		(RIO);

/*
 * blowfish_enc_blk: process one 8-byte block with the encryption round
 * sequence (p[0] upwards).
 *
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.  %r12 is used
 * as CTX and is preserved by parking it in %r11 instead of on the stack.
 */
SYM_FUNC_START(blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %r12, %r11;	/* keep the caller's %r12 in a scratch register */

	movq %rdi, CTX;
	movq %rsi, %r10;	/* park dst: %rsi is RIO/RT1 and F() overwrites it */
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);	/* final key addition with p[16]/p[17] */

	movq %r11, %r12;	/* CTX is not needed any more */
	movq %r10, RIO;		/* RIO = dst */

	write_block();
	RET;
SYM_FUNC_END(blowfish_enc_blk)

/*
 * blowfish_dec_blk: process one 8-byte block with the decryption round
 * sequence (p[17] downwards).
 *
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.  %r12 is used
 * as CTX and is preserved by parking it in %r11 instead of on the stack.
 */
SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %r12, %r11;	/* keep the caller's %r12 in a scratch register */

	movq %rdi, CTX;
	movq %rsi, %r10;	/* park dst: %rsi is RIO/RT1 and F() overwrites it */
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);	/* final key addition with p[1]/p[0] */

	movq %r10, RIO;		/* RIO = dst */
	write_block();

	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3.  Same result as F() on x: the two
 * rorq $16 rotate x by 32 bits in total, i.e. swap its halves, and the
 * four table indices are picked up along the way.
 * Clobbers RT0..RT3 (and therefore RIO) and the flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the round key held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = p[n] (low half) / p[n+1] (high half). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* Apply the already loaded key, then fetch the key of the next round pair. */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* Key addition followed by two F4() steps on each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = p[n] (low half) / p[n-1] (high half). */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the already loaded key, then fetch the key of the next round pair. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* Key addition followed by two F4() steps on each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, each
 * converted the same way as in read_block().
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq 			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq 			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq 			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq 			RX3;

/* Byte-swap RX0..RX3 and store them as four consecutive blocks at (RIO). */
#define write_block4() \
	bswapq 			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq 			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq 			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq 			RX3; \
	movq RX3,		24(RIO);

/*
 * XOR the byte-swapped 8-byte words at RIO+0, RIO+8 and RIO+16 into RX1,
 * RX2 and RX3; RX0 is left untouched.  Because write_block4() byte-swaps
 * again before storing, output block i (i = 1..3) ends up XORed with the
 * raw bytes of block i-1 at RIO.  Clobbers RT0, RT2 and RT3.
 */
#define xor_block4() \
	movq (RIO),		RT0; \
	bswapq 			RT0; \
	xorq RT0,		RX1; \
	\
	movq 8(RIO),		RT2; \
	bswapq 			RT2; \
	xorq RT2,		RX2; \
	\
	movq 16(RIO),		RT3; \
	bswapq 			RT3; \
	xorq RT3,		RX3;

/*
 * blowfish_enc_blk_4way: process four consecutive 8-byte blocks with the
 * encryption round sequence.
 *
 * %rbx (RX1) and %r12 (CTX) are saved on the stack and restored.
 * Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r8..%r11 and the flags.
 */
SYM_FUNC_START(blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	movq %rsi, %r11;	/* park dst: %rsi is RIO/RT1 and F4() overwrites it */
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();	/* p[16]/p[17], loaded by round_enc4(14) */

	movq %r11, RIO;		/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(blowfish_enc_blk_4way)

/*
 * __blowfish_dec_blk_4way: process four consecutive 8-byte blocks with the
 * decryption round sequence.  When cbc is true, output blocks 1..3 are
 * additionally XORed with source blocks 0..2 (see xor_block4()); output
 * block 0 is never XORed here.  All reads of src happen before the first
 * write to dst.
 *
 * %rbx (RX1) and %r12 (CTX) are saved on the stack and restored.
 * Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r8..%r11 and the flags.
 */
SYM_FUNC_START(__blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: cbc (bool)
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;		/* cbc: %rcx is RX2 and gets overwritten */
	pushq %rdx;		/* src: %rdx is RX3 and gets overwritten */

	movq %rdi, CTX;
	movq %rsi, %r11;	/* park dst: %rsi is RIO/RT1 and F4() overwrites it */
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();	/* p[1]/p[0], loaded by round_dec4(3) */

	popq RIO;		/* RIO = src (pushed last) */
	popq %r12;		/* %r12 = cbc; CTX is no longer needed */
	/*
	 * cbc is a C bool: only the low byte of the argument register is
	 * defined by the calling convention, so test just that byte rather
	 * than all 64 bits.
	 */
	testb %r12b, %r12b;
	jz .L_no_cbc_xor;

	xor_block4();

.L_no_cbc_xor:
	movq %r11, RIO;		/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)