/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * run through the C preprocessor (#include / #define / ## pasting).
 */

#include <linux/linkage.h>

.file "twofish-x86_64-asm-3way.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets relative to CTX:
 *   s0..s3  four tables, 1024 bytes apart; the round code indexes them with
 *           a zero-extended byte scaled by 4 (256 entries x 4 bytes each).
 *           Presumably the key-dependent S-box tables - confirm against the
 *           C definition of the context structure.
 *   w       32 bytes (w .. k); read as 64-bit words at w+4*m with
 *           m = 0, 2, 4, 6 by the input/output packing macros.
 *   k       32-bit words; round n reads k+4*(2n) and k+4*(2n+1).
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  3-way twofish
 **********************************************************************/

/*
 * CTX stays in %rdi for the whole function.
 * RIO is the current I/O pointer.  It shares %rdx with the round scratch
 * register RT0, so it is only valid before the rounds start and after it
 * has been reloaded once they are done.
 */
#define CTX %rdi
#define RIO %rdx

/*
 * RABn: 64-bit register holding the first 8 bytes of block n (after
 * whitening).  Only %rax/%rbx/%rcx are used here because the round code
 * needs both the low-byte (%al..) and the high-byte (%ah..) sub-registers.
 * NOTE: %rbx is callee-saved and %rcx carries the 4th argument on entry.
 */
#define RAB0 %rax
#define RAB1 %rbx
#define RAB2 %rcx

/* 32-bit views of RABn */
#define RAB0d %eax
#define RAB1d %ebx
#define RAB2d %ecx

/* bits 8..15 of RABn */
#define RAB0bh %ah
#define RAB1bh %bh
#define RAB2bh %ch

/* bits 0..7 of RABn */
#define RAB0bl %al
#define RAB1bl %bl
#define RAB2bl %cl

/*
 * CDn: 8-byte stack slots at 0, 8 and 16 bytes above %rsp.  They are only
 * meaningful between push_cd() and pop_cd(), which push/pop RCD2, RCD1,
 * RCD0 so that RCD0 lands in CD0.
 */
#define CD0 0x0(%rsp)
#define CD1 0x8(%rsp)
#define CD2 0x10(%rsp)
/*
 * RCDn and RXn are two names for the same registers (%r8..%r10).  The RCD
 * names are valid only outside the rounds; during the rounds the values live
 * in the CDn stack slots and the registers serve as RXn temporaries.
 */

/* used only before/after all rounds */
#define RCD0 %r8
#define RCD1 %r9
#define RCD2 %r10

/* used only during rounds */
#define RX0 %r8
#define RX1 %r9
#define RX2 %r10

#define RX0d %r8d
#define RX1d %r9d
#define RX2d %r10d

/* RY1/RY2 are %r12/%r13, which are callee-saved under the SysV ABI */
#define RY0 %r11
#define RY1 %r12
#define RY2 %r13

#define RY0d %r11d
#define RY1d %r12d
#define RY2d %r13d

/*
 * Round scratch registers.  RT0 is %rdx (same register as RIO) and RT1 is
 * %rsi (2nd argument register), so both argument values are lost once the
 * rounds start.
 */
#define RT0 %rdx
#define RT1 %rsi

#define RT0d %edx
#define RT1d %esi

#define RT1bl %sil

/*
 * do16bit_ror - two byte-indexed table lookups, then rotate the source.
 *
 *   tmp2 = byte 0 of ab;  tmp1 = byte 1 of ab;
 *   ab   = ror64(ab, rot);
 *   dst  = dst <op1> T0[tmp2];		(32-bit load from T0(CTX, tmp2, 4))
 *   dst  = dst <op2> T1[tmp1];		(32-bit load from T1(CTX, tmp1, 4))
 *
 * op1/op2 are mnemonic stems (mov, xor); the "l" suffix is pasted on here.
 * tmp1/tmp2/dst are 64-bit register macro names; "d" is pasted on for the
 * 32-bit view.  tmp2 may be the same register as dst: the index is consumed
 * by the very load that overwrites it.
 * tmp1 is filled from a high-byte register (%ah/%bh/%ch), which cannot be
 * combined with a REX-encoded register; every user in this file passes RT0.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $(rot),			ab; \
	op1##l T0(CTX, tmp2, 4),	dst ## d; \
	op2##l T1(CTX, tmp1, 4),	dst ## d;

/* Exchange ab and cd through tmp; every user passes a CDn stack slot as cd. */
#define swap_ab_with_cd(ab, cd, tmp)	\
	movq cd, tmp;			\
	movq ab, cd;			\
	movq tmp, ab;

/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * For each of the three blocks n = 0..2, with lo/hi the low/high 32 bits of
 * ab##n and b0..b3 the bytes of a half (least significant first):
 *
 *   x##n = Tx0[lo.b0] ^ Tx1[lo.b1] ^ Tx2[lo.b2] ^ Tx3[lo.b3]
 *   y##n = Ty1[hi.b0] ^ Ty2[hi.b1] ^ Ty3[hi.b2] ^ Ty0[hi.b3]
 *
 * The four rotates per block add up to 32 + 48 + 32 + 16 = 128 bits, so
 * ab##n is back to its original value before it is swapped with cd##n.
 * On exit register ab##n holds the previous cd##n and stack slot cd##n
 * holds the previous ab##n.
 *
 * Clobbers: RT0, RT1, x##0..2, y##0..2, flags.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	swap_ab_with_cd(ab ## 2, cd ## 2, RT0);

/*
 * enc_round_end - mix x/y with the round keys and fold them into ab.
 *
 * "ab" is the register named ab; after g1g2_3 it holds the swapped-in half.
 *
 *   x += y;  y += x;
 *   x += k[2n];  y += k[2n+1];			(32-bit adds)
 *   lo(ab) = ror32(lo(ab) ^ x, 1);
 *   hi(ab) = rol32(hi(ab), 1) ^ y;
 *
 * The 32-bit xorl/rorl on x leave bits 32..63 of x zero, so the final orq
 * only fills the low half of ab.  Clobbers x, y and flags.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	xorl ab ## d,			x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	shrq $32,			ab; \
	roll $1,			ab ## d; \
	xorl y ## d,			ab ## d; \
	shlq $32,			ab; \
	rorl $1,			x ## d; \
	orq x,				ab;

/*
 * dec_round_end - counterpart of enc_round_end with the roles of x and y
 * exchanged when folding into ba:
 *
 *   x += y;  y += x;
 *   x += k[2n];  y += k[2n+1];
 *   lo(ba) = ror32(lo(ba) ^ y, 1);
 *   hi(ba) = rol32(hi(ba), 1) ^ x;
 *
 * Clobbers x, y and flags.
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	xorl ba ## d,			y ## d; \
	shrq $32,			ba; \
	roll $1,			ba ## d; \
	xorl x ## d,			ba ## d; \
	shlq $32,			ba; \
	rorl $1,			y ## d; \
	orq y,				ba;

/* One encryption round (round-key pair n) on all three blocks. */
#define encrypt_round3(ab, cd, n) \
	g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(ab ## 0, RX0, RY0, n); \
	enc_round_end(ab ## 1, RX1, RY1, n); \
	enc_round_end(ab ## 2, RX2, RY2, n);

/*
 * One decryption round (round-key pair n) on all three blocks.  Compared to
 * encrypt_round3 the table arguments are permuted and g1g2_3 receives RY as
 * its "x" and RX as its "y" destination.
 */
#define decrypt_round3(ba, dc, n) \
	g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(ba ## 0, RX0, RY0, n); \
	dec_round_end(ba ## 1, RX1, RY1, n); \
	dec_round_end(ba ## 2, RX2, RY2, n);

/*
 * Two consecutive encryption rounds: 2n, then 2n+1.
 * n is expanded without parentheses, so pass a plain integer literal.
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, n*2); \
	encrypt_round3(ab, cd, (n*2)+1);

/*
 * Two consecutive decryption rounds: 2n+1, then 2n.
 * n is expanded without parentheses, so pass a plain integer literal.
 */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, (n*2)+1); \
	decrypt_round3(ba, dc, (n*2));

/*
 * Spill RCD0..RCD2 to the stack.  RCD0 is pushed last, so afterwards
 * 0(%rsp), 8(%rsp) and 16(%rsp) hold RCD0, RCD1 and RCD2 respectively.
 */
#define push_cd()	\
	pushq RCD2;	\
	pushq RCD1;	\
	pushq RCD0;

/* Reload RCD0..RCD2 from the three slots written by push_cd(). */
#define pop_cd()	\
	popq RCD0;	\
	popq RCD1;	\
	popq RCD2;

/*
 * inpack3 - load one 8-byte half of three blocks and XOR in a w[] word pair.
 *
 * The three loads are 16 bytes apart, starting at byte offset 4*n from "in".
 * Each is XORed with the 64-bit value at w+4*m(CTX).
 * m is expanded without parentheses; pass a plain integer literal.
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*m(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*m(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*m(CTX),		xy ## 2;

/*
 * outunpack3 - XOR in a w[] word pair and write one 8-byte half of three
 * blocks.  "op" is a mnemonic stem: mov stores the result, xor XORs it into
 * the 8 bytes already at the destination.  Modifies xy##0..2.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*m(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*m(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*m(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);

/* RABn = bytes 0..7 ^ w[0..1];  RCDn = bytes 8..15 ^ w[2..3] */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/* bytes 8..15 <op>= RABn ^ w[6..7];  bytes 0..7 <op>= RCDn ^ w[4..5] */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);

/*
 * RABn = ror64(bytes 0..7 ^ w[4..5], 32);
 * RCDn = ror64(bytes 8..15 ^ w[6..7], 32);
 * The rotate by 32 exchanges the two 32-bit words of each register.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;

/*
 * bytes 0..7  = ror64(RCDn, 32) ^ w[0..1];
 * bytes 8..15 = ror64(RABn, 32) ^ w[2..3];
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);

/*
 * __twofish_enc_blk_3way - encrypt three consecutive 16-byte blocks.
 *
 * Register arguments follow the SysV AMD64 order (see "input" below).
 * Saves and restores the callee-saved registers it uses (%rbx, %r12, %r13).
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 * Makes no calls.
 *
 * Stack while the rounds run, from %rsp upwards (8 bytes each):
 *   CD0, CD1, CD2, dst, bool xor, saved %rbx, saved %r12, saved %r13
 */
SYM_FUNC_START(__twofish_enc_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rcx doubles as RAB2 and %rsi as RT1, so park both arguments */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	inpack_enc3();

	/* %r8-%r10 become RX0-RX2 from here on; the CD halves live on the stack */
	push_cd();
	encrypt_cycle3(RAB, CD, 0);
	encrypt_cycle3(RAB, CD, 1);
	encrypt_cycle3(RAB, CD, 2);
	encrypt_cycle3(RAB, CD, 3);
	encrypt_cycle3(RAB, CD, 4);
	encrypt_cycle3(RAB, CD, 5);
	encrypt_cycle3(RAB, CD, 6);
	encrypt_cycle3(RAB, CD, 7);
	pop_cd();

	/* RIO now points at the destination, not the source */
	popq RIO; /* dst */
	popq RT1; /* bool xor */

	/* only the low byte of the bool argument is examined */
	testb RT1bl, RT1bl;
	jnz .L__enc_xor3;

	outunpack_enc3(mov);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;

.L__enc_xor3:
	/* XOR the result into the bytes already present at dst */
	outunpack_enc3(xor);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(__twofish_enc_blk_3way)

/*
 * twofish_dec_blk_3way - decrypt three consecutive 16-byte blocks.
 *
 * Register arguments follow the SysV AMD64 order (see "input" below).
 * Saves and restores the callee-saved registers it uses (%rbx, %r12, %r13).
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 * Makes no calls.
 *
 * Stack while the rounds run, from %rsp upwards (8 bytes each):
 *   CD0, CD1, CD2, dst, saved %rbx, saved %r12, saved %r13
 */
SYM_FUNC_START(twofish_dec_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rsi doubles as RT1 during the rounds */
	pushq %rsi; /* dst */

	inpack_dec3();

	/* cycles run from 7 down to 0, and each cycle runs round 2n+1 before 2n */
	push_cd();
	decrypt_cycle3(RAB, CD, 7);
	decrypt_cycle3(RAB, CD, 6);
	decrypt_cycle3(RAB, CD, 5);
	decrypt_cycle3(RAB, CD, 4);
	decrypt_cycle3(RAB, CD, 3);
	decrypt_cycle3(RAB, CD, 2);
	decrypt_cycle3(RAB, CD, 1);
	decrypt_cycle3(RAB, CD, 0);
	pop_cd();

	/* RIO now points at the destination, not the source */
	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(twofish_dec_blk_3way)