/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

/*
 * Twofish block encryption/decryption for x86-64.
 * Syntax: GNU as, AT&T operand order (op src, dst), run through cpp.
 */

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/*
 * Byte offsets of the four 32-bit block words a, b, c, d.  They select the
 * matching whitening key relative to `w` (input) or `w+16` (output).
 * b_offset and d_offset are not referenced below: whitening is applied
 * with 64-bit XORs, so the pairs a/b and c/d are each handled in one go.
 */
#define a_offset        0
#define b_offset        4
#define c_offset        8
#define d_offset        12

/*
 * Structure of the crypto context struct (byte offsets from its start).
 * S0 sits at offset 0, which is why the round macros address it as
 * (%r11,%rdi,4) without a displacement.
 */

#define s0      0       /* S0 Array 256 Words each */
#define s1      1024    /* S1 Array */
#define s2      2048    /* S2 Array */
#define s3      3072    /* S3 Array */
#define w       4096    /* 8 whitening keys (word) */
#define k       4128    /* key 1-32 ( word ) */

/*
 * Register aliases to allow macro substitution: the round macros paste the
 * suffixes D (32-bit), B (low byte) and H (second byte) onto R0..R3.
 */

#define R0      %rax
#define R0D     %eax
#define R0B     %al
#define R0H     %ah

#define R1      %rbx
#define R1D     %ebx
#define R1B     %bl
#define R1H     %bh

#define R2      %rcx
#define R2D     %ecx
#define R2B     %cl
#define R2H     %ch

#define R3      %rdx
#define R3D     %edx
#define R3B     %dl
#define R3H     %dh


/*
 * performs input whitening:
 * src ^= whitening key at w+offset(context).  With a 64-bit src register
 * two consecutive 32-bit whitening keys are applied at once.
 */
#define input_whitening(src,context,offset)\
        xor     w+offset(context),      src;

/*
 * performs output whitening:
 * same as input_whitening, but uses the keys starting 16 bytes after `w`.
 */
#define output_whitening(src,context,offset)\
        xor     w+16+offset(context),   src;


/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round is the byte offset of this round's key pair relative to `k`
 * operations on a and b are interleaved to increase performance
 *
 * Expects the context address in %r11.
 * Scratch: %edi/%rdi (table index), %r8d (lookups keyed by b),
 *          %r9d (lookups keyed by a), flags.
 *
 * The two adds before the key additions leave %r9d = x + y and
 * %r8d = x + 2y (x, y being the a- and b-derived lookup results).
 *
 * On exit a is back in its unrotated form.  b has been rotated right by
 * 16 + 15 = 31 bits (i.e. rol $1) and c has been rotated by rol $15
 * (ror $1 followed by rol $16), so that c/d of this round can be passed
 * as a/b of the next round, and a/b as the next round's c/d.
 */
#define encrypt_round(a,b,c,d,round)\
        movzx   b ## B,         %edi;\
        mov     s1(%r11,%rdi,4),%r8d;\
        movzx   a ## B,         %edi;\
        mov     s2(%r11,%rdi,4),%r9d;\
        movzx   b ## H,         %edi;\
        ror     $16,            b ## D;\
        xor     s2(%r11,%rdi,4),%r8d;\
        movzx   a ## H,         %edi;\
        ror     $16,            a ## D;\
        xor     s3(%r11,%rdi,4),%r9d;\
        movzx   b ## B,         %edi;\
        xor     s3(%r11,%rdi,4),%r8d;\
        movzx   a ## B,         %edi;\
        xor     (%r11,%rdi,4),  %r9d;\
        movzx   b ## H,         %edi;\
        ror     $15,            b ## D;\
        xor     (%r11,%rdi,4),  %r8d;\
        movzx   a ## H,         %edi;\
        xor     s1(%r11,%rdi,4),%r9d;\
        add     %r8d,           %r9d;\
        add     %r9d,           %r8d;\
        add     k+round(%r11),  %r9d;\
        xor     %r9d,           c ## D;\
        rol     $15,            c ## D;\
        add     k+4+round(%r11),%r8d;\
        xor     %r8d,           d ## D;

/*
 * a input register containing a(rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round is the byte offset of this round's key pair relative to `k`
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * Expects the context address in %r11.
 * Scratch: %edi/%rdi, %r8d, %r9d, flags.
 *
 * Output: %r10 = (b << 32) ^ a, with b captured before it is rotated and
 * a captured after its ror $16 (i.e. both unrotated).  c only gets the
 * final ror $1 and b is not prepared for a following round.
 *
 * Note: unlike the other round macros, the body does not end in ';';
 * the invocation supplies the statement separator.
 */
#define encrypt_last_round(a,b,c,d,round)\
        mov     b ## D,         %r10d;\
        shl     $32,            %r10;\
        movzx   b ## B,         %edi;\
        mov     s1(%r11,%rdi,4),%r8d;\
        movzx   a ## B,         %edi;\
        mov     s2(%r11,%rdi,4),%r9d;\
        movzx   b ## H,         %edi;\
        ror     $16,            b ## D;\
        xor     s2(%r11,%rdi,4),%r8d;\
        movzx   a ## H,         %edi;\
        ror     $16,            a ## D;\
        xor     s3(%r11,%rdi,4),%r9d;\
        movzx   b ## B,         %edi;\
        xor     s3(%r11,%rdi,4),%r8d;\
        movzx   a ## B,         %edi;\
        xor     (%r11,%rdi,4),  %r9d;\
        xor     a,              %r10;\
        movzx   b ## H,         %edi;\
        xor     (%r11,%rdi,4),  %r8d;\
        movzx   a ## H,         %edi;\
        xor     s1(%r11,%rdi,4),%r9d;\
        add     %r8d,           %r9d;\
        add     %r9d,           %r8d;\
        add     k+round(%r11),  %r9d;\
        xor     %r9d,           c ## D;\
        ror     $1,             c ## D;\
        add     k+4+round(%r11),%r8d;\
        xor     %r8d,           d ## D

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round is the byte offset of this round's key pair relative to `k`
 * operations on a and b are interleaved to increase performance
 *
 * Expects the context address in %r11.
 * Scratch: %edi/%rdi (table index), %r8d (lookups keyed by b),
 *          %r9d (lookups keyed by a), flags.
 *
 * On exit b is back in its unrotated form.  a has been rotated right by
 * 16 + 15 = 31 bits (i.e. rol $1) and d has been rotated by rol $15
 * (ror $1 followed by rol $16), so that the register roles can be swapped
 * for the next round exactly as in the callers below.
 */
#define decrypt_round(a,b,c,d,round)\
        movzx   a ## B,         %edi;\
        mov     (%r11,%rdi,4),  %r9d;\
        movzx   b ## B,         %edi;\
        mov     s3(%r11,%rdi,4),%r8d;\
        movzx   a ## H,         %edi;\
        ror     $16,            a ## D;\
        xor     s1(%r11,%rdi,4),%r9d;\
        movzx   b ## H,         %edi;\
        ror     $16,            b ## D;\
        xor     (%r11,%rdi,4),  %r8d;\
        movzx   a ## B,         %edi;\
        xor     s2(%r11,%rdi,4),%r9d;\
        movzx   b ## B,         %edi;\
        xor     s1(%r11,%rdi,4),%r8d;\
        movzx   a ## H,         %edi;\
        ror     $15,            a ## D;\
        xor     s3(%r11,%rdi,4),%r9d;\
        movzx   b ## H,         %edi;\
        xor     s2(%r11,%rdi,4),%r8d;\
        add     %r8d,           %r9d;\
        add     %r9d,           %r8d;\
        add     k+round(%r11),  %r9d;\
        xor     %r9d,           c ## D;\
        add     k+4+round(%r11),%r8d;\
        xor     %r8d,           d ## D;\
        rol     $15,            d ## D;

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round is the byte offset of this round's key pair relative to `k`
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * Expects the context address in %r11.
 * Scratch: %edi/%rdi, %r8d, %r9d, flags.
 *
 * Output: %r10 = (b << 32) ^ a, where b is captured after its ror $16
 * (unrotated) and a before its ror $16 (unrotated).  d only gets the
 * final ror $1.
 */
#define decrypt_last_round(a,b,c,d,round)\
        movzx   a ## B,         %edi;\
        mov     (%r11,%rdi,4),  %r9d;\
        movzx   b ## B,         %edi;\
        mov     s3(%r11,%rdi,4),%r8d;\
        movzx   b ## H,         %edi;\
        ror     $16,            b ## D;\
        xor     (%r11,%rdi,4),  %r8d;\
        movzx   a ## H,         %edi;\
        mov     b ## D,         %r10d;\
        shl     $32,            %r10;\
        xor     a,              %r10;\
        ror     $16,            a ## D;\
        xor     s1(%r11,%rdi,4),%r9d;\
        movzx   b ## B,         %edi;\
        xor     s1(%r11,%rdi,4),%r8d;\
        movzx   a ## B,         %edi;\
        xor     s2(%r11,%rdi,4),%r9d;\
        movzx   b ## H,         %edi;\
        xor     s2(%r11,%rdi,4),%r8d;\
        movzx   a ## H,         %edi;\
        xor     s3(%r11,%rdi,4),%r9d;\
        add     %r8d,           %r9d;\
        add     %r9d,           %r8d;\
        add     k+round(%r11),  %r9d;\
        xor     %r9d,           c ## D;\
        add     k+4+round(%r11),%r8d;\
        xor     %r8d,           d ## D;\
        ror     $1,             d ## D;

/*
 * twofish_enc_blk - encrypt one 16-byte block.
 *
 * In:    %rdi = ctx address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address  (16 bytes are read)
 * Out:   %eax = 1
 * Saved: %rbx (R1) is pushed on entry and popped before returning.
 * Clobb: %rax, %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
SYM_FUNC_START(twofish_enc_blk)
        pushq   R1

        /* %rdi contains the ctx address */
        /* %rsi contains the output address */
        /* %rdx contains the input address */
        /* ctx address is moved to free one non-rex register
           as target for the 8bit high operations: %edi becomes the
           table index written by the movzx from %ah/%bh/%ch/%dh */
        mov     %rdi,           %r11

        movq    (R3),           R1      /* R1 = input bytes 0..7  (a, b) */
        movq    8(R3),          R3      /* R3 = input bytes 8..15 (c, d); input pointer no longer needed */
        input_whitening(R1,%r11,a_offset)
        input_whitening(R3,%r11,c_offset)
        mov     R1D,            R0D     /* R0 = a */
        rol     $16,            R0D     /* a enters the rounds rotated by 16 */
        shr     $32,            R1      /* R1 = b */
        mov     R3D,            R2D     /* R2 = c */
        shr     $32,            R3      /* R3 = d */
        rol     $1,             R3D     /* d enters the rounds already rol $1 */

        /* 16 rounds; the a/b and c/d register pairs swap roles each round */
        encrypt_round(R0,R1,R2,R3,0);
        encrypt_round(R2,R3,R0,R1,8);
        encrypt_round(R0,R1,R2,R3,2*8);
        encrypt_round(R2,R3,R0,R1,3*8);
        encrypt_round(R0,R1,R2,R3,4*8);
        encrypt_round(R2,R3,R0,R1,5*8);
        encrypt_round(R0,R1,R2,R3,6*8);
        encrypt_round(R2,R3,R0,R1,7*8);
        encrypt_round(R0,R1,R2,R3,8*8);
        encrypt_round(R2,R3,R0,R1,9*8);
        encrypt_round(R0,R1,R2,R3,10*8);
        encrypt_round(R2,R3,R0,R1,11*8);
        encrypt_round(R0,R1,R2,R3,12*8);
        encrypt_round(R2,R3,R0,R1,13*8);
        encrypt_round(R0,R1,R2,R3,14*8);
        encrypt_last_round(R2,R3,R0,R1,15*8);


        /* %r10 = (R3 << 32) ^ R2, assembled by the last round */
        output_whitening(%r10,%r11,a_offset)
        movq    %r10,           (%rsi)

        /* pack the other two words: R1 = (R1 << 32) ^ R0 */
        shl     $32,            R1
        xor     R0,             R1

        output_whitening(R1,%r11,c_offset)
        movq    R1,             8(%rsi)

        popq    R1
        movl    $1,%eax
        RET
SYM_FUNC_END(twofish_enc_blk)

/*
 * twofish_dec_blk - decrypt one 16-byte block.
 *
 * In:    %rdi = ctx address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address  (16 bytes are read)
 * Out:   %eax = 1
 * Saved: %rbx (R1) is pushed on entry and popped before returning.
 * Clobb: %rax, %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 *
 * Mirrors twofish_enc_blk: the input is XORed with the output-whitening
 * keys, the round keys are used from offset 15*8 down to 0, and the result
 * is XORed with the input-whitening keys.
 */
SYM_FUNC_START(twofish_dec_blk)
        pushq   R1

        /* %rdi contains the ctx address */
        /* %rsi contains the output address */
        /* %rdx contains the input address */
        /* ctx address is moved to free one non-rex register
           as target for the 8bit high operations: %edi becomes the
           table index written by the movzx from %ah/%bh/%ch/%dh */
        mov     %rdi,           %r11

        movq    (R3),           R1      /* R1 = input bytes 0..7  (a, b) */
        movq    8(R3),          R3      /* R3 = input bytes 8..15 (c, d); input pointer no longer needed */
        output_whitening(R1,%r11,a_offset)
        output_whitening(R3,%r11,c_offset)
        mov     R1D,            R0D     /* R0 = a */
        shr     $32,            R1      /* R1 = b */
        rol     $16,            R1D     /* b enters the rounds rotated by 16 */
        mov     R3D,            R2D     /* R2 = c */
        shr     $32,            R3      /* R3 = d */
        rol     $1,             R2D     /* c enters the rounds already rol $1 */

        /* 16 rounds in reverse key order; register pairs swap roles each round */
        decrypt_round(R0,R1,R2,R3,15*8);
        decrypt_round(R2,R3,R0,R1,14*8);
        decrypt_round(R0,R1,R2,R3,13*8);
        decrypt_round(R2,R3,R0,R1,12*8);
        decrypt_round(R0,R1,R2,R3,11*8);
        decrypt_round(R2,R3,R0,R1,10*8);
        decrypt_round(R0,R1,R2,R3,9*8);
        decrypt_round(R2,R3,R0,R1,8*8);
        decrypt_round(R0,R1,R2,R3,7*8);
        decrypt_round(R2,R3,R0,R1,6*8);
        decrypt_round(R0,R1,R2,R3,5*8);
        decrypt_round(R2,R3,R0,R1,4*8);
        decrypt_round(R0,R1,R2,R3,3*8);
        decrypt_round(R2,R3,R0,R1,2*8);
        decrypt_round(R0,R1,R2,R3,1*8);
        decrypt_last_round(R2,R3,R0,R1,0);

        /* %r10 = (R3 << 32) ^ R2, assembled by the last round */
        input_whitening(%r10,%r11,a_offset)
        movq    %r10,           (%rsi)

        /* pack the other two words: R1 = (R1 << 32) ^ R0 */
        shl     $32,            R1
        xor     R0,             R1

        input_whitening(R1,%r11,c_offset)
        movq    R1,             8(%rsi)

        popq    R1
        movl    $1,%eax
        RET
SYM_FUNC_END(twofish_dec_blk)