/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

/*
 * Twofish block encryption/decryption for x86-64.
 *
 * Syntax: GNU as, AT&T operand order (op src, dst).  The file is run through
 * the C preprocessor first; the round macros below rely on token pasting
 * (a ## B, a ## H, a ## D) to select the 8-bit low, 8-bit high and 32-bit
 * views of the register aliases R0..R3.
 *
 * Register usage inside both functions:
 *   %r11        context base address (copied from %rdi)
 *   %edi/%rdi   scratch: zero-extended byte used as table index
 *   %r8d, %r9d  scratch: accumulated table lookups for the two round halves
 *   %r10        first 64 bits of the result, assembled in the last round
 *   R0..R3      the four 32-bit state words (%rax, %rbx, %rcx, %rdx)
 */

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/* Byte offsets of the four 32-bit words inside a group of whitening keys */
#define a_offset	0
#define	b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct (byte offsets from its start) */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/*
 * define a few register aliases to allow macro substitution
 *
 * Rn is the 64-bit register, RnD its low 32 bits, RnB its low byte and
 * RnH the second byte (bits 8..15).
 */

#define R0     %rax
#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1     %rbx
#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2     %rcx
#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3     %rdx
#define R3D    %edx
#define R3B    %dl
#define R3H    %dh


/*
 * performs input whitening
 *
 * XORs the whitening key at w + offset (first 16 bytes of the whitening
 * key area) into src.  The operand width is that of src.
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/*
 * performs output whitening
 *
 * XORs the whitening key at w + 16 + offset (second 16 bytes of the
 * whitening key area) into src.  The operand width is that of src.
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;


/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's two 32-bit keys relative to k
 * operations on a and b are interleaved to increase performance
 *
 * %r9d collects the four table lookups indexed by the bytes of a, %r8d the
 * four indexed by the bytes of b.  They are then combined (%r9d += %r8d,
 * %r8d += %r9d), the round keys are added, and the results are XORed into
 * c and d.
 *
 * On exit b has been rotated right by 16 + 15 bits (i.e. rol $1) and c has
 * been rotated left by 15, which is the form the next round expects when
 * the register pairs swap roles (c,d become a,b and a,b become c,d).
 * Clobbers %rdi, %r8, %r9 and the flags.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;

/*
 * a input register containing a(rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's two 32-bit keys relative to k
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * Same lookups as encrypt_round, but b is saved into the upper half of
 * %r10 before it is rotated, and a is XORed into the lower half once its
 * 16-bit rotation has been undone, so %r10 = (b << 32) | a on exit.
 * c is rotated right by 1 (instead of left by 15) and b is not rotated
 * by a further 15, because no round follows.
 * Clobbers %rdi, %r8, %r9, %r10 and the flags.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's two 32-bit keys relative to k
 * operations on a and b are interleaved to increase performance
 *
 * %r9d collects the four table lookups indexed by the bytes of a, %r8d the
 * four indexed by the bytes of b.  They are then combined (%r9d += %r8d,
 * %r8d += %r9d), the round keys are added, and the results are XORed into
 * c and d.
 *
 * On exit a has been rotated right by 16 + 15 bits (i.e. rol $1), b has
 * had its 16-bit rotation undone, and d has been rotated left by 15, which
 * is the form the next round expects when the register pairs swap roles.
 * Clobbers %rdi, %r8, %r9 and the flags.
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's two 32-bit keys relative to k
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * Same lookups as decrypt_round.  Once the 16-bit rotation of b has been
 * undone, and before a is rotated, the two are packed as
 * %r10 = (b << 32) | a.  d is rotated right by 1 (instead of left by 15)
 * and a is not rotated by a further 15, because no round follows.
 * Clobbers %rdi, %r8, %r9, %r10 and the flags.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * In:    %rdi = ctx address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address  (16 bytes are read)
 * Out:   %eax = 1
 * Saves: %rbx (pushed on entry, popped before return)
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
SYM_FUNC_START(twofish_enc_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two 64-bit halves and whiten each in one xor */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	/* split into four words: R0 = a (rotated 16), R1 = b, R2 = c,
	   R3 = d (rol $1), as encrypt_round expects */
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D

	/* 16 rounds; the (a,b) and (c,d) register pairs swap roles each round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);


	/* %r10 = (R3 << 32) | R2 was assembled by encrypt_last_round */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* pack the remaining two words: R1 = (R1 << 32) | R0 */
	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)

/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * In:    %rdi = ctx address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address  (16 bytes are read)
 * Out:   %eax = 1
 * Saves: %rbx (pushed on entry, popped before return)
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 *
 * Mirrors twofish_enc_blk: the output whitening keys are applied to the
 * input, the round keys are used from offset 15*8 down to 0, and the input
 * whitening keys are applied to the result.
 */
SYM_FUNC_START(twofish_dec_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two 64-bit halves and whiten each in one xor */
	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	/* split into four words: R0 = a, R1 = b (rotated 16),
	   R2 = c (rol $1), R3 = d, as decrypt_round expects */
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	/* 16 rounds; the (a,b) and (c,d) register pairs swap roles each round */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* %r10 = (R3 << 32) | R2 was assembled by decrypt_last_round */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* pack the remaining two words: R1 = (R1 << 32) | R0 */
	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)