/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through the C
 * preprocessor first (hence the #define aliases below).
 */

#include <linux/linkage.h>

/* Incoming arguments of the generated transform functions. */
#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

/*
 * Initial homes of the five SHA-1 working variables.
 * REG_B, REG_C and REG_E are the low halves of BUF, CTX and CNT, so the
 * arguments have to be copied elsewhere before the working variables are
 * loaded.
 */
#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

/* Scalar scratch registers. */
#define REG_T1	%eax
#define REG_T2	%ebx

/* Long-lived pointers. */
#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

/* Vector scratch registers used by the message schedule. */
#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

/* Eight vector registers holding four consecutive w[] words each. */
#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

/* Byte-shuffle control mask, loaded once per call. */
#define XMM_SHUFB_BSWAP	%xmm10

/*
 * We keep a window of pre-calculated w[i]+K values in a circular buffer at
 * the bottom of the stack: WK(t) addresses dword slot (t mod 16), i.e. the
 * ring holds 16 values in 64 bytes.
 */
#define WK(t)	(((t) & 15) * 4)(%rsp)
/*
 * Number of rounds by which the w[i]+K pre-calculation runs ahead of the
 * scalar rounds (RR issues W_PRECALC for round + W_PRECALC_AHEAD).
 */
#define W_PRECALC_AHEAD	16

/*
 * SHA1_VECTOR_ASM name
 *
 * Emits a complete function "name" whose body hashes several 64-byte blocks:
 * the main body loops until BUFFER_PTR reaches BUF + CNT * 64.
 *
 * In:     CTX = pointer to five 32-bit hash words (read and updated)
 *         BUF = input data
 *         CNT = number of 64-byte blocks
 * Saved:  %rbx, %r12, %rbp are pushed on entry and popped before RET
 * Stack:  64 bytes of workspace below a 16-byte aligned %rsp (the WK ring),
 *         zeroed again before returning
 *
 * xmm_mov and the W_PRECALC_00_15 / _16_31 / _32_79 macros must be defined
 * before this macro is instantiated; they select the SSSE3 or AVX flavour.
 *
 * NOTE(review): the block loop is bottom-tested, so one block is read and
 * hashed even when CNT is 0 -- callers presumably guarantee CNT >= 1; confirm.
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	/* free the argument registers: they double as REG_C / REG_B / REG_E */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx		/* 8 qwords = the 64-byte WK ring */
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 per 64-byte block and loops over
 * the blocks until the end of the input is reached.
 *
 * The first W_PRECALC_AHEAD schedule steps are issued up front; after that
 * every RR interleaves two scalar rounds with two schedule steps, so the
 * schedule steps for rounds 80..95 already prepare rounds 0..15 of the next
 * block.
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

/*
 * NOTE(review): on x86 ELF targets .align takes a byte count, so this requests
 * 4-byte alignment of the loop head; confirm whether 16 bytes was intended.
 */
.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	/*
	 * From round 64 on (schedule step 80+) W_PRECALC loads the next
	 * block, so the pointer has to be advanced before the F4 rounds.
	 * After the last block the 64-byte K_XMM_AR table is read instead.
	 */
	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
	jne	1b
.endm

/*
 * Bind the assemble-time symbols A..E, T1, T2 to their initial physical
 * registers. The round macros re-bind these symbols instead of moving data.
 */
.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

/*
 * Move the working variables from the registers the symbols are bound to
 * after 80 rounds back into the fixed REG_* registers, which is what the code
 * at label 1 was assembled against.
 */
.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

/* Exchange the register bindings of two symbols; emits no instructions. */
.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

/*
 * F1: T1 = ((c ^ d) & b) ^ d
 * c is copied first and the names are swapped, so the symbol c keeps naming
 * an unmodified copy while the result is built in the register c used to
 * occupy (now called T1).
 */
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

/* F2: T1 = b ^ c ^ d, using the same copy-and-rename scheme on d. */
.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

/* F3: T1 = (b & c) | ((b | c) & d); T2 is clobbered as a second scratch. */
.macro F3  b, c ,d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

/* F4 is the same function as F2. */
.macro F4  b, c, d
	F2 \b, \c, \d
.endm

/* val += hash; hash = val  (hash is a memory operand, val a register) */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 *
 * The w(i) values are read from the WK ring with K already added. Each
 * invocation also issues the schedule steps for round + W_PRECALC_AHEAD and
 * round + W_PRECALC_AHEAD + 1.
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# ((a <<r 5) >>r 7) => (a <<r 30)

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

/*
 * W_PRECALC r
 *
 * Emit one quarter of a 4-wide message schedule step for round r
 * (the sub-step is selected by r & 3 inside the W_PRECALC_xx_yy macros).
 * Sets the symbols i (round number) and K_XMM (byte offset of the round
 * constant row in K_XMM_AR) for the macro it dispatches to.
 *
 * For r in 80 .. 80 + W_PRECALC_AHEAD - 1, i wraps to r % 80 and the step
 * belongs to rounds 0..15 of the next block. K_XMM is left unchanged in that
 * case; W_PRECALC_00_15 addresses (K_BASE) directly and does not use it.
 */
.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i<32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

/* Bind the W / W_minus_xx symbols to their initial vector registers. */
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

/*
 * Advance the window by four words: every W_minus_xx symbol takes over the
 * register of its newer neighbour and W re-uses the oldest register.
 * Emits no instructions.
 */
.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

/*
 * Invoking W_PRECALC_SSSE3 defines the SSSE3 schedule macros and points the
 * generic W_PRECALC_00_15 / _16_31 / _32_79 names at them.
 */
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/*
 * message scheduling pre-compute for rounds 0-15
 *
 * Loads 16 input bytes, byte-swaps each dword with the XMM_SHUFB_BSWAP mask,
 * keeps the words in W and stores w[i]+K1 to the WK ring.
 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
 * dependency, but improves for 32-79
 *
 * The top lane needs w[i-3] from lane 0 of the same vector. It is first
 * computed with a zero in that position; W_TMP2 then patches it by XORing
 * lane 0 of the unrotated value, rotated left by 2, into the top lane.
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


/* SHA-1 round constants for rounds 0-19, 20-39, 40-59 and 60-79. */
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

/*
 * One 16-byte row per round constant, replicated into all four lanes; rows
 * are selected with the K_XMM offsets 0/16/32/48. The 64-byte table is also
 * the dummy load source used once the last block has been reached.
 */
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

/* pshufb control mask: reverses the byte order within each 32-bit lane. */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

/* Select the SSSE3 schedule macros and an SSE unaligned vector move. */
W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

/*
 * Invoking W_PRECALC_AVX replaces the generic W_PRECALC_00_15 / _16_31 /
 * _32_79 macros with wrappers around the AVX variants defined below. The AVX
 * variants use three-operand VEX forms of the same schedule computation.
 */
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

/* message scheduling pre-compute for rounds 0-15 (AVX) */
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31 (AVX) */
.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79 (AVX) */
.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

/* Switch to the AVX schedule macros and a VEX-encoded unaligned move. */
W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx