/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

/* Incoming arguments (only valid until the scalar state is loaded) */
#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

/*
 * Physical registers of the five SHA-1 working variables. Note that REG_B,
 * REG_C and REG_E alias the low halves of BUF, CTX and CNT, which is why the
 * arguments are copied to HASH_PTR/BUFFER_PTR/BUFFER_END first.
 */
#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

/* Scalar scratch registers */
#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

/* Vector scratch registers */
#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

/* Eight vectors holding the last 32 message schedule words, 4 per register */
#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/*
 * we keep a window of 16 pre-calculated w[i]+K values (16 * 4 = 64 bytes) in
 * a circular buffer at the bottom of the stack frame; WK(t) addresses the
 * slot of round t
 */
#define WK(t)	(((t) & 15) * 4)(%rsp)
/* number of rounds the vector pre-calculation runs ahead of the scalar code */
#define W_PRECALC_AHEAD	16

/*
 * This macro emits a complete SHA-1 transform function that processes CNT
 * consecutive 64-byte blocks.
 * param: function's name
 *
 * In:    CTX = pointer to the five 32-bit hash words (updated in place)
 *        BUF = pointer to the input data
 *        CNT = number of 64-byte blocks; must be >= 1, because the main body
 *              pre-computes from the first block and runs one full pass
 *              before it tests for the end of the buffer
 * Saved: %rbx, %r12, %rbp
 * Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, %xmm0-%xmm10, flags
 * Stack: 64 bytes of workspace, 16-byte aligned, zeroed again before return
 *
 * The macros xmm_mov, W_PRECALC_00_15, W_PRECALC_16_31 and W_PRECALC_32_79
 * select the instruction set and must be defined when this macro is expanded.
 *
 * NOTE(review): CNT is used as a full 64-bit register while the C prototypes
 * documented in this file declare "int blocks" - confirm that callers always
 * pass a value whose upper 32 bits in %rdx are clean.
 */
.macro SHA1_VECTOR_ASM  name
	SYM_TYPED_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp		# %rbp remembers the unaligned %rsp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END		# BUFFER_END = BUF + 64 * CNT

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx		# 8 quadwords = the 64-byte WK() window
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 per 64-byte block and loops over
 * all blocks from BUFFER_PTR up to BUFFER_END
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	/* load the five hash words into the working variables */
	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	/* fill the WK() window for rounds 0-15 of the first block */
  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

/*
 * NOTE(review): on x86 ELF targets ".align 4" requests 4-byte alignment; if
 * 16-byte alignment of the loop head was intended this would have to be
 * ".p2align 4" - confirm before changing.
 */
.align 4
1:
	/*
	 * Each RR performs two rounds. Instead of moving values, the five
	 * register names are passed in rotated order from one RR to the next.
	 */
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	/*
	 * Rounds 64-79 below pre-compute rounds 0-15 of the following block
	 * (W_PRECALC is called with round + W_PRECALC_AHEAD), so BUFFER_PTR
	 * has to point at the next input before they start.
	 */
	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	/* add the working variables into the hash and keep the sums live */
	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
	jne	1b
.endm

/* Bind the symbolic names A-E, T1 and T2 to their initial registers. */
.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

/*
 * The code at label 1 was assembled with the initial name-to-register
 * binding, while SWAP_REG_NAMES has re-bound the names during the 80 rounds.
 * Move the live values back into the initial registers before looping.
 */
.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

/*
 * Exchange the registers two symbolic names stand for. This is pure
 * assembly-time renaming and emits no instructions.
 */
.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

/*
 * T1 = ((c ^ d) & b) ^ d
 * c is first copied and the names swapped, so the name c ends up on the copy
 * and the computation is carried out in the register c used to occupy.
 */
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

/* T1 = b ^ c ^ d, with d preserved through the same copy-and-rename scheme */
.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

/* T1 = ((b | c) & d) | (b & c); uses T2 as a second scratch register */
.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

/* the last 20 rounds use the same function as rounds 20-39 */
.macro F4  b, c, d
	F2 \b, \c, \d
.endm

/* val += hash; hash = val  (hash is a memory operand, val a register) */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 *
 * The two W_PRECALC invocations interleave one step each of the vector
 * message scheduling for rounds (round + W_PRECALC_AHEAD) and the one after
 * it with the scalar instructions.
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7) => a <<r 30)

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

/*
 * Emit one step of the message schedule pre-calculation for round r.
 * Four consecutive steps ((r & 3) == 0..3) together produce four w[] values,
 * add the round constant and store the result to the WK() window.
 *
 * K_XMM is set to the byte offset of the round constant row in K_XMM_AR.
 * Values of r from 80 up to (80 + W_PRECALC_AHEAD - 1) wrap around to rounds
 * 0-15 of the next block. The assembler symbol i is overwritten.
 */
.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i<32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

/* Bind the schedule names W, W_minus_04 ... W_minus_32 to their initial registers. */
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

/*
 * Advance the schedule window by four words: every name moves on to the
 * register of its newer neighbour and W takes over the oldest register.
 * Assembly-time renaming only, no instructions are emitted.
 */
.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

/*
 * Expanding this macro defines the SSSE3 flavour of the three schedule
 * macros that W_PRECALC dispatches to.
 */
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
 * dependency, but improves for 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


/* SHA-1 round constants for rounds 0-19, 20-39, 40-59 and 60-79 */
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

/*
 * Each round constant replicated into all four 32-bit lanes, one 16-byte row
 * per constant. The 64 bytes of this table also serve as the dummy input the
 * main body reads from once the last real block has been consumed.
 */
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

/* pshufb control mask: reverses the byte order within each 32-bit lane */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

/* select the SSSE3 schedule macros and the legacy SSE unaligned move */
W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

/*
 * Expanding this macro replaces the three schedule macros with their
 * VEX-encoded (AVX) flavour.
 */
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

/* message scheduling pre-compute for rounds 0-15 (AVX) */
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31 (AVX) */
.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79 (AVX) */
.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

/* switch to the AVX schedule macros and the VEX-encoded unaligned move */
W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx