/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

/* Incoming arguments (first three integer argument registers). */
#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

/*
 * Initial homes of the five SHA-1 working variables.  REG_B, REG_C and
 * REG_E are the low halves of the argument registers above, so the
 * arguments must be copied elsewhere before the hash state is loaded.
 * REG_D lives in callee-saved %r12.
 */
#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

/* Scalar scratch registers; REG_T2 lives in callee-saved %rbx. */
#define REG_T1	%eax
#define REG_T2	%ebx

/* Long-lived pointers, kept clear of the working variables above. */
#define K_BASE		%r8	// address of K_XMM_AR
#define HASH_PTR	%r9	// copy of CTX
#define BUFFER_PTR	%r10	// block the message schedule reads from
#define BUFFER_END	%r11	// BUF + 64 * CNT

/* Vector scratch registers for the message schedule. */
#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

/* Eight vectors of four schedule words each: a 32-word sliding window. */
#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

/* pshufb control vector (loaded from BSWAP_SHUFB_CTL). */
#define XMM_SHUFB_BSWAP	%xmm10

/*
 * Pre-calculated w[t]+K values are kept in a 16-entry circular buffer of
 * 32-bit words (64 bytes) at the bottom of the stack frame; WK(t) addresses
 * the slot for round t.
 */
#define WK(t)	(((t) & 15) * 4)(%rsp)
/* How many rounds the vector schedule runs ahead of the scalar rounds. */
#define W_PRECALC_AHEAD	16

/*
 * SHA1_VECTOR_ASM - emit one complete SHA-1 transform function.
 *
 * param: name - symbol name of the function to generate
 *
 * Generated function:
 *   In:   CTX = pointer to the five 32-bit state words
 *         BUF = pointer to the input data
 *         CNT = number of 64-byte blocks (a C int: only the low 32 bits
 *               are meaningful)
 *   Out:  state words at CTX updated in place; nothing is returned
 *   Uses: 64 bytes of 16-byte aligned stack for the WK() buffer, wiped
 *         before returning
 *
 * The flavour (SSSE3 or AVX) is decided by whichever xmm_mov and
 * W_PRECALC_* macros are defined at the point of instantiation.
 *
 * A block count of zero returns immediately without touching the state or
 * reading the buffer.
 */
.macro SHA1_VECTOR_ASM  name
	SYM_TYPED_FUNC_START(\name)

	/*
	 * The count is passed as a 32-bit int, so the upper half of the
	 * 64-bit register is not guaranteed to be zero.  Clear it before it
	 * is used in 64-bit pointer arithmetic below.
	 */
	mov	%edx, %edx
	test	%edx, %edx
	jz	2f			# no blocks: state stays untouched

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	/* Free the argument registers; they double as REG_B/REG_C/REG_E. */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	/* Zero the 8 quadwords (64 bytes) of WK() so no w[t]+K data remains. */
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
2:
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * SHA1_PIPELINED_MAIN_BODY - the per-block loop.
 *
 * Loads the five state words from (HASH_PTR), pre-computes w[t]+K for the
 * first W_PRECALC_AHEAD rounds of the first block, then executes the loop at
 * label 1 once per 64-byte block.  One pass issues 40 RR invocations (two
 * rounds each, 80 rounds in total).  Every RR also advances the message
 * schedule W_PRECALC_AHEAD rounds ahead of the scalar rounds, so rounds
 * 64..79 already prepare rounds 0..15 of the following block.
 *
 * The argument lists rotate by two names from one RR to the next because
 * each RR writes its two new values into the registers passed as e and d.
 *
 * Loop control: after round 59 BUFFER_PTR is advanced by 64.  If that
 * reaches BUFFER_END it is replaced by K_BASE, so the look-ahead loads for a
 * non-existent next block read the constant table instead of running past
 * the input.  BUFFER_PTR == K_BASE is then the exit condition.
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

/*
 * NOTE(review): on x86 ELF targets .align takes a byte count, so this asks
 * for 4-byte alignment of the loop head; confirm whether 16 bytes was meant.
 */
.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	/* Fold the block result into the state and keep it in registers. */
	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	/* The moves below leave the flags alone; cmp sets them afresh. */
	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
	jne	1b
.endm

/*
 * INIT_REGALLOC - bind the symbolic names A..E, T1 and T2 to their initial
 * physical registers.  The names are assembler symbols, not registers: the
 * round macros re-point them with SWAP_REG_NAMES as they go.  Emits no code.
 */
.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

/*
 * RESTORE_RENAMED_REGS - move the working variables back into the physical
 * registers that INIT_REGALLOC assigns, so the loop head (assembled with the
 * initial name bindings) finds them where it expects.
 *
 * Only mov is used, so the flags are left untouched.  There is no move for
 * C: per the note below it already sits in REG_C at this point.
 */
.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

/*
 * SWAP_REG_NAMES - exchange the registers two symbolic names refer to.
 *
 * params: a, b - symbol names (for example C and T1)
 *
 * Assembly-time only: no instruction is emitted.  The symbol _T is used as
 * scratch and is overwritten.
 */
.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

/*
 * F1 - T1 = ((c ^ d) & b) ^ d, i.e. b selects between c and d bitwise.
 *
 * params: b, c, d - symbolic register names
 *
 * The mov copies c into the scratch register and SWAP_REG_NAMES then
 * exchanges the two names.  From here on the name c refers to the untouched
 * copy, while T1 refers to the original register, which the xor/and/xor
 * sequence overwrites with the result.  Clobbers flags.
 */
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

/*
 * F2 - T1 = b ^ c ^ d.
 *
 * params: b, c, d - symbolic register names
 *
 * d is copied into the scratch register and the names d and T1 are
 * exchanged, so d survives in the copy while the original register becomes
 * T1 and receives the result.  Clobbers flags.
 */
.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

/*
 * F3 - T1 = ((b | c) & d) | (b & c), the bitwise majority of b, c and d.
 *
 * params: b, c, d - symbolic register names
 *
 * c is copied into the scratch register and the names c and T1 are
 * exchanged, so c survives in the copy while the original register becomes
 * T1.  T2 holds the intermediate b & c.  Clobbers T2 and flags.
 */
.macro F3  b, c ,d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

/*
 * F4 - same function as F2 (b ^ c ^ d); kept as a separate name so the
 * round list can label rounds 60..79 distinctly.
 */
.macro F4  b, c, d
	F2 \b, \c, \d
.endm

/*
 * UPDATE_HASH - add a stored state word to a working variable and write the
 * sum back.
 *
 * params: hash - memory operand of the state word
 *         val  - register holding the working variable
 *
 * Afterwards both the memory word and val hold the new state word, so the
 * next block can continue from the register without reloading.
 */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 *
 * (all shifts above are rotates; w(i) stands for the stored w[i]+K value)
 *
 * params: F         - round function macro (F1..F4)
 *         a,b,c,d,e - symbolic names of the working variables
 *         round     - number of the first of the two rounds
 *
 * Two W_PRECALC steps are interleaved with the scalar code, preparing the
 * schedule for rounds round+W_PRECALC_AHEAD and round+W_PRECALC_AHEAD+1.
 *
 * On exit the registers named e and d hold the two newly computed values,
 * and a and b have each been rotated left by 30.  The F macros and the final
 * mov/SWAP_REG_NAMES pair re-point some names to other physical registers.
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# ((a <<r 5) >>r 7) => (a <<r 30)

	/* Keep e in a copy and rotate the original in place as t1. */
	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

/*
 * W_PRECALC - emit one step of the message schedule for round r.
 *
 * param: r - round number, 0 .. 80 + W_PRECALC_AHEAD - 1
 *
 * Sets the assembler symbol i to the round and K_XMM to the byte offset of
 * that round's constant inside K_XMM_AR (0/16/32/48 for rounds 0-19, 20-39,
 * 40-59, 60-79), then dispatches to the matching W_PRECALC_* variant.
 *
 * Rounds 80 and above are rounds 0..15 of the following block: i is reduced
 * modulo 80 and, at i == 0, the W register names are reset first.  K_XMM is
 * left at its previous value for those rounds.
 */
.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i<32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

/*
 * W_PRECALC_RESET - bind the sliding-window names to their initial XMM
 * registers at the start of a block.  W and W_minus_32 both name W0.
 * Assembly-time only: no instruction is emitted.
 */
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

/*
 * W_PRECALC_ROTATE - slide the window by one vector (four rounds).
 *
 * Every W_minus_N name takes over the register of its younger neighbour and
 * the vector just produced becomes W_minus_04.  W is re-pointed at the
 * register that previously carried the W_minus_28 name (now W_minus_32),
 * which is where the next vector will be built.  Assembly-time only: no
 * instruction is emitted.
 */
.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

/*
 * W_PRECALC_SSSE3 - invoking this macro defines the SSSE3 message-schedule
 * macros and binds the generic names W_PRECALC_00_15, W_PRECALC_16_31 and
 * W_PRECALC_32_79 to them.
 *
 * Each *_SSSE3 macro is expanded once per round and emits the quarter of a
 * four-round vector computation selected by (i & 3); the fourth quarter
 * stores four w[t]+K values into the WK() buffer and rotates the window.
 */
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/*
 * message scheduling pre-compute for rounds 0-15
 *
 * load 16 bytes of input (unaligned), byte-swap each 32-bit word, keep the
 * result in W and store w[i]+K1
 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
 * dependency, but improves for 32-79
 *
 * The w[i-3] vector is built with its top lane zero (psrldq), because that
 * lane would need w[i], which is only being computed here.  The missing
 * term is patched in afterwards from lane 0 of the un-rotated sum: it is
 * moved to the top lane (pslldq $12) and rotated left by 2 in W/W_TMP2.
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


/* SHA-1 round constants for rounds 0-19, 20-39, 40-59 and 60-79. */
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

/*
 * Each constant replicated into all four lanes of a 16-byte row, so it can
 * be added to four schedule words at once; rows are selected with the K_XMM
 * offset.  The table is exactly 64 bytes, which also lets it serve as the
 * dummy input block once the real input is exhausted.
 */
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

/* pshufb control that reverses the byte order inside each 32-bit lane. */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

/* Select the SSSE3 message-schedule macros for the function below. */
W_PRECALC_SSSE3
/* xmm_mov: unaligned 128-bit move, legacy SSE encoding. */
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

/*
 * W_PRECALC_AVX - invoking this macro replaces the generic W_PRECALC_00_15,
 * W_PRECALC_16_31 and W_PRECALC_32_79 macros with AVX versions and defines
 * the *_AVX macros they expand to.  The generic names must already exist,
 * since they are purged first.
 *
 * The AVX variants compute the same schedule as the SSSE3 ones, using
 * three-operand VEX instructions, which removes most register copies.  As
 * before, each expansion emits the quarter selected by (i & 3) and the last
 * quarter stores four w[t]+K values into WK() and rotates the window.
 */
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

/* rounds 0-15: load input, byte-swap each word into W, store w[i]+K1 */
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/*
 * rounds 16-31: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1, with the
 * top lane of the w[i-3] vector zeroed and its term patched in afterwards
 * from lane 0 (shifted to the top lane and rotated left by 2).
 */
.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* rounds 32-79: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 */
.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

/* Switch the message-schedule macros to their AVX versions. */
W_PRECALC_AVX
/* xmm_mov: unaligned 128-bit move, now VEX encoded. */
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx