18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
48c2ecf20Sopenharmony_ci * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
58c2ecf20Sopenharmony_ci * processors. CPUs supporting Intel(R) AVX extensions will get an additional
68c2ecf20Sopenharmony_ci * boost.
78c2ecf20Sopenharmony_ci *
88c2ecf20Sopenharmony_ci * This work was inspired by the vectorized implementation of Dean Gaudet.
98c2ecf20Sopenharmony_ci * Additional information on it can be found at:
108c2ecf20Sopenharmony_ci *    http://www.arctic.org/~dean/crypto/sha1.html
118c2ecf20Sopenharmony_ci *
128c2ecf20Sopenharmony_ci * It was improved upon with more efficient vectorization of the message
138c2ecf20Sopenharmony_ci * scheduling. This implementation has also been optimized for all current and
148c2ecf20Sopenharmony_ci * several future generations of Intel CPUs.
158c2ecf20Sopenharmony_ci *
168c2ecf20Sopenharmony_ci * See this article for more information about the implementation details:
178c2ecf20Sopenharmony_ci *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
188c2ecf20Sopenharmony_ci *
198c2ecf20Sopenharmony_ci * Copyright (C) 2010, Intel Corp.
208c2ecf20Sopenharmony_ci *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
218c2ecf20Sopenharmony_ci *            Ronen Zohar <ronen.zohar@intel.com>
228c2ecf20Sopenharmony_ci *
238c2ecf20Sopenharmony_ci * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
248c2ecf20Sopenharmony_ci *   Author: Mathias Krause <minipli@googlemail.com>
258c2ecf20Sopenharmony_ci */
268c2ecf20Sopenharmony_ci
278c2ecf20Sopenharmony_ci#include <linux/linkage.h>
288c2ecf20Sopenharmony_ci
/* Function arguments as they arrive in registers */
#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

/*
 * Initial registers of the five SHA-1 working variables. REG_B, REG_C and
 * REG_E are the low halves of BUF, CTX and CNT, so the arguments are copied
 * to HASH_PTR, BUFFER_PTR and BUFFER_END before the working variables are
 * loaded.
 */
#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

/* Initial registers of the two scalar temporaries */
#define REG_T1	%eax
#define REG_T2	%ebx

/* Pointers that stay fixed (or advance per block) for the whole function */
#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

/* Vector temporaries of the message schedule pre-computation */
#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

/* Eight vector registers, each holding four consecutive w[] words */
#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

/* Holds the byte shuffle control loaded from BSWAP_SHUFB_CTL */
#define XMM_SHUFB_BSWAP	%xmm10

/*
 * we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a
 * circular buffer at the bottom of the stack; WK(t) addresses slot (t mod 16)
 */
#define WK(t)	(((t) & 15) * 4)(%rsp)
/* number of rounds the w[i]+K pre-calculation runs ahead of the rounds */
#define W_PRECALC_AHEAD	16
648c2ecf20Sopenharmony_ci
/*
 * This macro emits a complete SHA-1 transform function that processes one or
 * more consecutive 64-byte blocks.
 * param: function's name
 *
 * The generated function takes:
 *   CTX (arg1) - pointer to the five 32-bit hash state words, updated in place
 *   BUF (arg2) - pointer to the input data
 *   CNT (arg3) - number of 64-byte blocks, passed as a C int; the main body
 *                is a do-while loop, so at least one block is always read
 *
 * xmm_mov and the W_PRECALC_00_15/16_31/32_79 macros must already be defined
 * when this macro is expanded; they select the SSSE3 or AVX instruction forms.
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx			# callee-saved, used as REG_T2
	push	%r12			# callee-saved, used as REG_D
	push	%rbp
	mov	%rsp, %rbp		# remember the unaligned stack pointer

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	/*
	 * The block count is declared as int, so only the low 32 bits of CNT
	 * are defined on entry. Sign-extend it before it takes part in 64-bit
	 * pointer arithmetic; otherwise stale upper bits would corrupt
	 * BUFFER_END.
	 */
	movslq	%edx, CNT
	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END		# first byte past the input

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace: overwrite the 64-byte WK window with zeros
	mov	$8, %ecx		# 8 quadwords
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm
1068c2ecf20Sopenharmony_ci
/*
 * This macro implements the SHA-1 block loop: 80 rounds per 64-byte block,
 * repeated for every block from BUFFER_PTR up to BUFFER_END.
 *
 * Expects HASH_PTR, BUFFER_PTR, BUFFER_END, K_BASE and XMM_SHUFB_BSWAP to be
 * loaded and the WK() workspace at (%rsp) to be allocated.
 *
 * The w[i]+K values are computed W_PRECALC_AHEAD rounds ahead of their use:
 * the first 16 before the loop, the rest interleaved with the rounds by RR.
 * From round 64 on, RR already pre-computes rounds 0-15 of the next block.
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	/* load the current hash state into the working variables */
	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  /* prime the WK() window with w[0..15]+K of the first block */
  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

/*
 * NOTE(review): on x86 ELF targets gas reads the .align operand as a byte
 * count, so this requests 4-byte alignment - confirm whether 16 was intended.
 */
.align 4
1:
	/* rounds 0-19; each RR performs two rounds, the names rotate per call */
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	/* rounds 20-39 */
	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	/* rounds 40-59 */
	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	/*
	 * BUFFER_PTR has to point at the next block before round 64, where the
	 * pre-calculation starts loading input for the next iteration. After
	 * the last block it is redirected to K_BASE: the 64-byte K_XMM_AR
	 * table serves as harmless input and doubles as the end marker.
	 */
	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	/* rounds 60-79 */
	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	/* add the working variables to the hash state and store the sums */
	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	/* move the values back to the registers the loop top was assembled for */
	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
	jne	1b
.endm
1898c2ecf20Sopenharmony_ci
/*
 * Bind the symbolic register names used by the round macros to their
 * initial physical registers. The names are rebound at assembly time by
 * SWAP_REG_NAMES, so SHA1_PIPELINED_MAIN_BODY expands this first to start
 * every function body from the same mapping.
 */
.macro INIT_REGALLOC
  /* scalar temporaries */
  .set T1, REG_T1
  .set T2, REG_T2
  /* SHA-1 working variables */
  .set E, REG_E
  .set D, REG_D
  .set C, REG_C
  .set B, REG_B
  .set A, REG_A
.endm
1998c2ecf20Sopenharmony_ci
/*
 * Copy the working variables from the registers their names are bound to
 * after 80 rounds of renaming back into the initial registers REG_A..REG_E,
 * which is where the code at the top of the block loop expects them.
 * Emits instructions only; the name bindings themselves are not reset here.
 */
.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm
2078c2ecf20Sopenharmony_ci
/*
 * Exchange the registers that two symbolic names are bound to.
 * Pure assembly-time bookkeeping: no instruction is emitted.
 * _T is a scratch symbol used only by this macro.
 */
.macro SWAP_REG_NAMES  a, b
  .set _T, \b
  .set \b, \a
  .set \a, _T
.endm
2138c2ecf20Sopenharmony_ci
/*
 * F1 (rounds 0-19): T1 = ((c ^ d) & b) ^ d
 *
 * c is first copied into the T1 register, then the names c and T1 are
 * swapped: the copy becomes the new home of c and the former c register is
 * the scratch register that receives the result.
 */
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm
2218c2ecf20Sopenharmony_ci
/*
 * F2 (rounds 20-39, reused by F4): T1 = b ^ c ^ d
 *
 * d is copied into the T1 register and the names d and T1 are swapped, so
 * the copy becomes the new home of d and the former d register receives the
 * result.
 */
.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm
2288c2ecf20Sopenharmony_ci
/*
 * F3 (rounds 40-59): T1 = ((c | b) & d) | (b & c)
 *
 * c is copied into the T1 register and the names c and T1 are swapped, as
 * in F1. T2 is used as a second scratch register for the (b & c) term.
 */
.macro F3  b, c ,d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm
2388c2ecf20Sopenharmony_ci
/*
 * F4 (rounds 60-79): T1 = b ^ c ^ d
 *
 * Same function and same instruction sequence as F2: d is copied into the
 * T1 register, the names d and T1 are swapped, and the former d register
 * accumulates the result.
 */
.macro F4  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm
2428c2ecf20Sopenharmony_ci
/*
 * Add one stored hash word to a working variable and write the sum back.
 *   hash - memory operand of the hash state word
 *   val  - register holding the working variable; it keeps the sum, which
 *          is the starting value for the next block
 */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
2478c2ecf20Sopenharmony_ci
/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 * (the shifts above are rotates: rol/ror are used)
 *
 * params:
 *   F       - round function macro (F1..F4), leaves its result in T1
 *   a .. e  - symbolic names of the five working variables
 *   round   - number of the first of the two rounds
 *
 * w(i) is read from the WK() window, which already includes the round
 * constant. Two W_PRECALC steps for rounds (round + W_PRECALC_AHEAD) and the
 * one after it are interleaved with the scalar instructions.
 * The caller rotates the argument order between invocations as described at
 * the end of the macro.
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7) => a <<r 30)

	/* copy e and swap names: the copy is the new e, T1 gets rotated */
	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
2838c2ecf20Sopenharmony_ci
/*
 * Emit one step of the w[i]+K pre-calculation for round r.
 *
 * Sets the assembly-time symbols i (round number seen by the step macros)
 * and K_XMM (byte offset of the round constant vector in K_XMM_AR), then
 * dispatches to the step macro for the range r falls in. Four consecutive
 * steps (i & 3 == 0..3) together produce four w[] words.
 *
 * r in 80 .. 80 + W_PRECALC_AHEAD - 1 is mapped to rounds 0-15 of the next
 * block; larger values emit nothing.
 */
.macro W_PRECALC  r
  .set i, \r

  /* one round constant per 20 rounds; left unchanged for i >= 80 */
  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    /* start of a block: rebind the W names to their initial registers */
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i<32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm
3098c2ecf20Sopenharmony_ci
/*
 * Rebind the message schedule register names to their initial registers.
 * Expanded by W_PRECALC at round 0 of every block. W_minus_32 starts out as
 * an alias of W, so it has to be bound after W.
 */
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_32, W
  .set W_minus_28, W28
  .set W_minus_24, W24
  .set W_minus_20, W20
  .set W_minus_16, W16
  .set W_minus_12, W12
  .set W_minus_08, W8
  .set W_minus_04, W4
.endm
3218c2ecf20Sopenharmony_ci
/*
 * Advance the register window by four words (assembly-time renaming only).
 * Every W_minus_N name takes over the register of the next newer group,
 * W_minus_04 takes the register that was just written as W, and W is rebound
 * to the register holding the oldest group. W and W_minus_32 therefore name
 * the same register afterwards. The statement order is significant.
 */
.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm
3338c2ecf20Sopenharmony_ci
/*
 * Expanding this macro defines the SSSE3 flavour of the pre-calculation:
 * the W_PRECALC_00_15/16_31/32_79 dispatch macros used by W_PRECALC and the
 * step macros they forward to. Every step macro emits one quarter (selected
 * by i & 3) of the work needed for four w[] words.
 */
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/*
 * message scheduling pre-compute for rounds 0-15
 *
 * step 0: load 16 input bytes (unaligned load)
 * step 1: byte-swap each 32-bit word and keep the words in W
 * step 2: add the round constant at offset 0 of K_BASE
 * step 3: store w[i]+K to the WK() window and rotate the register names
 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
 * dependency, but improves for 32-79
 *
 * step 0: gather w[i-14], w[i-3] (top lane zero-filled by the shift) and
 *         xor in w[i-8]
 * step 1: xor in w[i-16]; keep two copies and move lane 0 to lane 3 in
 *         W_TMP2 for the missing w[i-3] term of the top lane
 * step 2: rotate every lane left by 1; prepare the rotate-left-by-2 halves
 *         of the moved lane
 * step 3: xor the halves in, keep the result in W, add K, store to WK()
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 *
 * step 0: xor w[i-28] into W (which still holds w[i-32]); build w[i-6] from
 *         the two newest register groups
 * step 1: xor in w[i-16] and w[i-6]
 * step 2: rotate every lane left by 2
 * step 3: keep the result in W, add K, store to WK(), rotate the names
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3
4308c2ecf20Sopenharmony_ci
4318c2ecf20Sopenharmony_ci
/* SHA-1 round constants, one per group of 20 rounds */
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

/*
 * Each round constant replicated into all four 32-bit lanes; W_PRECALC picks
 * one row through the K_XMM offset. The 64-byte table is also read as dummy
 * input after the last block, and its address marks the end of the loop.
 */
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

/* byte shuffle control that reverses the byte order of each 32-bit word */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
4518c2ecf20Sopenharmony_ci
4528c2ecf20Sopenharmony_ci
.section .text

/* define the SSSE3 flavour of the W_PRECALC step macros */
W_PRECALC_SSSE3
/* 128-bit unaligned move, legacy SSE encoding */
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 * data has to provide blocks * 64 bytes.
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3
4698c2ecf20Sopenharmony_ci
/*
 * Expanding this macro replaces the W_PRECALC_00_15/16_31/32_79 dispatch
 * macros with versions that forward to AVX step macros, and defines those
 * step macros. The previous definitions must exist, because they are removed
 * with .purgem first. The steps compute the same values as the SSSE3 ones,
 * using three-operand VEX instructions.
 */
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

/*
 * rounds 0-15: load 16 input bytes, byte-swap the words into W, add the
 * round constant at offset 0 of K_BASE, store w[i]+K to WK() and rotate
 */
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/*
 * rounds 16-31: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 for four
 * words at once; the top lane gets its missing w[i-3] term from lane 0,
 * moved up in W_TMP2 and rotated left by 2
 */
.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/*
 * rounds 32-79: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2,
 * then add K, store to WK() and rotate the register names
 */
.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX
5418c2ecf20Sopenharmony_ci
/* switch the W_PRECALC step macros over to the AVX flavour */
W_PRECALC_AVX
/* replace xmm_mov with the VEX-encoded unaligned move */
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 *
 * Same function body as sha1_transform_ssse3, assembled with the AVX
 * pre-calculation macros selected above.
 */
SHA1_VECTOR_ASM     sha1_transform_avx
554