1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
4  *
5  * Copyright 2018 Google LLC
6  *
7  * Author: Eric Biggers <ebiggers@google.com>
8  */
9 
10 #include <linux/linkage.h>
11 #include <linux/cfi_types.h>
12 
/*
 * Accumulators, one per NH pass.  Each register holds two 64-bit partial
 * sums that _nh_stride updates with paddq; nh_sse2 adds the two halves
 * together only once, just before storing the result.
 */
13 #define		PASS0_SUMS	%xmm0
14 #define		PASS1_SUMS	%xmm1
15 #define		PASS2_SUMS	%xmm2
16 #define		PASS3_SUMS	%xmm3
/*
 * Key strides (16 bytes, i.e. four 32-bit key words, each).  _nh_stride
 * expects three of them to be loaded already, loads the fourth itself and
 * overwrites the one used for pass 0, so its callers rotate these names on
 * every invocation.
 */
17 #define		K0		%xmm4
18 #define		K1		%xmm5
19 #define		K2		%xmm6
20 #define		K3		%xmm7
/*
 * Temporaries.  T1-T7 are scratch registers of _nh_stride; T0 and T1 are
 * also used by nh_sse2 for the final reduction of the accumulators.
 */
21 #define		T0		%xmm8
22 #define		T1		%xmm9
23 #define		T2		%xmm10
24 #define		T3		%xmm11
25 #define		T4		%xmm12
26 #define		T5		%xmm13
27 #define		T6		%xmm14
28 #define		T7		%xmm15
/*
 * Arguments of nh_sse2, in the order of its C prototype.  KEY, MESSAGE and
 * MESSAGE_LEN are modified in place as the function walks through its input;
 * HASH is only used as the base address of the final stores.
 */
29 #define		KEY		%rdi
30 #define		MESSAGE		%rsi
31 #define		MESSAGE_LEN	%rdx
32 #define		HASH		%rcx
33 
/*
 * _nh_stride - process one 16-byte message stride for all four NH passes
 *
 * k0, k1, k2:	registers that already hold the key strides to be used for
 *		passes 0, 1 and 2 of this message stride
 * k3:		register that receives the key stride loaded from offset(KEY);
 *		it is used for pass 3
 * offset:	byte offset applied to both MESSAGE and KEY
 *
 * For each pass the four 32-bit message words are added (mod 2^32) to the
 * four 32-bit key words, giving sums s0..s3.  The two 64-bit products s0*s2
 * and s1*s3 are then added to the two 64-bit lanes of that pass's
 * accumulator.
 *
 * On exit k0 has been overwritten, while k1, k2 and k3 still hold their key
 * strides.  The callers therefore rotate the register arguments by one
 * position (k1, k2, k3, k0) for the following stride.
 *
 * Clobbers T1-T7 and k0.  KEY and MESSAGE are not advanced here.
 */
34 .macro _nh_stride	k0, k1, k2, k3, offset
35 
36 	// Load next message stride (16 bytes, no alignment required)
37 	movdqu		\offset(MESSAGE), T1
38 
39 	// Load next key stride (16 bytes, no alignment required) for pass 3
40 	movdqu		\offset(KEY), \k3
41 
42 	// Add message words to key words: four 32-bit additions per pass.
	// T2 and T3 must be copied before T1 is overwritten by the pass 1 sum.
43 	movdqa		T1, T2		// message copy for pass 2
44 	movdqa		T1, T3		// message copy for pass 3
45 	paddd		T1, \k0    // pass 0 sum; reuse k0 to avoid a move
46 	paddd		\k1, T1		// pass 1 sum (message itself is no longer needed)
47 	paddd		\k2, T2		// pass 2 sum
48 	paddd		\k3, T3		// pass 3 sum
49 
50 	// Multiply 32x32 => 64 and accumulate.
	// pmuludq only multiplies the low 32 bits of each 64-bit lane, so the
	// sums (s0 s1 s2 s3) are first rearranged: selector 0x10 puts s0 and s1
	// into the low halves of the two lanes, selector 0x32 puts s2 and s3
	// there.  The product register then holds (s0*s2, s1*s3).
51 	pshufd		$0x10, \k0, T4
52 	pshufd		$0x32, \k0, \k0
53 	pshufd		$0x10, T1, T5
54 	pshufd		$0x32, T1, T1
55 	pshufd		$0x10, T2, T6
56 	pshufd		$0x32, T2, T2
57 	pshufd		$0x10, T3, T7
58 	pshufd		$0x32, T3, T3
59 	pmuludq		T4, \k0
60 	pmuludq		T5, T1
61 	pmuludq		T6, T2
62 	pmuludq		T7, T3
	// Add both 64-bit products of each pass to that pass's accumulator
63 	paddq		\k0, PASS0_SUMS
64 	paddq		T1, PASS1_SUMS
65 	paddq		T2, PASS2_SUMS
66 	paddq		T3, PASS3_SUMS
67 .endm
68 
69 /*
70  * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
71  *		__le64 hash[NH_NUM_PASSES])
72  *
73  * It's guaranteed that message_len % 16 == 0.
 *
 * Runs four NH passes over 'message' in parallel and stores the four 64-bit
 * results to hash[0..3].  This implementation always writes exactly four
 * values (32 bytes), using unaligned stores.
 *
 * The message is consumed in 16-byte strides.  For every stride, pass i uses
 * the key data located 16*i bytes after the key data used by pass 0, and the
 * key position advances by 16 bytes per stride.  In total 0x30 + message_len
 * bytes are read from 'key'; all key and message loads are unaligned loads.
 *
 * Clobbers %xmm0-%xmm15, %rdi, %rsi, %rdx and the flags.  No stack space is
 * used by the function itself.
74  */
75 SYM_TYPED_FUNC_START(nh_sse2)
76 
	// Preload the first three key strides.  KEY then points at the fourth
	// one, which is the stride the first _nh_stride invocation loads.
77 	movdqu		0x00(KEY), K0
78 	movdqu		0x10(KEY), K1
79 	movdqu		0x20(KEY), K2
80 	add		$0x30, KEY
	// Zero the four per-pass accumulators
81 	pxor		PASS0_SUMS, PASS0_SUMS
82 	pxor		PASS1_SUMS, PASS1_SUMS
83 	pxor		PASS2_SUMS, PASS2_SUMS
84 	pxor		PASS3_SUMS, PASS3_SUMS
85 
	// Main loop: four strides (0x40 bytes) per iteration.  MESSAGE_LEN is
	// kept biased by -0x40, so the sign of each subtraction result tells
	// whether another full iteration is available.  jl/jge are signed
	// tests, so this relies on message_len being less than 2^63.
86 	sub		$0x40, MESSAGE_LEN
87 	jl		.Lloop4_done
88 .Lloop4:
	// The key registers are rotated by one position per stride; after
	// four strides K0, K1 and K2 hold the pending key strides again, so
	// the loop body can simply be repeated.
89 	_nh_stride	K0, K1, K2, K3, 0x00
90 	_nh_stride	K1, K2, K3, K0, 0x10
91 	_nh_stride	K2, K3, K0, K1, 0x20
92 	_nh_stride	K3, K0, K1, K2, 0x30
93 	add		$0x40, KEY
94 	add		$0x40, MESSAGE
95 	sub		$0x40, MESSAGE_LEN
96 	jge		.Lloop4
97 
98 .Lloop4_done:
	// MESSAGE_LEN is negative here because of the -0x40 bias.  Masking
	// with 0x3f recovers the number of bytes still to be processed, which
	// is 0, 0x10, 0x20 or 0x30 given that message_len % 16 == 0.
99 	and		$0x3f, MESSAGE_LEN
100 	jz		.Ldone
	// Tail: up to three more strides.  KEY and MESSAGE are not advanced
	// any more; increasing offsets are used instead, and the key register
	// rotation continues exactly as in the main loop.
101 	_nh_stride	K0, K1, K2, K3, 0x00
102 
103 	sub		$0x10, MESSAGE_LEN
104 	jz		.Ldone
105 	_nh_stride	K1, K2, K3, K0, 0x10
106 
107 	sub		$0x10, MESSAGE_LEN
108 	jz		.Ldone
109 	_nh_stride	K2, K3, K0, K1, 0x20
110 
111 .Ldone:
112 	// Sum the accumulators for each pass, then store the sums to 'hash'.
	// Each PASSn_SUMS holds two partial sums: A in the low 64 bits and B
	// in the high 64 bits.  The unpack instructions gather the A halves
	// and the B halves of two passes into one register each, so a single
	// paddq yields the final sums of two passes at once.
113 	movdqa		PASS0_SUMS, T0
114 	movdqa		PASS2_SUMS, T1
115 	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
116 	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
117 	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
118 	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
119 	paddq		PASS0_SUMS, T0		// => (PASS0_SUM PASS1_SUM)
120 	paddq		PASS2_SUMS, T1		// => (PASS2_SUM PASS3_SUM)
121 	movdqu		T0, 0x00(HASH)		// hash[0], hash[1]
122 	movdqu		T1, 0x10(HASH)		// hash[2], hash[3]
123 	RET
124 SYM_FUNC_END(nh_sse2)
125