/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

/*
 * Register aliases.  This file uses AT&T operand order (source first,
 * destination last).
 */

/* Running sums, one register per NH pass; each holds two 64-bit lanes. */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3

/* Key strides; nh_sse2 rotates these between _nh_stride invocations. */
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7

/* Scratch registers. */
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/* The four arguments of nh_sse2(), in declaration order. */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride - hash one 16-byte message stride in all four NH passes
 *
 * k0, k1, k2:	registers holding the key strides used for passes 0, 1 and 2
 * k3:		register that receives the key stride loaded from offset(KEY);
 *		it is used for pass 3
 * offset:	byte displacement applied to both MESSAGE and KEY
 *
 * For each pass the four 32-bit message words are added (wrapping at 32
 * bits) to the four key words.  Words 0 and 2 of the result, and words 1
 * and 3, are then multiplied 32x32 => 64, and the two products are added to
 * the two 64-bit lanes of the matching PASSn_SUMS register.
 *
 * Clobbers T1-T7 and k0 (k0 is left holding the pass 0 products).  k1, k2
 * and k3 still hold key strides on exit, which is why the callers rotate the
 * K registers by one position between consecutive invocations.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate
	//
	// pmuludq only multiplies dwords 0 and 2 of its operands.  Shuffle
	// $0x10 places source dwords 0 and 1 there, and shuffle $0x32 places
	// source dwords 2 and 3 there, so the products are
	// (word0 * word2, word1 * word3).
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:	KEY (%rdi) = key, MESSAGE (%rsi) = message,
 *	MESSAGE_LEN (%rdx) = message_len, HASH (%rcx) = hash
 * Out:	32 bytes stored to hash: four 64-bit sums, pass 0 first
 *
 * Reads 0x30 + message_len bytes of key: three strides up front, plus one
 * more stride for every message stride processed.
 *
 * Modifies KEY, MESSAGE, MESSAGE_LEN, %xmm0-%xmm15 and the flags.
 */
SYM_FUNC_START(nh_sse2)

	// Preload the first three key strides.  KEY is advanced past them so
	// that offset(KEY) inside _nh_stride addresses the fourth key stride
	// needed for the message stride at offset(MESSAGE).
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// MESSAGE_LEN is kept biased by -0x40 while in the main loop, so the
	// sign of the subtraction result tells whether a full 64-byte group
	// remains.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	// Four strides per iteration; the K registers rotate by one position
	// each time and are back in their starting roles after four strides.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Masking with 0x3f undoes the -0x40 bias and leaves the number of
	// bytes still to hash.  Given message_len % 16 == 0 that is 0x00,
	// 0x10, 0x20 or 0x30, i.e. at most three more strides.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	//
	// Each PASSn_SUMS holds two partial sums: A in the low 64 bits and B
	// in the high 64 bits.  Interleave the A halves and the B halves of
	// neighbouring passes so that one paddq finishes two passes at once.
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)