/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

/*
 * Register aliases used throughout this file (AT&T syntax).
 *
 * The *_XMM names refer to the low 128 bits of the matching %ymm register;
 * a VEX-encoded write to the XMM name zeroes the upper 128 bits.
 */

/* Per-pass accumulators: four 64-bit partial sums per register */
#define		PASS0_SUMS	%ymm0
#define		PASS1_SUMS	%ymm1
#define		PASS2_SUMS	%ymm2
#define		PASS3_SUMS	%ymm3

/* Key words; which register feeds which pass rotates between strides */
#define		K0		%ymm4
#define		K0_XMM		%xmm4
#define		K1		%ymm5
#define		K1_XMM		%xmm5
#define		K2		%ymm6
#define		K2_XMM		%xmm6
#define		K3		%ymm7
#define		K3_XMM		%xmm7

/* Temporaries; T3 also carries the message words into _nh_2xstride */
#define		T0		%ymm8
#define		T1		%ymm9
#define		T2		%ymm10
#define		T2_XMM		%xmm10
#define		T3		%ymm11
#define		T3_XMM		%xmm11
#define		T4		%ymm12
#define		T5		%ymm13
#define		T6		%ymm14
#define		T7		%ymm15

/* Arguments of nh_avx2(), in System V AMD64 integer-argument order */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_2xstride k0, k1, k2, k3
 *
 * Run all four NH passes over the message words held in T3 (one 256-bit
 * register, i.e. two 16-byte strides) and accumulate the 64-bit products.
 *
 * In:       T3           = message words
 *           \k0 .. \k3   = key words for pass 0 .. pass 3
 * Out:      PASSn_SUMS  += products of pass n (n = 0..3)
 * Clobbers: T0-T7.  T3 is overwritten, so the caller must reload the
 *           message before invoking the macro again.
 */
.macro _nh_2xstride	k0, k1, k2, k3

	// Add message words to key words
	// (the \k3 add must come last: it overwrites the message in T3)
	vpaddd		\k0, T3, T0
	vpaddd		\k1, T3, T1
	vpaddd		\k2, T3, T2
	vpaddd		\k3, T3, T3

	// Multiply 32x32 => 64 and accumulate
	// vpmuludq only reads dwords 0 and 2 of each 128-bit lane, so shuffle
	// dwords (0, 1) into those slots for one operand ($0x10) and dwords
	// (2, 3) for the other ($0x32); the products are then d0*d2 and d1*d3.
	vpshufd		$0x10, T0, T4
	vpshufd		$0x32, T0, T0
	vpshufd		$0x10, T1, T5
	vpshufd		$0x32, T1, T1
	vpshufd		$0x10, T2, T6
	vpshufd		$0x32, T2, T2
	vpshufd		$0x10, T3, T7
	vpshufd		$0x32, T3, T3
	vpmuludq	T4, T0, T0
	vpmuludq	T5, T1, T1
	vpmuludq	T6, T2, T2
	vpmuludq	T7, T3, T3
	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
.endm

/*
 * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:       KEY (%rdi), MESSAGE (%rsi), MESSAGE_LEN (%rdx), HASH (%rcx)
 * Out:      the four 64-bit pass sums (32 bytes) are stored to 'hash'
 * Clobbers: %ymm0-%ymm15, KEY, MESSAGE, MESSAGE_LEN, flags.
 *           vzeroupper is executed before returning, so the upper halves of
 *           the ymm registers are left zeroed.
 */
SYM_FUNC_START(nh_avx2)

	// Key loads are 32 bytes wide at 16-byte offsets, so consecutive key
	// registers overlap by 16 bytes.
	vmovdqu		0x00(KEY), K0
	vmovdqu		0x10(KEY), K1
	add		$0x20, KEY
	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS

	// Main loop: 0x40 message bytes (4 strides) per iteration.
	// MESSAGE_LEN is kept biased by -0x40 while the loop runs.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3

	// Second half: the key registers rotate, so K2/K3 now feed passes
	// 0/1 and the freshly loaded K0/K1 feed passes 2/3.
	vmovdqu		0x20(MESSAGE), T3
	vmovdqu		0x20(KEY), K0
	vmovdqu		0x30(KEY), K1
	_nh_2xstride	K2, K3, K0, K1

	add		$0x40, MESSAGE
	add		$0x40, KEY
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// The low 6 bits are unaffected by the -0x40 bias, so this recovers
	// the number of bytes still to be processed (0x00 .. 0x30).
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone

	cmp		$0x20, MESSAGE_LEN
	jl		.Llast

	// 2 or 3 strides remain; do 2 more.
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3
	add		$0x20, MESSAGE
	add		$0x20, KEY
	sub		$0x20, MESSAGE_LEN
	jz		.Ldone
	// One stride left: .Llast expects the pass 0/1 keys in K0/K1.
	vmovdqa		K2, K0
	vmovdqa		K3, K1
.Llast:
	// Last stride.  Zero the high 128 bits of the message and keys so they
	// don't affect the result when processing them like 2 strides.
	// (VEX-encoded writes to an xmm register clear bits 255:128, which is
	// why the register-to-itself moves below are not no-ops.)
	vmovdqu		(MESSAGE), T3_XMM
	vmovdqa		K0_XMM, K0_XMM
	vmovdqa		K1_XMM, K1_XMM
	vmovdqu		0x00(KEY), K2_XMM
	vmovdqu		0x10(KEY), K3_XMM
	_nh_2xstride	K0, K1, K2, K3

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'

	// PASS0_SUMS is (0A 0B 0C 0D)
	// PASS1_SUMS is (1A 1B 1C 1D)
	// PASS2_SUMS is (2A 2B 2C 2D)
	// PASS3_SUMS is (3A 3B 3C 3D)
	// We need the horizontal sums:
	//     (0A + 0B + 0C + 0D,
	//	1A + 1B + 1C + 1D,
	//	2A + 2B + 2C + 2D,
	//	3A + 3B + 3C + 3D)
	//

	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0	// T0 = (0A 1A 0C 1C)
	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1	// T1 = (0B 1B 0D 1D)
	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2	// T2 = (2A 3A 2C 3C)
	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3	// T3 = (2B 3B 2D 3D)

	vinserti128	$0x1, T2_XMM, T0, T4		// T4 = (0A 1A 2A 3A)
	vinserti128	$0x1, T3_XMM, T1, T5		// T5 = (0B 1B 2B 3B)
	vperm2i128	$0x31, T2, T0, T0		// T0 = (0C 1C 2C 3C)
	vperm2i128	$0x31, T3, T1, T1		// T1 = (0D 1D 2D 3D)

	vpaddq		T5, T4, T4
	vpaddq		T1, T0, T0
	vpaddq		T4, T0, T0
	vmovdqu		T0, (HASH)

	// ymm registers were written above; clear their upper halves so that
	// legacy-SSE code executed after we return does not pay the AVX/SSE
	// transition penalty.
	vzeroupper
	RET
SYM_FUNC_END(nh_avx2)