162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright 2018 Google LLC
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Author: Eric Biggers <ebiggers@google.com>
862306a36Sopenharmony_ci */
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include <linux/linkage.h>
1162306a36Sopenharmony_ci#include <linux/cfi_types.h>
1262306a36Sopenharmony_ci
/*
 * Register assignments used by _nh_2xstride and nh_avx2 below.
 * The *_XMM names are the low 128 bits of the same-numbered %ymm register.
 */

/* Per-pass accumulators: each holds four 64-bit partial sums for one pass */
#define		PASS0_SUMS	%ymm0
#define		PASS1_SUMS	%ymm1
#define		PASS2_SUMS	%ymm2
#define		PASS3_SUMS	%ymm3

/* Key words, loaded from KEY at successive 0x10-byte offsets */
#define		K0		%ymm4
#define		K0_XMM		%xmm4
#define		K1		%ymm5
#define		K1_XMM		%xmm5
#define		K2		%ymm6
#define		K2_XMM		%xmm6
#define		K3		%ymm7
#define		K3_XMM		%xmm7

/*
 * Temporaries.  T3 carries the message words into _nh_2xstride; T0-T7 are
 * all overwritten by that macro.
 */
#define		T0		%ymm8
#define		T1		%ymm9
#define		T2		%ymm10
#define		T2_XMM		%xmm10
#define		T3		%ymm11
#define		T3_XMM		%xmm11
#define		T4		%ymm12
#define		T5		%ymm13
#define		T6		%ymm14
#define		T7		%ymm15

/*
 * Function arguments of nh_avx2(key, message, message_len, hash), in the
 * first four integer argument registers of the System V AMD64 ABI.
 */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx
3962306a36Sopenharmony_ci
/*
 * _nh_2xstride k0, k1, k2, k3
 *
 * Process 32 bytes of message (two 16-byte strides, one per 128-bit lane)
 * for all four passes and add the products to the pass accumulators.
 *
 * In:       T3      = message words
 *           k0..k3  = key words for pass 0..3
 * Out:      PASS0_SUMS..PASS3_SUMS updated
 * Clobbers: T0-T7 (including the message words in T3)
 *
 * With (a0 a1 a2 a3) denoting the four 32-bit sums "message + key" in one
 * 128-bit lane, the lane contributes a0*a2 to its low 64-bit accumulator and
 * a1*a3 to its high 64-bit accumulator.
 */
.macro _nh_2xstride	k0, k1, k2, k3

	// Add message words to key words (32-bit lanes, wrapping).
	// T3 is the source of all four adds, so it must be overwritten last.
	vpaddd		\k0, T3, T0
	vpaddd		\k1, T3, T1
	vpaddd		\k2, T3, T2
	vpaddd		\k3, T3, T3

	// Multiply 32x32 => 64 and accumulate.
	// vpmuludq only reads the low dword of each qword, so first shuffle
	// the operands into place within each 128-bit lane:
	//   $0x10 puts a0 and a1 into the low dwords of the two qwords,
	//   $0x32 puts a2 and a3 into the low dwords of the two qwords.
	vpshufd		$0x10, T0, T4
	vpshufd		$0x32, T0, T0
	vpshufd		$0x10, T1, T5
	vpshufd		$0x32, T1, T1
	vpshufd		$0x10, T2, T6
	vpshufd		$0x32, T2, T2
	vpshufd		$0x10, T3, T7
	vpshufd		$0x32, T3, T3
	// Per lane: (a2 * a0, a3 * a1) as unsigned 64-bit products
	vpmuludq	T4, T0, T0
	vpmuludq	T5, T1, T1
	vpmuludq	T6, T2, T2
	vpmuludq	T7, T3, T3
	// 64-bit wrapping accumulation, one accumulator register per pass
	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
.endm
6662306a36Sopenharmony_ci
/*
 * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * Compute the NH hash of 'message' under 'key' for four passes at once and
 * store the four 64-bit pass sums (32 bytes) to 'hash'.
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:       KEY (%rdi), MESSAGE (%rsi), MESSAGE_LEN (%rdx), HASH (%rcx)
 * Out:      32 bytes stored at (HASH)
 * Clobbers: %rdi, %rsi, %rdx, flags, %ymm0-%ymm15.  The upper halves of all
 *	     ymm registers are zeroed (vzeroupper) before returning.
 */
SYM_TYPED_FUNC_START(nh_avx2)

	// Preload the first two key windows.  KEY is then advanced so that
	// 0x00(KEY) and 0x10(KEY) always address the next two windows (K2, K3).
	vmovdqu		0x00(KEY), K0
	vmovdqu		0x10(KEY), K1
	add		$0x20, KEY
	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS

	// Main loop: 4 strides (0x40 bytes) of message per iteration.
	// MESSAGE_LEN is kept biased by -0x40 while looping.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3

	// The key window slides by 0x20 bytes: K2/K3 become the keys of
	// passes 0/1, and K0/K1 are reloaded for passes 2/3.  Afterwards
	// K0/K1 are already correct for the next iteration.
	vmovdqu		0x20(MESSAGE), T3
	vmovdqu		0x20(KEY), K0
	vmovdqu		0x30(KEY), K1
	_nh_2xstride	K2, K3, K0, K1

	add		$0x40, MESSAGE
	add		$0x40, KEY
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Recover the number of remaining bytes: 0x00, 0x10, 0x20 or 0x30.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone

	cmp		$0x20, MESSAGE_LEN
	jl		.Llast

	// 2 or 3 strides remain; do 2 more.
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3
	add		$0x20, MESSAGE
	add		$0x20, KEY
	sub		$0x20, MESSAGE_LEN
	jz		.Ldone
	// One stride left: slide the key window so K0/K1 match the new KEY.
	vmovdqa		K2, K0
	vmovdqa		K3, K1
.Llast:
	// Last stride.  Zero the high 128 bits of the message and keys so they
	// don't affect the result when processing them like 2 strides.
	// (A VEX-encoded write to an xmm register clears bits 255:128 of the
	// corresponding ymm register, so the upper lane contributes 0 * 0.)
	vmovdqu		(MESSAGE), T3_XMM
	vmovdqa		K0_XMM, K0_XMM
	vmovdqa		K1_XMM, K1_XMM
	vmovdqu		0x00(KEY), K2_XMM
	vmovdqu		0x10(KEY), K3_XMM
	_nh_2xstride	K0, K1, K2, K3

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'

	// PASS0_SUMS is (0A 0B 0C 0D)
	// PASS1_SUMS is (1A 1B 1C 1D)
	// PASS2_SUMS is (2A 2B 2C 2D)
	// PASS3_SUMS is (3A 3B 3C 3D)
	// We need the horizontal sums:
	//     (0A + 0B + 0C + 0D,
	//	1A + 1B + 1C + 1D,
	//	2A + 2B + 2C + 2D,
	//	3A + 3B + 3C + 3D)
	//

	// Transpose the 4x4 matrix of 64-bit values ...
	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0	// T0 = (0A 1A 0C 1C)
	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1	// T1 = (0B 1B 0D 1D)
	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2	// T2 = (2A 3A 2C 3C)
	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3	// T3 = (2B 3B 2D 3D)

	vinserti128	$0x1, T2_XMM, T0, T4		// T4 = (0A 1A 2A 3A)
	vinserti128	$0x1, T3_XMM, T1, T5		// T5 = (0B 1B 2B 3B)
	vperm2i128	$0x31, T2, T0, T0		// T0 = (0C 1C 2C 3C)
	vperm2i128	$0x31, T3, T1, T1		// T1 = (0D 1D 2D 3D)

	// ... then add the four rows to get one sum per pass.
	vpaddq		T5, T4, T4
	vpaddq		T1, T0, T0
	vpaddq		T4, T0, T0
	vmovdqu		T0, (HASH)

	// ymm registers were written above.  Clear their upper halves before
	// returning so that legacy SSE code executed afterwards does not pay
	// the AVX<->SSE state transition penalty.
	vzeroupper
	RET
SYM_FUNC_END(nh_avx2)
159