/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	/*
	 * Core registers holding the four arguments of nh_neon(), in the
	 * order they appear in its prototype: key, message, message_len, hash.
	 */
	KEY		.req	r0
	MESSAGE		.req	r1
	MESSAGE_LEN	.req	r2
	HASH		.req	r3

	/*
	 * One 128-bit accumulator per NH pass.  Each PASSn_SUMS register holds
	 * two 64-bit partial sums; PASSn_SUM_A and PASSn_SUM_B name its low and
	 * high 64-bit halves so that the two partial sums can be added together
	 * once the whole message has been processed.
	 */
	PASS0_SUMS	.req	q0
	PASS0_SUM_A	.req	d0
	PASS0_SUM_B	.req	d1
	PASS1_SUMS	.req	q1
	PASS1_SUM_A	.req	d2
	PASS1_SUM_B	.req	d3
	PASS2_SUMS	.req	q2
	PASS2_SUM_A	.req	d4
	PASS2_SUM_B	.req	d5
	PASS3_SUMS	.req	q3
	PASS3_SUM_A	.req	d6
	PASS3_SUM_B	.req	d7

	/*
	 * Sliding window of four 16-byte key strides.
	 *
	 * NOTE(review): q4-q7 overlap d8-d15, which the AAPCS treats as
	 * callee-saved, and nothing in this file saves or restores them.
	 * Presumably every caller runs this code inside a region where the
	 * NEON register state is preserved on its behalf - confirm.
	 */
	K0		.req	q4
	K1		.req	q5
	K2		.req	q6
	K3		.req	q7

	/*
	 * Temporaries.  Tn holds (message stride + key stride) for pass n;
	 * Tn_L and Tn_H name its low and high 64-bit halves, which are the two
	 * multiplicand vectors fed to vmlal.  T3 doubles as the landing
	 * register for the message stride itself.
	 */
	T0		.req	q8
	T0_L		.req	d16
	T0_H		.req	d17
	T1		.req	q9
	T1_L		.req	d18
	T1_H		.req	d19
	T2		.req	q10
	T2_L		.req	d20
	T2_H		.req	d21
	T3		.req	q11
	T3_L		.req	d22
	T3_H		.req	d23

/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four NH passes.
 *
 * In:    k0, k1, k2  registers holding three already-loaded key strides
 *        k3          register that receives the next 16 bytes read from KEY
 *        MESSAGE     address of the next message stride
 *        KEY         address of the next key stride
 * Out:   PASSn_SUMS  updated for n = 0..3, pass n using key register kn
 *        MESSAGE, KEY each advanced by 16 bytes
 * Clobb: T0-T3 and the register passed as k3
 *
 * Callers rotate the four key registers between invocations so that the
 * register loaded here as k3 becomes k2 of the next stride, and so on.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	vld1.8		{T3}, [MESSAGE]!

	// Load next key stride
	vld1.32		{\k3}, [KEY]!

	// Add message words to key words (lane-wise, modulo 2^32).  T3 is
	// overwritten last because it is the source operand of all four adds.
	vadd.u32	T0, T3, \k0
	vadd.u32	T1, T3, \k1
	vadd.u32	T2, T3, \k2
	vadd.u32	T3, T3, \k3

	// Multiply 32x32 => 64 and accumulate.  The low half of each Tn is
	// multiplied lane by lane with its high half, so word 0 pairs with
	// word 2 and word 1 pairs with word 3; the two 64-bit products are
	// added to the two 64-bit lanes of the matching accumulator.
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:    r0 = key, r1 = message, r2 = message_len, r3 = hash
 * Out:   four 64-bit sums (32 bytes), one per pass, stored at hash
 * Clobb: r0-r2, q0-q11, condition flags
 *
 * Three key strides (48 bytes) are read before the length is examined, and
 * every message stride reads one more, so message_len + 48 bytes of key are
 * read in total.
 */
ENTRY(nh_neon)

	// Preload the first three key strides; the zeroing of the four
	// accumulators is interleaved with the loads (extra indentation marks
	// the interleaved instructions).
	vld1.32		{K0,K1}, [KEY]!
	  vmov.u64	PASS0_SUMS, #0
	  vmov.u64	PASS1_SUMS, #0
	vld1.32		{K2}, [KEY]!
	  vmov.u64	PASS2_SUMS, #0
	  vmov.u64	PASS3_SUMS, #0

	// MESSAGE_LEN is kept biased by -64 while in the main loop, so a
	// negative value means fewer than 64 bytes remain.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	// Four strides (64 bytes) per iteration.  After four rotations the
	// key registers are back in their original roles, so the loop body
	// needs no register moves.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4

.Lloop4_done:
	// Undo the -64 bias: the low six bits give the bytes still to be
	// processed, which is 0, 16, 32 or 48 because message_len is a
	// multiple of 16.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	// Exactly 48 bytes were left, so this is the last stride; no further
	// length check is needed.
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0-T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)