/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	// nh_neon() arguments, in AAPCS64 argument-register order
	KEY		.req	x0
	MESSAGE		.req	x1
	MESSAGE_LEN	.req	x2
	HASH		.req	x3

	// One accumulator (2 x 64-bit lanes) per NH pass
	PASS0_SUMS	.req	v0
	PASS1_SUMS	.req	v1
	PASS2_SUMS	.req	v2
	PASS3_SUMS	.req	v3

	// Window of four 16-byte key strides.  K0 and K1 must remain
	// consecutive registers: nh_neon loads them with one two-register ld1.
	K0		.req	v4
	K1		.req	v5
	K2		.req	v6
	K3		.req	v7

	// Scratch registers.  They are placed in v16-v23, which AAPCS64 treats
	// as caller-saved, so they can be overwritten without a save/restore.
	// (v8-v15 must have their low 64 bits preserved by a callee.)
	// T0 and T1 must remain consecutive registers: nh_neon stores them
	// with one two-register st1.
	T0		.req	v16
	T1		.req	v17
	T2		.req	v18
	T3		.req	v19
	T4		.req	v20
	T5		.req	v21
	T6		.req	v22
	T7		.req	v23

/*
 * _nh_stride key0, key1, key2, key3
 *
 * Consume one 16-byte message stride and fold it into all four pass
 * accumulators.  key0..key2 name registers that already hold key strides;
 * key3 names the register that receives the next 16 key bytes.  MESSAGE and
 * KEY are each post-incremented by 16.
 *
 * Clobbers: T0-T7, the register named by key3.
 */
.macro _nh_stride	key0, key1, key2, key3

	// Fetch the next 16 message bytes
	ld1		{T3.16b}, [MESSAGE], #16

	// Fetch the next 16 key bytes into the last slot of the window
	ld1		{\key3\().4s}, [KEY], #16

	// Per pass: message words + key words (32-bit lanes).  T3 is both the
	// message source and the pass-3 result, so it is written last.
	add		T0.4s, T3.4s, \key0\().4s
	add		T1.4s, T3.4s, \key1\().4s
	add		T2.4s, T3.4s, \key2\().4s
	add		T3.4s, T3.4s, \key3\().4s

	// Copy the upper two words of each sum vector into the low half of a
	// second register, so that the widening multiply-accumulate below adds
	// word0 * word2 and word1 * word3 (32x32 => 64) to each pass.
	ins		T4.d[0], T0.d[1]
	ins		T5.d[0], T1.d[1]
	ins		T6.d[0], T2.d[1]
	ins		T7.d[0], T3.d[1]
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:    x0 = key, x1 = message, x2 = message_len, x3 = hash
 * Out:   32 bytes written to hash: the 64-bit sum of pass 0, 1, 2 and 3,
 *	  in that order
 * Clobb: x0-x2, PASS0_SUMS-PASS3_SUMS, K0-K3, T0-T7, flags
 *
 * The message is consumed in 16-byte strides.  Each stride reads one new
 * 16-byte key stride, so 48 + message_len key bytes are read in total.
 */
SYM_FUNC_START(nh_neon)

	// Prime the key window with its first three strides (the fourth is
	// loaded by the first _nh_stride) and clear the four accumulators.
	ld1		{K0.4s,K1.4s}, [KEY], #32
	  movi		PASS0_SUMS.2d, #0
	  movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	  movi		PASS2_SUMS.2d, #0
	  movi		PASS3_SUMS.2d, #0

	// Main loop: 64 bytes (four strides) per iteration.  message_len is a
	// size_t, so the loop is controlled with the unsigned conditions
	// lo/hs on the borrow of each subtraction.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.lo		.Lloop4_done
.Lloop4:
	// The key window rotates by one register per stride and is back in
	// its starting arrangement after four strides.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.hs		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN has wrapped below zero here, but subtracting multiples
	// of 64 leaves its low six bits equal to message_len % 64: 0, 16, 32
	// or 48 bytes remain, i.e. at most three more strides.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	b.eq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	b.eq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	b.eq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'.
	// addp adds the two 64-bit lanes of each source, so T0 = {pass0, pass1}
	// and T1 = {pass2, pass3}.
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)