1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * NH - ε-almost-universal hash function, NEON accelerated version
4  *
5  * Copyright 2018 Google LLC
6  *
7  * Author: Eric Biggers <ebiggers@google.com>
8  */
9 
10 #include <linux/linkage.h>
11 
12 	.text
13 	.fpu		neon
14 
15 	KEY		.req	r0
16 	MESSAGE		.req	r1
17 	MESSAGE_LEN	.req	r2
18 	HASH		.req	r3
19 
20 	PASS0_SUMS	.req	q0
21 	PASS0_SUM_A	.req	d0
22 	PASS0_SUM_B	.req	d1
23 	PASS1_SUMS	.req	q1
24 	PASS1_SUM_A	.req	d2
25 	PASS1_SUM_B	.req	d3
26 	PASS2_SUMS	.req	q2
27 	PASS2_SUM_A	.req	d4
28 	PASS2_SUM_B	.req	d5
29 	PASS3_SUMS	.req	q3
30 	PASS3_SUM_A	.req	d6
31 	PASS3_SUM_B	.req	d7
32 	K0		.req	q4
33 	K1		.req	q5
34 	K2		.req	q6
35 	K3		.req	q7
36 	T0		.req	q8
37 	T0_L		.req	d16
38 	T0_H		.req	d17
39 	T1		.req	q9
40 	T1_L		.req	d18
41 	T1_H		.req	d19
42 	T2		.req	q10
43 	T2_L		.req	d20
44 	T2_H		.req	d21
45 	T3		.req	q11
46 	T3_L		.req	d22
47 	T3_H		.req	d23
48 
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four passes.
 *
 * \k0, \k1 and \k2 must already hold key strides; \k3 is loaded here from
 * [KEY].  Both MESSAGE and KEY are post-incremented by 16 bytes.
 * Overwrites T0-T3 and accumulates into PASS0_SUMS-PASS3_SUMS.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Fetch the next 16 message bytes into T3
	vld1.8		{T3_L, T3_H}, [MESSAGE]!

	// Fetch the next 16 key bytes into \k3
	vld1.32		{\k3}, [KEY]!

	// Lane-wise 32-bit sums of message and key words, one per pass.
	// T3 doubles as the message source, so it must be overwritten last.
	vadd.i32	T0, T3, \k0
	vadd.i32	T1, T3, \k1
	vadd.i32	T2, T3, \k2
	vadd.i32	T3, T3, \k3

	// For each pass, multiply the low half of the sums by the high half
	// (unsigned 32x32 => 64 per lane) and add into that pass's accumulator
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm
69 
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * Arguments arrive in KEY (r0), MESSAGE (r1), MESSAGE_LEN (r2), HASH (r3).
 * The caller guarantees that message_len % 16 == 0.
 *
 * The message is consumed in 16-byte strides via _nh_stride while four
 * running sums (one per pass) are maintained; the four 64-bit totals
 * (32 bytes) are then written to 'hash'.
 *
 * KEY and MESSAGE are advanced as data is read and MESSAGE_LEN is used as a
 * scratch counter.  q0-q11 are overwritten and not restored.
 * NOTE(review): q4-q7 are callee-saved under the AAPCS VFP variant;
 * presumably no caller relies on them being preserved -- confirm.
 */
ENTRY(nh_neon)

	// Every pass starts from a zero accumulator (both 64-bit lanes)
	vmov.i64	PASS0_SUMS, #0
	vmov.i64	PASS1_SUMS, #0
	vmov.i64	PASS2_SUMS, #0
	vmov.i64	PASS3_SUMS, #0

	// Preload three key strides; _nh_stride fetches the fourth itself
	vld1.32		{K0, K1}, [KEY]!
	vld1.32		{K2}, [KEY]!

	// Main loop: four strides (64 bytes) per iteration.  MESSAGE_LEN is
	// kept biased by -64, so the sign of each subtraction tells whether
	// another full 64-byte group remains.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lnh_tail
.Lnh_bulk:
	// The key registers rotate by one position per stride; after four
	// strides K0-K2 again hold the three most recently loaded key strides.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lnh_bulk

.Lnh_tail:
	// MESSAGE_LEN is negative here; masking with 63 recovers the number
	// of bytes still unprocessed, which is 0, 16, 32 or 48.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Lnh_finish
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Lnh_finish
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Lnh_finish
	_nh_stride	K2, K3, K0, K1

.Lnh_finish:
	// Fold the two lanes of each accumulator into a single 64-bit total,
	// pack the four totals into T0:T1 and write them to 'hash'
	vadd.i64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.i64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.i64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.i64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0, T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)
117