/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction.  This is the most efficient way to implement these
	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
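	// For reference: rotating a little-endian 64-bit word right by a
	// multiple of 8 bits is just a byte permutation, which is what the
	// vtbl.8 tables above encode.  A minimal C sketch of the ror24 case
	// (the ror16 table is the same idea, with each index reduced by 1):
	//
	//	u64 ror64_by_24(u64 x)		/* illustrative only */
	//	{
	//		static const u8 tbl[8] = { 3, 4, 5, 6, 7, 0, 1, 2 };
	//		u8 in[8], out[8];
	//		int i;
	//
	//		memcpy(in, &x, 8);	/* little-endian byte lanes */
	//		for (i = 0; i < 8; i++)	/* vtbl.8: out[i] = in[tbl[i]] */
	//			out[i] = in[tbl[i]];
	//		memcpy(&x, out, 8);
	//		return x;		/* == ror64(x, 24) */
	//	}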
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
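//
// For reference, each column/diagonal step below is one instance of the
// BLAKE2b G function (RFC 7693).  A minimal C sketch, with ror64() being a
// 64-bit rotate right and x, y the two message words selected by sigma:
//
//	static void G(u64 v[16], int a, int b, int c, int d, u64 x, u64 y)
//	{
//		v[a] += v[b] + x;
//		v[d] = ror64(v[d] ^ v[a], 32);
//		v[c] += v[d];
//		v[b] = ror64(v[b] ^ v[c], 24);
//		v[a] += v[b] + y;
//		v[d] = ror64(v[d] ^ v[a], 16);
//		v[c] += v[d];
//		v[b] = ror64(v[b] ^ v[c], 63);
//	}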
.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
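	// (ror64(x, 63) == (x >> 63) | (x << 1), hence the vshr #63 + vsli #1
	// pair below.)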
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.
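	//
	// (Since each state word v[i] lives in the 64-bit register d<i>, i.e.
	// q0 = {v[0], v[1]}, ..., q7 = {v[14], v[15]}, each diagonal can be
	// addressed directly as four 'd' registers below.)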

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
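// The 'inc' argument gives the number of message bytes to add to the counter
// t[] for each block compressed (presumably the full block size, or the
// length of a final partial block).
//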
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	  adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	  adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	  mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
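	// (These orderings are the rows of the BLAKE2b message schedule
	// 'sigma' from RFC 7693; the last two rounds reuse the first two
	// rows.)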
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	  mov		ip, STATE
	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
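	//
	// The combined effect of the fast path at .Lnext_block and this slow
	// path is a 128-bit add; a minimal C sketch of what is being computed:
	//
	//	t[0] += inc;
	//	if (t[0] < inc)		/* carry out of the low 64 bits */
	//		t[1]++;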
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)