/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */
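
/* MIPS32r2 assembler implementation of the ChaCha stream cipher
 * (chacha_crypt_arch) and of the HChaCha block function
 * (hchacha_block_arch).
 */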

#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32
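/* MASK_U32 keeps the whole-word part of a sub-block byte count:
 * bytes & 0x3c is four times the number of full 32-bit words, for byte
 * counts below 64.  STACK_SIZE is the save area for $s0-$s7.
 */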

#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n
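/* T(n) and X(n) paste tokens together so the macros below can select
 * registers by numeric index.
 */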

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3
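/* The fifth argument, the number of rounds, is passed on the stack and
 * read from 16($sp) at function entry.
 */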

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * It must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

#define IS_UNALIGNED	$s7
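/* IS_UNALIGNED shares $s7 with SAVED_CA: the flag is only tested while
 * full blocks remain, and SAVED_CA is only written once no full block is
 * left, so the two uses never conflict.
 */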

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
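/* On big-endian CPUs, CPU_TO_LE32 byte-swaps each keystream word
 * (wsbh + rotr 16) so the xor matches ChaCha's little-endian stream.
 * In the byte tail, ROTR pre-rotates the swapped word so keystream
 * byte 0 ends up in the low 8 bits, and ROTx then steps to the next
 * byte; on little-endian no swap is needed and ROTx rotates right.
 */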

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

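/* PLUS_ONE(x) expands to x + 1 at preprocessing time; CONCAT3 pastes it
 * into the ".Lchacha_mips_xor_*_<x+1>_b" labels below, so jump table
 * entry N enters the store sequence with exactly N words left to write.
 *
 * Each STORE_* step adds the original state word (the live counter
 * NONCE_0 for word 12), converts to little-endian on big-endian CPUs,
 * xors with the input and stores the result; the unaligned variant uses
 * lwl/lwr and swl/swr pairs instead of lw/sw.
 */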
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and for handling the last bytes, which are not a
 * multiple of 4.  X15 is free to store Xn.
 * Every jump table entry must be the same size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

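/* AXR performs one add/xor/rotate step of the ChaCha quarter round on
 * four word groups at once.  One full quarter round QR(a, b, c, d) is:
 *
 *   a += b; d ^= a; d <<<= 16;
 *   c += d; b ^= c; b <<<= 12;
 *   a += b; d ^= a; d <<<=  8;
 *   c += d; b ^= c; b <<<=  7;
 *
 * so four AXR invocations (S = 16, 12, 8, 7) apply QR to four columns or
 * four diagonals, and the eight per loop iteration form one double round,
 * which is why the round counter decrements by 2.
 */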
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S;    \
	rotl	X(W), S;    \
	rotl	X(Y), S;    \
	rotl	X(Z), S;

.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load the number of rounds (the fifth argument, passed on the stack) */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return early if BYTES == 0 */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test whether IN or OUT is unaligned.
	 * IS_UNALIGNED = (IN | OUT) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* If src/dst is unaligned, branch to the unaligned path */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Reload the number of rounds here to fill the delay slot */
	lw	$at, (STACK_SIZE+16)($sp)

	/* If BYTES < 0, there is no full block left */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* If BYTES > 0, loop again */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Placed here to fill the delay slot */
	addiu	NONCE_0, 1

	/* If BYTES < 0, handle the last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to its location in the state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Undo the block-size decrement of BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words, times 4 */
	andi	$at, BYTES, MASK_U32

	/* Load the upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Each jump table entry is two instructions (8 bytes); $at holds
	 * four times the word count, so merge it into T0 shifted left by
	 * one bit to form the entry offset.
	 */
	ins	T0, $at, 1, 6

	/* Add the offset to STATE */
	addu	T1, STATE, $at

	/* Add the lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read the original state word for the tail */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

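	/* Dispatch: entry N saves the partially-used keystream word into
	 * SAVED_X and branches into the store sequence so that exactly N
	 * full words are written before the byte tail is handled.
	 */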
	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Reload the number of rounds here to fill the delay slot */
	lw	$at, (STACK_SIZE+16)($sp)

	/* If BYTES < 0, there is no full block left */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* If BYTES > 0, loop again */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to its location in the state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Placed here to fill the delay slot */
	addiu	NONCE_0, 1
	.set reorder

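	/* On entry: $at is four times the number of full words already
	 * handled, BYTES is minus the number of trailing bytes, and
	 * SAVED_X holds the finished keystream word covering the tail.
	 */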
.Lchacha_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Undo the block-size decrement of BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words, times 4 */
	andi	$at, BYTES, MASK_U32

	/* Load the upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Form the entry offset as in the aligned path: each entry is
	 * 8 bytes, so $at is shifted left by one bit while merged into T0.
	 */
	ins	T0, $at, 1, 6

	/* Add the offset to STATE */
	addu	T1, STATE, $at

	/* Add the lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read the original state word for the tail */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

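	/* Dispatch into the unaligned store sequence, as in the aligned
	 * path above.
	 */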
	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments of hchacha_block_arch
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE
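
/* The upper state words reuse argument and temporary registers so that
 * only $s6 (X11) has to be spilled.  X13 uses $at, hence .set noat
 * below; X15 aliases STATE, so it is loaded last, after every other lw
 * from STATE has completed.
 */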

.set	noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11 ($s6), the only callee-saved register used here */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore the saved register */
	lw	X11, 0($sp)

	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at