/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

/* (BYTES & MASK_U32) is the number of bytes of a partial block that are
 * covered by whole 32-bit words (a multiple of 4 in the range 0..60).
 */
#define MASK_U32		0x3c
/* Number of bytes consumed from IN and written to OUT per full block. */
#define CHACHA20_BLOCK_SIZE	64
/* Stack frame size in bytes; chacha_crypt_arch saves $s0-$s7 (8 words) in it. */
#define STACK_SIZE		32

/* Registers holding the sixteen 32-bit working words.
 * X11-X15 sit in $s registers, which chacha_crypt_arch saves on entry and
 * restores on exit. hchacha_block_arch later remaps X12-X15.
 */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
/* Token-pasting helpers: turn a numeric index into the matching T<n>/X<n> name. */
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] (state word 12, byte offset 48) is kept in a register while
 * blocks are being processed, so the value in memory is not touched inside
 * the loop; it is written back to STATE on exit.
 * Must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so that we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* Non-zero when IN or OUT is not 4-byte aligned.
 * Shares $s7 with SAVED_CA: SAVED_CA is only written on the final partial
 * block, after IS_UNALIGNED has been tested for the last time.
 */
#define IS_UNALIGNED	$s7

/* Endianness helpers.
 *
 * MSB / LSB     byte offsets within a word, used as the displacements of the
 *               lwl/swl and lwr/swr pairs in STORE_UNALIGNED.
 * CPU_TO_LE32   converts a register to little-endian byte order: a full
 *               32-bit byte swap (wsbh + rotr 16) on big endian, nothing on
 *               little endian.
 * ROTR          big endian only: after the byte swap, rotates the first
 *               tail byte down into bits 7..0.
 * ROTx          rotate by 8 used between tail bytes so that the next byte
 *               of SAVED_X lands in bits 7..0, where sb picks it up.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define	CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

/* Expand macro x once for every word index, in ascending order (0..15).
 * Used to emit the jump tables.
 */
#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

/* Expand macro x once for every word index, in descending order (15..0).
 * Used to emit the store chains, so that entering the chain part-way
 * through processes only the low-numbered words.
 */
#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

/* PLUS_ONE(x) yields the literal x + 1 for x in 0..15.
 * A lookup table is used because the result is pasted into a label name,
 * which needs a single literal token rather than an arithmetic expression.
 */
#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
/* Two-level paste: CONCAT3 lets its arguments (e.g. PLUS_ONE(x)) be macro
 * expanded first, then _CONCAT3 glues the three results into one token.
 */
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

/* STORE_UNALIGNED(x): finish word x of the block and write it to OUT when
 * IN or OUT is not word aligned.
 *
 *   X<x> += original state word x   (NONCE_0 for word 12, else loaded into T0)
 *   X<x>  = CPU_TO_LE32(X<x>)
 *   X<x> ^= input word x            (fetched with lwl/lwr into T1)
 *   store X<x> to OUT               (with swl/swr)
 *
 * The label emitted in front of word x is numbered x + 1: the macro is
 * expanded in descending order, so entering the chain at label N processes
 * words N-1..0, i.e. exactly N whole words.
 * Clobbers T0 and T1.
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/* STORE_ALIGNED(x): same as STORE_UNALIGNED(x), but uses plain lw/sw for
 * the input and output words; used when both IN and OUT are word aligned.
 * Emits label .Lchacha_mips_xor_aligned_<x+1>_b. Clobbers T0 and T1.
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 *
 * Entry x is taken when the final partial block holds x whole words. It
 * branches into the store chain at label <x>_b (which processes words
 * x-1..0) and, in the branch delay slot, computes
 *   SAVED_X = X<x> + original state word x
 * for the trailing bytes. The original word comes from SAVED_CA, which the
 * dispatch code loads from STATE, or from NONCE_0 for word 12.
 *
 * X15 is free to store Xn: a partial block has at most 15 whole words, so
 * the store chain entered from here never needs word 15.
 *
 * Every jumptable entry must be equal in size: the dispatch code indexes
 * the table with a fixed stride of two instructions (branch + delay slot),
 * which is why the entries are assembled under .set noreorder.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Same as JMPTBL_ALIGNED, but branching into the unaligned store chain. */
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* AXR: one add / xor / rotate step applied to four word triples at once.
 *
 *   X(A) += X(K);  X(V) ^= X(A);  X(V) = rotl(X(V), S)
 *   X(B) += X(L);  X(W) ^= X(B);  X(W) = rotl(X(W), S)
 *   X(C) += X(M);  X(Y) ^= X(C);  X(Y) = rotl(X(Y), S)
 *   X(D) += X(N);  X(Z) ^= X(D);  X(Z) = rotl(X(Z), S)
 *
 * The four lanes are independent of each other, and their instructions are
 * interleaved: all adds, then all xors, then all rotates.
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S;    \
	rotl	X(W), S;    \
	rotl	X(Y), S;    \
	rotl	X(Z), S;

/*
 * chacha_crypt_arch - XOR BYTES bytes from IN with the generated key stream
 * and write the result to OUT.
 *
 * Arguments, as read by this code:
 *   STATE ($a0)   pointer to the sixteen 32-bit state words
 *   OUT   ($a1)   destination buffer
 *   IN    ($a2)   source buffer
 *   BYTES ($a3)   number of bytes to process
 *   16($sp)       number of rounds, read from the stack on entry
 * NOTE(review): presumably called from C with five arguments under the o32
 * ABI - confirm against the C declaration.
 *
 * Each loop iteration handles one 64-byte block. State word 12 is held in
 * NONCE_0, incremented once per block (including a final partial block)
 * and written back to STATE on exit; no other state word is modified.
 * A final partial block is dispatched through a jump table into the middle
 * of the store chain, and up to three trailing bytes are handled one at a
 * time.
 *
 * $s0-$s7 are saved and restored. $at is used explicitly (round counter,
 * whole-word byte count), hence .set noat. The round counter is decremented
 * by 2 per pass and the loop ends only when it reaches exactly zero.
 */
.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Nothing to do when BYTES == 0; no register has been saved yet. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next block. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	/* Load the working copy of the state; word 12 comes from NONCE_0. */
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* Each pass performs two rounds: the first four AXR steps work on
	 * words (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15), the last four
	 * on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).
	 */
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	/* BYTES now holds what remains after this block; negative means
	 * this block is only partially used.
	 */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count, multiple of 4) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset:
	 * inserting $at at bit 1 yields $at * 2, i.e. 8 bytes per whole
	 * word, the size of one jump table entry.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE: the original word the tail bytes belong to */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state.
	 * NOTE(review): every path from here reaches .Lchacha_mips_xor_done,
	 * which stores the incremented NONCE_0 to the same location again.
	 */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
	/* Target of jump table entry 0 (no whole words in the last block). */
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* Here: $at     = number of bytes covered by whole words,
	 *       BYTES   = minus the number of trailing bytes (-1..-3),
	 *       SAVED_X = the word the trailing bytes are taken from.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count, multiple of 4) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset ($at * 2, see aligned case) */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/*
 * hchacha_block_arch - run the rounds over a register copy of the state and
 * store working words 0-3 and 12-15 (32 bytes) to OUT.
 *
 * Unlike chacha_crypt_arch, the original state words are not added back
 * after the rounds, and STATE itself is not written.
 *
 * Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 *
 * NROUND is decremented by 2 per pass and the loop ends only when it
 * reaches exactly zero.
 */

/* Remap X12-X15 onto registers that need no saving, so that X11 ($s6) is
 * the only register this routine has to save and restore.
 * X13 uses $at, hence .set noat below. X15 aliases STATE, so word 15 must
 * be the last word loaded through the STATE pointer.
 */
#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	/* Overwrites the STATE pointer: must stay the last load. */
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	/* Two rounds per pass; same AXR sequence as in chacha_crypt_arch. */
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	/* Output is working words 0-3 followed by words 12-15. */
	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at