162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2018 Google, Inc.
462306a36Sopenharmony_ci */
562306a36Sopenharmony_ci
662306a36Sopenharmony_ci#include <linux/linkage.h>
762306a36Sopenharmony_ci#include <asm/assembler.h>
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci/*
1062306a36Sopenharmony_ci * Design notes:
1162306a36Sopenharmony_ci *
1262306a36Sopenharmony_ci * 16 registers would be needed to hold the state matrix, but only 14 are
1362306a36Sopenharmony_ci * available because 'sp' and 'pc' cannot be used.  So we spill the elements
1462306a36Sopenharmony_ci * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
1562306a36Sopenharmony_ci * 'ldrd' and one 'strd' instruction per round.
1662306a36Sopenharmony_ci *
1762306a36Sopenharmony_ci * All rotates are performed using the implicit rotate operand accepted by the
1862306a36Sopenharmony_ci * 'add' and 'eor' instructions.  This is faster than using explicit rotate
1962306a36Sopenharmony_ci * instructions.  To make this work, we allow the values in the second and last
2062306a36Sopenharmony_ci * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
2162306a36Sopenharmony_ci * wrong rotation amount.  The rotation amount is then fixed up just in time
2262306a36Sopenharmony_ci * when the values are used.  'brot' is the number of bits the values in row 'b'
2362306a36Sopenharmony_ci * need to be rotated right to arrive at the correct values, and 'drot'
2462306a36Sopenharmony_ci * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
2562306a36Sopenharmony_ci * that they end up as (25, 24) after every round.
2662306a36Sopenharmony_ci */
2762306a36Sopenharmony_ci
2862306a36Sopenharmony_ci	// ChaCha state registers
	//
	// Xn names the core register that holds state word xn while the
	// permutation runs.  r13 is not in the table, so x15 is placed in r14;
	// both entry points in this file push lr before loading the state.
	// x10/x11 have no register of their own: they take turns with x8/x9
	// in r8/r9, the inactive pair being parked on the stack (see
	// _doubleround).
2962306a36Sopenharmony_ci	X0	.req	r0
3062306a36Sopenharmony_ci	X1	.req	r1
3162306a36Sopenharmony_ci	X2	.req	r2
3262306a36Sopenharmony_ci	X3	.req	r3
3362306a36Sopenharmony_ci	X4	.req	r4
3462306a36Sopenharmony_ci	X5	.req	r5
3562306a36Sopenharmony_ci	X6	.req	r6
3662306a36Sopenharmony_ci	X7	.req	r7
3762306a36Sopenharmony_ci	X8_X10	.req	r8	// shared by x8 and x10
3862306a36Sopenharmony_ci	X9_X11	.req	r9	// shared by x9 and x11
3962306a36Sopenharmony_ci	X12	.req	r10
4062306a36Sopenharmony_ci	X13	.req	r11
4162306a36Sopenharmony_ci	X14	.req	r12
4262306a36Sopenharmony_ci	X15	.req	r14
4362306a36Sopenharmony_ci
/*
 * _le32_bswap_4x a, b, c, d, tmp
 *
 * When __ARMEB__ is defined, invoke rev_l once on each of \a, \b, \c and \d,
 * passing \tmp as the second operand every time.  When __ARMEB__ is not
 * defined the macro expands to nothing.
 *
 * NOTE(review): rev_l is not defined in this file; presumably it comes from
 * <asm/assembler.h> and byte-reverses its first operand, using the second as
 * scratch -- confirm there.  Every caller in this file passes a \tmp register
 * that is dead at the call site and distinct from \a..\d.
 */
4462306a36Sopenharmony_ci.macro _le32_bswap_4x	a, b, c, d,  tmp
4562306a36Sopenharmony_ci#ifdef __ARMEB__
4662306a36Sopenharmony_ci	rev_l		\a,  \tmp
4762306a36Sopenharmony_ci	rev_l		\b,  \tmp
4862306a36Sopenharmony_ci	rev_l		\c,  \tmp
4962306a36Sopenharmony_ci	rev_l		\d,  \tmp
5062306a36Sopenharmony_ci#endif
5162306a36Sopenharmony_ci.endm
5262306a36Sopenharmony_ci
/*
 * __ldrd a, b, src, offset
 *
 * Load two adjacent 32-bit words: \a from [\src + \offset] and \b from
 * [\src + \offset + 4].  One 'ldrd' is emitted when __LINUX_ARM_ARCH__ >= 6,
 * two 'ldr' instructions otherwise.
 *
 * In the two-'ldr' form \a is written before \b is loaded, so \a must not be
 * the same register as \src.
 */
5362306a36Sopenharmony_ci.macro __ldrd		a, b, src, offset
5462306a36Sopenharmony_ci#if __LINUX_ARM_ARCH__ >= 6
5562306a36Sopenharmony_ci	ldrd		\a, \b, [\src, #\offset]
5662306a36Sopenharmony_ci#else
5762306a36Sopenharmony_ci	ldr		\a, [\src, #\offset]
5862306a36Sopenharmony_ci	ldr		\b, [\src, #\offset + 4]
5962306a36Sopenharmony_ci#endif
6062306a36Sopenharmony_ci.endm
6162306a36Sopenharmony_ci
/*
 * __strd a, b, dst, offset
 *
 * Store two adjacent 32-bit words: \a to [\dst + \offset] and \b to
 * [\dst + \offset + 4].  One 'strd' is emitted when __LINUX_ARM_ARCH__ >= 6,
 * two 'str' instructions otherwise.  No register is modified.
 */
6262306a36Sopenharmony_ci.macro __strd		a, b, dst, offset
6362306a36Sopenharmony_ci#if __LINUX_ARM_ARCH__ >= 6
6462306a36Sopenharmony_ci	strd		\a, \b, [\dst, #\offset]
6562306a36Sopenharmony_ci#else
6662306a36Sopenharmony_ci	str		\a, [\dst, #\offset]
6762306a36Sopenharmony_ci	str		\b, [\dst, #\offset + 4]
6862306a36Sopenharmony_ci#endif
6962306a36Sopenharmony_ci.endm
7062306a36Sopenharmony_ci
/*
 * _halfround a1, b1, c1, d1,  a2, b2, c2, d2
 *
 * Run two independent ChaCha quarterrounds, on (a1, b1, c1, d1) and on
 * (a2, b2, c2, d2), with their instructions interleaved pairwise.
 *
 * No standalone rotate is issued.  Each 'eor' leaves its result un-rotated
 * and the next instruction that reads that register applies the pending
 * rotate through its 'ror #n' operand.  On entry the b registers must be
 * pending a rotate right by 'brot' bits and the d registers by 'drot' bits
 * (both are assembly-time symbols).  On exit the b registers are pending a
 * rotate right by 25 and the d registers by 24.
 *
 * The macro reads 'brot' and 'drot' but never changes them; _doubleround
 * .sets them once every b and d register has been through a halfround.
 */
7162306a36Sopenharmony_ci.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ci	// a += b; d ^= a; d = rol(d, 16);
	// (b is corrected on the fly by 'ror #brot', d by 'ror #drot';
	// the rol 16 of the new d is deferred to its next use.)
7462306a36Sopenharmony_ci	add		\a1, \a1, \b1, ror #brot
7562306a36Sopenharmony_ci	add		\a2, \a2, \b2, ror #brot
7662306a36Sopenharmony_ci	eor		\d1, \a1, \d1, ror #drot
7762306a36Sopenharmony_ci	eor		\d2, \a2, \d2, ror #drot
7862306a36Sopenharmony_ci	// drot == 32 - 16 == 16
7962306a36Sopenharmony_ci
8062306a36Sopenharmony_ci	// c += d; b ^= c; b = rol(b, 12);
	// (b has not been written yet, so it still needs 'ror #brot'.)
8162306a36Sopenharmony_ci	add		\c1, \c1, \d1, ror #16
8262306a36Sopenharmony_ci	add		\c2, \c2, \d2, ror #16
8362306a36Sopenharmony_ci	eor		\b1, \c1, \b1, ror #brot
8462306a36Sopenharmony_ci	eor		\b2, \c2, \b2, ror #brot
8562306a36Sopenharmony_ci	// brot == 32 - 12 == 20
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci	// a += b; d ^= a; d = rol(d, 8);
8862306a36Sopenharmony_ci	add		\a1, \a1, \b1, ror #20
8962306a36Sopenharmony_ci	add		\a2, \a2, \b2, ror #20
9062306a36Sopenharmony_ci	eor		\d1, \a1, \d1, ror #16
9162306a36Sopenharmony_ci	eor		\d2, \a2, \d2, ror #16
9262306a36Sopenharmony_ci	// drot == 32 - 8 == 24
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci	// c += d; b ^= c; b = rol(b, 7);
9562306a36Sopenharmony_ci	add		\c1, \c1, \d1, ror #24
9662306a36Sopenharmony_ci	add		\c2, \c2, \d2, ror #24
9762306a36Sopenharmony_ci	eor		\b1, \c1, \b1, ror #20
9862306a36Sopenharmony_ci	eor		\b2, \c2, \b2, ror #20
9962306a36Sopenharmony_ci	// brot == 32 - 7 == 25
10062306a36Sopenharmony_ci.endm
10162306a36Sopenharmony_ci
/*
 * _doubleround
 *
 * One ChaCha double round: a column round followed by a diagonal round, each
 * built from two _halfround invocations (four quarterrounds).
 *
 * Entry: registers hold x0-x9 and x12-x15, [sp, #8] holds (x10, x11), and
 *        [sp, #0] is an 8-byte scratch slot.
 * Exit:  the same arrangement; (x8, x9) are back in r8/r9 and the updated
 *        (x10, x11) are at [sp, #8].
 *
 * On exit rows b and d are pending rotates of (brot, drot) == (25, 24).
 */
10262306a36Sopenharmony_ci.macro _doubleround
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_ci	// column round
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
10762306a36Sopenharmony_ci	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci	// save (x8, x9); restore (x10, x11)
11062306a36Sopenharmony_ci	__strd		X8_X10, X9_X11, sp, 0
11162306a36Sopenharmony_ci	__ldrd		X8_X10, X9_X11, sp, 8
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_ci	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
11462306a36Sopenharmony_ci	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
11562306a36Sopenharmony_ci
	// Only now have all of x4-x7 and x12-x15 been through a halfround,
	// so only now may the symbols change: the second column halfround
	// above still had to see the incoming (brot, drot).
11662306a36Sopenharmony_ci	.set brot, 25
11762306a36Sopenharmony_ci	.set drot, 24
11862306a36Sopenharmony_ci
11962306a36Sopenharmony_ci	// diagonal round
12062306a36Sopenharmony_ci
	// r8/r9 still hold (x10, x11) from the column round, so the two
	// diagonals that use x10/x11 go first.
12162306a36Sopenharmony_ci	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
12262306a36Sopenharmony_ci	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
12362306a36Sopenharmony_ci
12462306a36Sopenharmony_ci	// save (x10, x11); restore (x8, x9)
12562306a36Sopenharmony_ci	__strd		X8_X10, X9_X11, sp, 8
12662306a36Sopenharmony_ci	__ldrd		X8_X10, X9_X11, sp, 0
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
12962306a36Sopenharmony_ci	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
13062306a36Sopenharmony_ci.endm
13162306a36Sopenharmony_ci
/*
 * _chacha_permute nrounds
 *
 * Emit \nrounds / 2 copies of _doubleround, fully unrolled.
 *
 * (brot, drot) are reset to (0, 0) first, i.e. the state must arrive with no
 * pending rotation.  _doubleround leaves them at (25, 24), and they keep that
 * value after this macro, so code placed after it must still apply
 * 'ror #brot' to x4-x7 and 'ror #drot' to x12-x15 before using those words.
 *
 * Stack and register requirements are those of _doubleround.
 */
13262306a36Sopenharmony_ci.macro _chacha_permute	nrounds
13362306a36Sopenharmony_ci	.set brot, 0
13462306a36Sopenharmony_ci	.set drot, 0
13562306a36Sopenharmony_ci	.rept \nrounds / 2
13662306a36Sopenharmony_ci	 _doubleround
13762306a36Sopenharmony_ci	.endr
13862306a36Sopenharmony_ci.endm
13962306a36Sopenharmony_ci
/*
 * _chacha nrounds
 *
 * Main loop of chacha_doarm for one round count.  Each iteration runs the
 * permutation, adds the original state to obtain 64 bytes of keystream, XORs
 * the keystream with IN into OUT, then bumps the block counter (x12) in the
 * stack copy of the state and repeats while data remains.
 *
 * Entry: stack and registers as described at .Lnext_block.
 * Exit:  falls out at .Ldone with sp pointing at the saved x0
 *        ("Stack: x0-x15 OUT IN LEN").
 *
 * LEN is an unsigned byte count (chacha_doarm takes 'unsigned int bytes'),
 * so every test on LEN uses the unsigned condition codes lo/ls/hi.  The
 * signed forms would treat a length with bit 31 set as negative.
 *
 * Labels carry the \@ suffix so that the macro can be expanded more than
 * once in the same file.
 */
14062306a36Sopenharmony_ci.macro _chacha		nrounds
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci.Lnext_block\@:
14362306a36Sopenharmony_ci	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
14462306a36Sopenharmony_ci	// Registers contain x0-x9,x12-x15.
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci	// Do the core ChaCha permutation to update x0-x15.
14762306a36Sopenharmony_ci	_chacha_permute	\nrounds
14862306a36Sopenharmony_ci
	// Drop the 8-byte scratch slot that _doubleround used for (x8, x9).
14962306a36Sopenharmony_ci	add		sp, #8
15062306a36Sopenharmony_ci	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
15162306a36Sopenharmony_ci	// Registers contain x0-x9,x12-x15.
15262306a36Sopenharmony_ci	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
15362306a36Sopenharmony_ci
15462306a36Sopenharmony_ci	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
15562306a36Sopenharmony_ci	push		{X8_X10, X9_X11, X12, X13, X14, X15}
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_ci	// Load (OUT, IN, LEN).
15862306a36Sopenharmony_ci	ldr		r14, [sp, #96]
15962306a36Sopenharmony_ci	ldr		r12, [sp, #100]
16062306a36Sopenharmony_ci	ldr		r11, [sp, #104]
16162306a36Sopenharmony_ci
	// r10 = OUT | IN, consumed by the alignment test below.
16262306a36Sopenharmony_ci	orr		r10, r14, r12
16362306a36Sopenharmony_ci
16462306a36Sopenharmony_ci	// Use slow path if fewer than 64 bytes remain.
	// (Unsigned compare: LEN is an unsigned byte count.)
16562306a36Sopenharmony_ci	cmp		r11, #64
16662306a36Sopenharmony_ci	blo		.Lxor_slowpath\@
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_ci	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
16962306a36Sopenharmony_ci	// ARMv6+, since ldmia and stmia (used below) still require alignment.
17062306a36Sopenharmony_ci	tst		r10, #3
17162306a36Sopenharmony_ci	bne		.Lxor_slowpath\@
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	// Fast path: XOR 64 bytes of aligned data.
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
17662306a36Sopenharmony_ci	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
17762306a36Sopenharmony_ci	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_ci	// x0-x3
18062306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 32
18162306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 40
18262306a36Sopenharmony_ci	add		X0, X0, r8
18362306a36Sopenharmony_ci	add		X1, X1, r9
18462306a36Sopenharmony_ci	add		X2, X2, r10
18562306a36Sopenharmony_ci	add		X3, X3, r11
18662306a36Sopenharmony_ci	_le32_bswap_4x	X0, X1, X2, X3,  r8
18762306a36Sopenharmony_ci	ldmia		r12!, {r8-r11}
18862306a36Sopenharmony_ci	eor		X0, X0, r8
18962306a36Sopenharmony_ci	eor		X1, X1, r9
19062306a36Sopenharmony_ci	eor		X2, X2, r10
19162306a36Sopenharmony_ci	eor		X3, X3, r11
19262306a36Sopenharmony_ci	stmia		r14!, {X0-X3}
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ci	// x4-x7
	// (The pending 'brot' rotation is applied as part of the add.  r0-r3
	// are free again and receive the next 16 bytes of IN.)
19562306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 48
19662306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 56
19762306a36Sopenharmony_ci	add		X4, r8, X4, ror #brot
19862306a36Sopenharmony_ci	add		X5, r9, X5, ror #brot
19962306a36Sopenharmony_ci	ldmia		r12!, {X0-X3}
20062306a36Sopenharmony_ci	add		X6, r10, X6, ror #brot
20162306a36Sopenharmony_ci	add		X7, r11, X7, ror #brot
20262306a36Sopenharmony_ci	_le32_bswap_4x	X4, X5, X6, X7,  r8
20362306a36Sopenharmony_ci	eor		X4, X4, X0
20462306a36Sopenharmony_ci	eor		X5, X5, X1
20562306a36Sopenharmony_ci	eor		X6, X6, X2
20662306a36Sopenharmony_ci	eor		X7, X7, X3
20762306a36Sopenharmony_ci	stmia		r14!, {X4-X7}
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci	// x8-x15
	// After the pop, sp points at orig_x0: orig_x8 is at sp+32, orig_x12
	// at sp+48 and LEN at sp+72.
21062306a36Sopenharmony_ci	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
21162306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 32
21262306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 40
21362306a36Sopenharmony_ci	add		r0, r0, r8		// x8
21462306a36Sopenharmony_ci	add		r1, r1, r9		// x9
21562306a36Sopenharmony_ci	add		r6, r6, r10		// x10
21662306a36Sopenharmony_ci	add		r7, r7, r11		// x11
21762306a36Sopenharmony_ci	_le32_bswap_4x	r0, r1, r6, r7,  r8
21862306a36Sopenharmony_ci	ldmia		r12!, {r8-r11}
21962306a36Sopenharmony_ci	eor		r0, r0, r8		// x8
22062306a36Sopenharmony_ci	eor		r1, r1, r9		// x9
22162306a36Sopenharmony_ci	eor		r6, r6, r10		// x10
22262306a36Sopenharmony_ci	eor		r7, r7, r11		// x11
22362306a36Sopenharmony_ci	stmia		r14!, {r0,r1,r6,r7}
22462306a36Sopenharmony_ci	ldmia		r12!, {r0,r1,r6,r7}
	// r8 = orig_x12 (the block counter).  It must stay intact until
	// .Lprepare_for_next_block, which is why the byte swap below uses r9
	// as its scratch register.
22562306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 48
22662306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 56
22762306a36Sopenharmony_ci	add		r2, r8, r2, ror #drot	// x12
22862306a36Sopenharmony_ci	add		r3, r9, r3, ror #drot	// x13
22962306a36Sopenharmony_ci	add		r4, r10, r4, ror #drot	// x14
23062306a36Sopenharmony_ci	add		r5, r11, r5, ror #drot	// x15
23162306a36Sopenharmony_ci	_le32_bswap_4x	r2, r3, r4, r5,  r9
23262306a36Sopenharmony_ci	  ldr		r9, [sp, #72]		// load LEN
23362306a36Sopenharmony_ci	eor		r2, r2, r0		// x12
23462306a36Sopenharmony_ci	eor		r3, r3, r1		// x13
23562306a36Sopenharmony_ci	eor		r4, r4, r6		// x14
23662306a36Sopenharmony_ci	eor		r5, r5, r7		// x15
23762306a36Sopenharmony_ci	  subs		r9, #64			// decrement and check LEN
23862306a36Sopenharmony_ci	stmia		r14!, {r2-r5}
23962306a36Sopenharmony_ci
	// LEN was >= 64 on this path, so the only way to be finished is an
	// exact zero.
24062306a36Sopenharmony_ci	beq		.Ldone\@
24162306a36Sopenharmony_ci
24262306a36Sopenharmony_ci.Lprepare_for_next_block\@:
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci	// Stack: x0-x15 OUT IN LEN
	// Registers: r8 is the block counter, r9 is the remaining LEN,
	// r12 is IN, r14 is OUT.
24562306a36Sopenharmony_ci
24662306a36Sopenharmony_ci	// Increment block counter (x12)
24762306a36Sopenharmony_ci	add		r8, #1
24862306a36Sopenharmony_ci
24962306a36Sopenharmony_ci	// Store updated (OUT, IN, LEN)
25062306a36Sopenharmony_ci	str		r14, [sp, #64]
25162306a36Sopenharmony_ci	str		r12, [sp, #68]
25262306a36Sopenharmony_ci	str		r9, [sp, #72]
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci	  mov		r14, sp
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci	// Store updated block counter (x12)
25762306a36Sopenharmony_ci	str		r8, [sp, #48]
25862306a36Sopenharmony_ci
	// Re-create the 16 bytes below the state that .Lnext_block expects
	// (unused0-unused1 and the working copy of x10-x11).
25962306a36Sopenharmony_ci	  sub		sp, #16
26062306a36Sopenharmony_ci
26162306a36Sopenharmony_ci	// Reload state and do next block
26262306a36Sopenharmony_ci	ldmia		r14!, {r0-r11}		// load x0-x11
26362306a36Sopenharmony_ci	__strd		r10, r11, sp, 8		// store x10-x11 before state
26462306a36Sopenharmony_ci	ldmia		r14, {r10-r12,r14}	// load x12-x15
26562306a36Sopenharmony_ci	b		.Lnext_block\@
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_ci.Lxor_slowpath\@:
26862306a36Sopenharmony_ci	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
26962306a36Sopenharmony_ci	// We handle it by storing the 64 bytes of keystream to the stack, then
27062306a36Sopenharmony_ci	// XOR-ing the needed portion with the data.
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci	// Allocate keystream buffer
27362306a36Sopenharmony_ci	sub		sp, #64
27462306a36Sopenharmony_ci	mov		r14, sp
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
27762306a36Sopenharmony_ci	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
27862306a36Sopenharmony_ci	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	// Save keystream for x0-x3
28162306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 96
28262306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 104
28362306a36Sopenharmony_ci	add		X0, X0, r8
28462306a36Sopenharmony_ci	add		X1, X1, r9
28562306a36Sopenharmony_ci	add		X2, X2, r10
28662306a36Sopenharmony_ci	add		X3, X3, r11
28762306a36Sopenharmony_ci	_le32_bswap_4x	X0, X1, X2, X3,  r8
28862306a36Sopenharmony_ci	stmia		r14!, {X0-X3}
28962306a36Sopenharmony_ci
29062306a36Sopenharmony_ci	// Save keystream for x4-x7
29162306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 112
29262306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 120
29362306a36Sopenharmony_ci	add		X4, r8, X4, ror #brot
29462306a36Sopenharmony_ci	add		X5, r9, X5, ror #brot
29562306a36Sopenharmony_ci	add		X6, r10, X6, ror #brot
29662306a36Sopenharmony_ci	add		X7, r11, X7, ror #brot
29762306a36Sopenharmony_ci	_le32_bswap_4x	X4, X5, X6, X7,  r8
	// r8 = address of the pushed (x8-x9,x12-x15,x10-x11), just above the
	// keystream buffer.
29862306a36Sopenharmony_ci	  add		r8, sp, #64
29962306a36Sopenharmony_ci	stmia		r14!, {X4-X7}
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci	// Save keystream for x8-x15
30262306a36Sopenharmony_ci	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
30362306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 128
30462306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 136
30562306a36Sopenharmony_ci	add		r0, r0, r8		// x8
30662306a36Sopenharmony_ci	add		r1, r1, r9		// x9
30762306a36Sopenharmony_ci	add		r6, r6, r10		// x10
30862306a36Sopenharmony_ci	add		r7, r7, r11		// x11
30962306a36Sopenharmony_ci	_le32_bswap_4x	r0, r1, r6, r7,  r8
31062306a36Sopenharmony_ci	stmia		r14!, {r0,r1,r6,r7}
	// r8 = orig_x12 (the block counter); kept live for
	// .Lprepare_for_next_block, hence r9 as byte-swap scratch below.
31162306a36Sopenharmony_ci	__ldrd		r8, r9, sp, 144
31262306a36Sopenharmony_ci	__ldrd		r10, r11, sp, 152
31362306a36Sopenharmony_ci	add		r2, r8, r2, ror #drot	// x12
31462306a36Sopenharmony_ci	add		r3, r9, r3, ror #drot	// x13
31562306a36Sopenharmony_ci	add		r4, r10, r4, ror #drot	// x14
31662306a36Sopenharmony_ci	add		r5, r11, r5, ror #drot	// x15
31762306a36Sopenharmony_ci	_le32_bswap_4x	r2, r3, r4, r5,  r9
31862306a36Sopenharmony_ci	stmia		r14, {r2-r5}
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
32162306a36Sopenharmony_ci	// Registers: r8 is block counter, r12 is IN.
32262306a36Sopenharmony_ci
32362306a36Sopenharmony_ci	ldr		r9, [sp, #168]		// LEN
32462306a36Sopenharmony_ci	ldr		r14, [sp, #160]		// OUT
	// r1 = min(LEN, 64), chosen with unsigned conditions because LEN is
	// an unsigned byte count.  r0 = &ks0.
32562306a36Sopenharmony_ci	cmp		r9, #64
32662306a36Sopenharmony_ci	  mov		r0, sp
32762306a36Sopenharmony_ci	movls		r1, r9
32862306a36Sopenharmony_ci	movhi		r1, #64
32962306a36Sopenharmony_ci	// r1 is number of bytes to XOR, in range [1, 64]
33062306a36Sopenharmony_ci
33162306a36Sopenharmony_ci.if __LINUX_ARM_ARCH__ < 6
33262306a36Sopenharmony_ci	orr		r2, r12, r14
33362306a36Sopenharmony_ci	tst		r2, #3			// IN or OUT misaligned?
33462306a36Sopenharmony_ci	bne		.Lxor_next_byte\@
33562306a36Sopenharmony_ci.endif
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	// XOR a word at a time
	// (r1 is at most 64 here, so the signed 'blt' is safe.  When the
	// branch is taken r1 holds (remaining - 4), whose low two bits equal
	// the number of leftover bytes.)
33862306a36Sopenharmony_ci.rept 16
33962306a36Sopenharmony_ci	subs		r1, #4
34062306a36Sopenharmony_ci	blt		.Lxor_words_done\@
34162306a36Sopenharmony_ci	ldr		r2, [r12], #4
34262306a36Sopenharmony_ci	ldr		r3, [r0], #4
34362306a36Sopenharmony_ci	eor		r2, r2, r3
34462306a36Sopenharmony_ci	str		r2, [r14], #4
34562306a36Sopenharmony_ci.endr
34662306a36Sopenharmony_ci	b		.Lxor_slowpath_done\@
34762306a36Sopenharmony_ci.Lxor_words_done\@:
34862306a36Sopenharmony_ci	ands		r1, r1, #3
34962306a36Sopenharmony_ci	beq		.Lxor_slowpath_done\@
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ci	// XOR a byte at a time
35262306a36Sopenharmony_ci.Lxor_next_byte\@:
35362306a36Sopenharmony_ci	ldrb		r2, [r12], #1
35462306a36Sopenharmony_ci	ldrb		r3, [r0], #1
35562306a36Sopenharmony_ci	eor		r2, r2, r3
35662306a36Sopenharmony_ci	strb		r2, [r14], #1
35762306a36Sopenharmony_ci	subs		r1, #1
35862306a36Sopenharmony_ci	bne		.Lxor_next_byte\@
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ci.Lxor_slowpath_done\@:
	// Continue only if more than 64 bytes were left before this block.
	// 'hi' (carry set and result non-zero) is the unsigned form of that
	// test.  The 'add sp' in between does not touch the flags; it drops
	// the keystream buffer and the 32 bytes above it, leaving sp at x0.
36162306a36Sopenharmony_ci	subs		r9, #64
36262306a36Sopenharmony_ci	add		sp, #96
36362306a36Sopenharmony_ci	bhi		.Lprepare_for_next_block\@
36462306a36Sopenharmony_ci
36562306a36Sopenharmony_ci.Ldone\@:
36662306a36Sopenharmony_ci.endm	// _chacha
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci/*
36962306a36Sopenharmony_ci * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
37062306a36Sopenharmony_ci *		     const u32 *state, int nrounds);
 *
 * XOR 'bytes' bytes from 'src' with ChaCha keystream into 'dst'.
 *
 * As used below: r0 = dst, r1 = src, r2 = bytes, r3 = state, and nrounds is
 * read from [sp] on entry.  nrounds == 12 selects the 12-round expansion of
 * _chacha; every other value runs the 20-round expansion.
 *
 * Returns immediately when bytes == 0.  The state array is only read: the
 * block counter is incremented in a copy kept on the stack, not in *state.
 * r4-r11 are saved on entry and restored on return.
37162306a36Sopenharmony_ci */
37262306a36Sopenharmony_ciENTRY(chacha_doarm)
37362306a36Sopenharmony_ci	cmp		r2, #0			// len == 0?
	// NOTE(review): reteq is not defined in this file; presumably a
	// conditional-return helper from <asm/assembler.h> -- confirm.
37462306a36Sopenharmony_ci	reteq		lr
37562306a36Sopenharmony_ci
	// ip = nrounds.  The flags set by this cmp are consumed by the
	// 'beq 1f' far below; nothing in between is flag-setting.
37662306a36Sopenharmony_ci	ldr		ip, [sp]
37762306a36Sopenharmony_ci	cmp		ip, #12
37862306a36Sopenharmony_ci
	// r0-r2 are pushed as the in-memory (OUT, IN, LEN) triple that
	// _chacha reads and updates.
37962306a36Sopenharmony_ci	push		{r0-r2,r4-r11,lr}
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_ci	// Push state x0-x15 onto stack.
38262306a36Sopenharmony_ci	// Also store an extra copy of x10-x11 just before the state.
38362306a36Sopenharmony_ci
	// x12-x15 go on first (X12 doubles as the temporary pointer), then
	// 64 bytes are reserved below them for:
	//   sp+0  unused0-unused1 (scratch slot used by _doubleround)
	//   sp+8  working copy of x10-x11
	//   sp+16 x0-x11
38462306a36Sopenharmony_ci	add		X12, r3, #48
38562306a36Sopenharmony_ci	ldm		X12, {X12,X13,X14,X15}
38662306a36Sopenharmony_ci	push		{X12,X13,X14,X15}
38762306a36Sopenharmony_ci	sub		sp, sp, #64
38862306a36Sopenharmony_ci
	// x10-x11 pass through r8/r9 and are stored twice: the working copy
	// at sp+8 and their slot inside the saved state at sp+56.
38962306a36Sopenharmony_ci	__ldrd		X8_X10, X9_X11, r3, 40
39062306a36Sopenharmony_ci	__strd		X8_X10, X9_X11, sp, 8
39162306a36Sopenharmony_ci	__strd		X8_X10, X9_X11, sp, 56
	// Load x0-x9 into r0-r9.  This overwrites r3, so the state pointer
	// is gone from here on.
39262306a36Sopenharmony_ci	ldm		r3, {X0-X9_X11}
39362306a36Sopenharmony_ci	__strd		X0, X1, sp, 16
39462306a36Sopenharmony_ci	__strd		X2, X3, sp, 24
39562306a36Sopenharmony_ci	__strd		X4, X5, sp, 32
39662306a36Sopenharmony_ci	__strd		X6, X7, sp, 40
39762306a36Sopenharmony_ci	__strd		X8_X10, X9_X11, sp, 48
39862306a36Sopenharmony_ci
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN r4-r11 lr
	// Registers contain x0-x9,x12-x15, as _chacha requires.
39962306a36Sopenharmony_ci	beq		1f
40062306a36Sopenharmony_ci	_chacha		20
40162306a36Sopenharmony_ci
	// _chacha leaves sp at the saved x0.  Skip x0-x15 (64 bytes) plus
	// OUT, IN and LEN (12 bytes), then restore r4-r11 and return.
40262306a36Sopenharmony_ci0:	add		sp, #76
40362306a36Sopenharmony_ci	pop		{r4-r11, pc}
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ci1:	_chacha		12
40662306a36Sopenharmony_ci	b		0b
40762306a36Sopenharmony_ciENDPROC(chacha_doarm)
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci/*
41062306a36Sopenharmony_ci * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 *
 * Run the ChaCha permutation over 'state' and store words x0-x3 and x12-x15
 * of the permuted state to 'out'.  The original state is not added back in,
 * and 'state' itself is not written.
 *
 * As used below: r0 = state, r1 = out, r2 = nrounds.  nrounds == 12 selects
 * the 12-round permutation; every other value runs 20 rounds.
 * r4-r11 are saved on entry and restored on return.
41162306a36Sopenharmony_ci */
41262306a36Sopenharmony_ciENTRY(hchacha_block_arm)
	// r1 ('out') is pushed lowest so that it can be popped on its own
	// once the permutation is finished.
41362306a36Sopenharmony_ci	push		{r1,r4-r11,lr}
41462306a36Sopenharmony_ci
	// The flags set here are consumed by the 'beq 1f' below; nothing in
	// between is flag-setting.
41562306a36Sopenharmony_ci	cmp		r2, #12			// ChaCha12 ?
41662306a36Sopenharmony_ci
41762306a36Sopenharmony_ci	mov		r14, r0
41862306a36Sopenharmony_ci	ldmia		r14!, {r0-r11}		// load x0-x11
41962306a36Sopenharmony_ci	push		{r10-r11}		// store x10-x11 to stack
42062306a36Sopenharmony_ci	ldm		r14, {r10-r12,r14}	// load x12-x15
	// Reserve the 8-byte (x8, x9) spill slot below x10-x11.
	// Stack: unused0-unused1 x10-x11 out r4-r11 lr
42162306a36Sopenharmony_ci	sub		sp, #8
42262306a36Sopenharmony_ci
42362306a36Sopenharmony_ci	beq		1f
42462306a36Sopenharmony_ci	_chacha_permute	20
42562306a36Sopenharmony_ci
42662306a36Sopenharmony_ci	// Skip over (unused0-unused1, x10-x11)
42762306a36Sopenharmony_ci0:	add		sp, #16
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	// Fix up rotations of x12-x15
	// ('drot' is an assembly-time symbol.  This tail is shared by the
	// 20- and 12-round paths, which works because _chacha_permute ends
	// with the same drot for both.  x0-x3 never carry a pending rotation,
	// and x4-x11 are not part of the output, so nothing else needs
	// fixing.)
43062306a36Sopenharmony_ci	ror		X12, X12, #drot
43162306a36Sopenharmony_ci	ror		X13, X13, #drot
43262306a36Sopenharmony_ci	  pop		{r4}			// load 'out'
43362306a36Sopenharmony_ci	ror		X14, X14, #drot
43462306a36Sopenharmony_ci	ror		X15, X15, #drot
43562306a36Sopenharmony_ci
43662306a36Sopenharmony_ci	// Store (x0-x3,x12-x15) to 'out'
43762306a36Sopenharmony_ci	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci	pop		{r4-r11,pc}
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci1:	_chacha_permute	12
44262306a36Sopenharmony_ci	b		0b
44362306a36Sopenharmony_ciENDPROC(hchacha_block_arm)
444