/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

 /*
  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
  *
  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
  * (c)  vrev32.16			(16-bit rotations only)
  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
  *					 needs index vector)
  *
  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
  * cycles of (b) on both Cortex-A7 and Cortex-A53.
  *
  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
  * and doesn't need a temporary register.
  *
  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
  * is twice as fast as (a), even when doing (a) on multiple registers
  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
  * parallelizes better when temporary registers are scarce.
  *
  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
  * (a), so the need to load the rotation table actually makes the vtbl method
  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
  * seems to be a good compromise to get a more significant speed boost on some
  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
  */

#include <linux/linkage.h>
#include <asm/cache.h>

	.text
	.fpu		neon		// allow NEON mnemonics in this file
	.align		5		// 2^5 = 32-byte alignment for the code below

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.  Each pass through the loop performs one
 * double round (a column round followed by a diagonal round) and subtracts 2
 * from r3; the loop exits only when r3 reaches exactly zero, so the count
 * must be a non-zero even number.
 *
 * Returns with "bx lr"; the permuted state is left in q0-q3.
 *
 * Clobbers: r3, ip, q4-q5
 *	(q4 is the temporary for the shift-based rotations, d10 holds the
 *	 8-bit-rotation byte index table)
 */
chacha_permute:

	// d10 = byte indices used by vtbl.8 to rotate each 32-bit word left by 8
	adr		ip, .Lrol8_table
	vld1.8		{d10}, [ip, :64]

.Ldoubleround:
	// ---- first half: quarter rounds on the columns ----

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// Rotate rows 1-3 so that the diagonals line up as columns.

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// ---- second half: quarter rounds on the diagonals ----

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// Undo the row rotation (byte offsets 12/8/4 complement 4/8/12 above).

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

	// Two rounds done; loop until the round counter hits zero.
	subs		r3, r3, #2
	bne		.Ldoubleround

	bx		lr
ENDPROC(chacha_permute)

/*
 * chacha_block_xor_neon - XOR one 64-byte block with ChaCha keystream
 *
 * Loads the 64-byte state from [r0], keeps a copy in q8-q11, permutes it
 * with chacha_permute, adds the saved copy back in, XORs the result with the
 * 64 bytes at [r2] and stores the 64-byte result at [r1].
 *
 * Uses q0-q11 and ip, plus whatever chacha_permute clobbers (r3, ip, q4-q5).
 */
ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
	push		{lr}

	// x0..3 = s0..3
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	// Keep the original state in q8-q11 for the final addition;
	// chacha_permute only touches q0-q5.
	vmov		q8, q0
	vmov		q9, q1
	vmov		q10, q2
	vmov		q11, q3

	bl		chacha_permute

	// q4-q7 = the 64 input bytes (q4-q5 are free again after the permute)
	add		ip, r2, #0x20
	vld1.8		{q4-q5}, [r2]
	vld1.8		{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor		q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor		q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor		q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor		q3, q3, q7

	add		ip, r1, #0x20
	vst1.8		{q0-q1}, [r1]
	vst1.8		{q2-q3}, [ip]

	pop		{pc}
ENDPROC(chacha_block_xor_neon)

/*
 * hchacha_block_neon - permute a state and output its first and last rows
 *
 * Loads the 64-byte state from [r0], runs chacha_permute on it and stores
 * q0 (words 0..3) followed by q3 (words 12..15) at [r1], 32 bytes in total.
 * Unlike chacha_block_xor_neon, the original state is not added back in.
 *
 * r0 and r1 are advanced by the post-incrementing accesses; r3 is
 * overwritten with the round count for chacha_permute.
 */
ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
	push		{lr}

	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

	// chacha_permute expects the round count in r3
	mov		r3, r2
	bl		chacha_permute

	// Only rows 0 and 3 of the permuted state are written out.
	vst1.32		{q0}, [r1]!
	vst1.32		{q3}, [r1]

	pop		{pc}
ENDPROC(hchacha_block_neon)

20362306a36Sopenharmony_ci	.align		4
20462306a36Sopenharmony_ci.Lctrinc:	.word	0, 1, 2, 3
20562306a36Sopenharmony_ci.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	.align		5
20862306a36Sopenharmony_ciENTRY(chacha_4block_xor_neon)
20962306a36Sopenharmony_ci	push		{r4, lr}
21062306a36Sopenharmony_ci	mov		r4, sp			// preserve the stack pointer
21162306a36Sopenharmony_ci	sub		ip, sp, #0x20		// allocate a 32 byte buffer
21262306a36Sopenharmony_ci	bic		ip, ip, #0x1f		// aligned to 32 bytes
21362306a36Sopenharmony_ci	mov		sp, ip
21462306a36Sopenharmony_ci
21562306a36Sopenharmony_ci	// r0: Input state matrix, s
21662306a36Sopenharmony_ci	// r1: 4 data blocks output, o
21762306a36Sopenharmony_ci	// r2: 4 data blocks input, i
21862306a36Sopenharmony_ci	// r3: nrounds
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci	//
22162306a36Sopenharmony_ci	// This function encrypts four consecutive ChaCha blocks by loading
22262306a36Sopenharmony_ci	// the state matrix in NEON registers four times. The algorithm performs
22362306a36Sopenharmony_ci	// each operation on the corresponding word of each state matrix, hence
22462306a36Sopenharmony_ci	// requires no word shuffling. The words are re-interleaved before the
22562306a36Sopenharmony_ci	// final addition of the original state and the XORing step.
22662306a36Sopenharmony_ci	//
22762306a36Sopenharmony_ci
22862306a36Sopenharmony_ci	// x0..15[0-3] = s0..15[0-3]
22962306a36Sopenharmony_ci	add		ip, r0, #0x20
23062306a36Sopenharmony_ci	vld1.32		{q0-q1}, [r0]
23162306a36Sopenharmony_ci	vld1.32		{q2-q3}, [ip]
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci	adr		lr, .Lctrinc
23462306a36Sopenharmony_ci	vdup.32		q15, d7[1]
23562306a36Sopenharmony_ci	vdup.32		q14, d7[0]
23662306a36Sopenharmony_ci	vld1.32		{q4}, [lr, :128]
23762306a36Sopenharmony_ci	vdup.32		q13, d6[1]
23862306a36Sopenharmony_ci	vdup.32		q12, d6[0]
23962306a36Sopenharmony_ci	vdup.32		q11, d5[1]
24062306a36Sopenharmony_ci	vdup.32		q10, d5[0]
24162306a36Sopenharmony_ci	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
24262306a36Sopenharmony_ci	vdup.32		q9, d4[1]
24362306a36Sopenharmony_ci	vdup.32		q8, d4[0]
24462306a36Sopenharmony_ci	vdup.32		q7, d3[1]
24562306a36Sopenharmony_ci	vdup.32		q6, d3[0]
24662306a36Sopenharmony_ci	vdup.32		q5, d2[1]
24762306a36Sopenharmony_ci	vdup.32		q4, d2[0]
24862306a36Sopenharmony_ci	vdup.32		q3, d1[1]
24962306a36Sopenharmony_ci	vdup.32		q2, d1[0]
25062306a36Sopenharmony_ci	vdup.32		q1, d0[1]
25162306a36Sopenharmony_ci	vdup.32		q0, d0[0]
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci	adr		ip, .Lrol8_table
25462306a36Sopenharmony_ci	b		1f
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci.Ldoubleround4:
25762306a36Sopenharmony_ci	vld1.32		{q8-q9}, [sp, :256]
25862306a36Sopenharmony_ci1:
25962306a36Sopenharmony_ci	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
26062306a36Sopenharmony_ci	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
26162306a36Sopenharmony_ci	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
26262306a36Sopenharmony_ci	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
26362306a36Sopenharmony_ci	vadd.i32	q0, q0, q4
26462306a36Sopenharmony_ci	vadd.i32	q1, q1, q5
26562306a36Sopenharmony_ci	vadd.i32	q2, q2, q6
26662306a36Sopenharmony_ci	vadd.i32	q3, q3, q7
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_ci	veor		q12, q12, q0
26962306a36Sopenharmony_ci	veor		q13, q13, q1
27062306a36Sopenharmony_ci	veor		q14, q14, q2
27162306a36Sopenharmony_ci	veor		q15, q15, q3
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_ci	vrev32.16	q12, q12
27462306a36Sopenharmony_ci	vrev32.16	q13, q13
27562306a36Sopenharmony_ci	vrev32.16	q14, q14
27662306a36Sopenharmony_ci	vrev32.16	q15, q15
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
27962306a36Sopenharmony_ci	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
28062306a36Sopenharmony_ci	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
28162306a36Sopenharmony_ci	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
28262306a36Sopenharmony_ci	vadd.i32	q8, q8, q12
28362306a36Sopenharmony_ci	vadd.i32	q9, q9, q13
28462306a36Sopenharmony_ci	vadd.i32	q10, q10, q14
28562306a36Sopenharmony_ci	vadd.i32	q11, q11, q15
28662306a36Sopenharmony_ci
28762306a36Sopenharmony_ci	vst1.32		{q8-q9}, [sp, :256]
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	veor		q8, q4, q8
29062306a36Sopenharmony_ci	veor		q9, q5, q9
29162306a36Sopenharmony_ci	vshl.u32	q4, q8, #12
29262306a36Sopenharmony_ci	vshl.u32	q5, q9, #12
29362306a36Sopenharmony_ci	vsri.u32	q4, q8, #20
29462306a36Sopenharmony_ci	vsri.u32	q5, q9, #20
29562306a36Sopenharmony_ci
29662306a36Sopenharmony_ci	veor		q8, q6, q10
29762306a36Sopenharmony_ci	veor		q9, q7, q11
29862306a36Sopenharmony_ci	vshl.u32	q6, q8, #12
29962306a36Sopenharmony_ci	vshl.u32	q7, q9, #12
30062306a36Sopenharmony_ci	vsri.u32	q6, q8, #20
30162306a36Sopenharmony_ci	vsri.u32	q7, q9, #20
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
30462306a36Sopenharmony_ci	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
30562306a36Sopenharmony_ci	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
30662306a36Sopenharmony_ci	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
30762306a36Sopenharmony_ci	vld1.8		{d16}, [ip, :64]
30862306a36Sopenharmony_ci	vadd.i32	q0, q0, q4
30962306a36Sopenharmony_ci	vadd.i32	q1, q1, q5
31062306a36Sopenharmony_ci	vadd.i32	q2, q2, q6
31162306a36Sopenharmony_ci	vadd.i32	q3, q3, q7
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci	veor		q12, q12, q0
31462306a36Sopenharmony_ci	veor		q13, q13, q1
31562306a36Sopenharmony_ci	veor		q14, q14, q2
31662306a36Sopenharmony_ci	veor		q15, q15, q3
31762306a36Sopenharmony_ci
31862306a36Sopenharmony_ci	vtbl.8		d24, {d24}, d16
31962306a36Sopenharmony_ci	vtbl.8		d25, {d25}, d16
32062306a36Sopenharmony_ci	vtbl.8		d26, {d26}, d16
32162306a36Sopenharmony_ci	vtbl.8		d27, {d27}, d16
32262306a36Sopenharmony_ci	vtbl.8		d28, {d28}, d16
32362306a36Sopenharmony_ci	vtbl.8		d29, {d29}, d16
32462306a36Sopenharmony_ci	vtbl.8		d30, {d30}, d16
32562306a36Sopenharmony_ci	vtbl.8		d31, {d31}, d16
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci	vld1.32		{q8-q9}, [sp, :256]
32862306a36Sopenharmony_ci
32962306a36Sopenharmony_ci	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
33062306a36Sopenharmony_ci	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
33162306a36Sopenharmony_ci	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
33262306a36Sopenharmony_ci	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
33362306a36Sopenharmony_ci	vadd.i32	q8, q8, q12
33462306a36Sopenharmony_ci	vadd.i32	q9, q9, q13
33562306a36Sopenharmony_ci	vadd.i32	q10, q10, q14
33662306a36Sopenharmony_ci	vadd.i32	q11, q11, q15
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	vst1.32		{q8-q9}, [sp, :256]
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci	veor		q8, q4, q8
34162306a36Sopenharmony_ci	veor		q9, q5, q9
34262306a36Sopenharmony_ci	vshl.u32	q4, q8, #7
34362306a36Sopenharmony_ci	vshl.u32	q5, q9, #7
34462306a36Sopenharmony_ci	vsri.u32	q4, q8, #25
34562306a36Sopenharmony_ci	vsri.u32	q5, q9, #25
34662306a36Sopenharmony_ci
34762306a36Sopenharmony_ci	veor		q8, q6, q10
34862306a36Sopenharmony_ci	veor		q9, q7, q11
34962306a36Sopenharmony_ci	vshl.u32	q6, q8, #7
35062306a36Sopenharmony_ci	vshl.u32	q7, q9, #7
35162306a36Sopenharmony_ci	vsri.u32	q6, q8, #25
35262306a36Sopenharmony_ci	vsri.u32	q7, q9, #25
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci	vld1.32		{q8-q9}, [sp, :256]
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ci	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
35762306a36Sopenharmony_ci	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
35862306a36Sopenharmony_ci	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
35962306a36Sopenharmony_ci	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
36062306a36Sopenharmony_ci	vadd.i32	q0, q0, q5
36162306a36Sopenharmony_ci	vadd.i32	q1, q1, q6
36262306a36Sopenharmony_ci	vadd.i32	q2, q2, q7
36362306a36Sopenharmony_ci	vadd.i32	q3, q3, q4
36462306a36Sopenharmony_ci
36562306a36Sopenharmony_ci	veor		q15, q15, q0
36662306a36Sopenharmony_ci	veor		q12, q12, q1
36762306a36Sopenharmony_ci	veor		q13, q13, q2
36862306a36Sopenharmony_ci	veor		q14, q14, q3
36962306a36Sopenharmony_ci
37062306a36Sopenharmony_ci	vrev32.16	q15, q15
37162306a36Sopenharmony_ci	vrev32.16	q12, q12
37262306a36Sopenharmony_ci	vrev32.16	q13, q13
37362306a36Sopenharmony_ci	vrev32.16	q14, q14
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
37662306a36Sopenharmony_ci	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
37762306a36Sopenharmony_ci	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
37862306a36Sopenharmony_ci	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
37962306a36Sopenharmony_ci	vadd.i32	q10, q10, q15
38062306a36Sopenharmony_ci	vadd.i32	q11, q11, q12
38162306a36Sopenharmony_ci	vadd.i32	q8, q8, q13
38262306a36Sopenharmony_ci	vadd.i32	q9, q9, q14
38362306a36Sopenharmony_ci
38462306a36Sopenharmony_ci	vst1.32		{q8-q9}, [sp, :256]
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	veor		q8, q7, q8
38762306a36Sopenharmony_ci	veor		q9, q4, q9
38862306a36Sopenharmony_ci	vshl.u32	q7, q8, #12
38962306a36Sopenharmony_ci	vshl.u32	q4, q9, #12
39062306a36Sopenharmony_ci	vsri.u32	q7, q8, #20
39162306a36Sopenharmony_ci	vsri.u32	q4, q9, #20
39262306a36Sopenharmony_ci
39362306a36Sopenharmony_ci	veor		q8, q5, q10
39462306a36Sopenharmony_ci	veor		q9, q6, q11
39562306a36Sopenharmony_ci	vshl.u32	q5, q8, #12
39662306a36Sopenharmony_ci	vshl.u32	q6, q9, #12
39762306a36Sopenharmony_ci	vsri.u32	q5, q8, #20
39862306a36Sopenharmony_ci	vsri.u32	q6, q9, #20
39962306a36Sopenharmony_ci
40062306a36Sopenharmony_ci	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
40162306a36Sopenharmony_ci	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
40262306a36Sopenharmony_ci	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
40362306a36Sopenharmony_ci	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
40462306a36Sopenharmony_ci	vld1.8		{d16}, [ip, :64]
40562306a36Sopenharmony_ci	vadd.i32	q0, q0, q5
40662306a36Sopenharmony_ci	vadd.i32	q1, q1, q6
40762306a36Sopenharmony_ci	vadd.i32	q2, q2, q7
40862306a36Sopenharmony_ci	vadd.i32	q3, q3, q4
40962306a36Sopenharmony_ci
41062306a36Sopenharmony_ci	veor		q15, q15, q0
41162306a36Sopenharmony_ci	veor		q12, q12, q1
41262306a36Sopenharmony_ci	veor		q13, q13, q2
41362306a36Sopenharmony_ci	veor		q14, q14, q3
41462306a36Sopenharmony_ci
41562306a36Sopenharmony_ci	vtbl.8		d30, {d30}, d16
41662306a36Sopenharmony_ci	vtbl.8		d31, {d31}, d16
41762306a36Sopenharmony_ci	vtbl.8		d24, {d24}, d16
41862306a36Sopenharmony_ci	vtbl.8		d25, {d25}, d16
41962306a36Sopenharmony_ci	vtbl.8		d26, {d26}, d16
42062306a36Sopenharmony_ci	vtbl.8		d27, {d27}, d16
42162306a36Sopenharmony_ci	vtbl.8		d28, {d28}, d16
42262306a36Sopenharmony_ci	vtbl.8		d29, {d29}, d16
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci	vld1.32		{q8-q9}, [sp, :256]
42562306a36Sopenharmony_ci
42662306a36Sopenharmony_ci	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
42762306a36Sopenharmony_ci	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
42862306a36Sopenharmony_ci	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
42962306a36Sopenharmony_ci	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
43062306a36Sopenharmony_ci	vadd.i32	q10, q10, q15
43162306a36Sopenharmony_ci	vadd.i32	q11, q11, q12
43262306a36Sopenharmony_ci	vadd.i32	q8, q8, q13
43362306a36Sopenharmony_ci	vadd.i32	q9, q9, q14
43462306a36Sopenharmony_ci
43562306a36Sopenharmony_ci	vst1.32		{q8-q9}, [sp, :256]
43662306a36Sopenharmony_ci
43762306a36Sopenharmony_ci	veor		q8, q7, q8
43862306a36Sopenharmony_ci	veor		q9, q4, q9
43962306a36Sopenharmony_ci	vshl.u32	q7, q8, #7
44062306a36Sopenharmony_ci	vshl.u32	q4, q9, #7
44162306a36Sopenharmony_ci	vsri.u32	q7, q8, #25
44262306a36Sopenharmony_ci	vsri.u32	q4, q9, #25
44362306a36Sopenharmony_ci
44462306a36Sopenharmony_ci	veor		q8, q5, q10
44562306a36Sopenharmony_ci	veor		q9, q6, q11
44662306a36Sopenharmony_ci	vshl.u32	q5, q8, #7
44762306a36Sopenharmony_ci	vshl.u32	q6, q9, #7
44862306a36Sopenharmony_ci	vsri.u32	q5, q8, #25
44962306a36Sopenharmony_ci	vsri.u32	q6, q9, #25
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci	subs		r3, r3, #2
45262306a36Sopenharmony_ci	bne		.Ldoubleround4
45362306a36Sopenharmony_ci
45462306a36Sopenharmony_ci	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
45562306a36Sopenharmony_ci	// x8..9[0-3] are on the stack.
45662306a36Sopenharmony_ci
45762306a36Sopenharmony_ci	// Re-interleave the words in the first two rows of each block (x0..7).
45862306a36Sopenharmony_ci	// Also add the counter values 0-3 to x12[0-3].
45962306a36Sopenharmony_ci	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
46062306a36Sopenharmony_ci	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
46162306a36Sopenharmony_ci	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
46262306a36Sopenharmony_ci	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
46362306a36Sopenharmony_ci	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
46462306a36Sopenharmony_ci	  vadd.u32	q12, q8			// x12 += counter values 0-3
46562306a36Sopenharmony_ci	vswp		d1, d4
46662306a36Sopenharmony_ci	vswp		d3, d6
46762306a36Sopenharmony_ci	  vld1.32	{q8-q9}, [r0]!		// load s0..7
46862306a36Sopenharmony_ci	vswp		d9, d12
46962306a36Sopenharmony_ci	vswp		d11, d14
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
47262306a36Sopenharmony_ci	// after XORing the first 32 bytes.
47362306a36Sopenharmony_ci	vswp		q1, q4
47462306a36Sopenharmony_ci
47562306a36Sopenharmony_ci	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
47662306a36Sopenharmony_ci
47762306a36Sopenharmony_ci	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
47862306a36Sopenharmony_ci	vadd.u32	q0, q0, q8
47962306a36Sopenharmony_ci	vadd.u32	q2, q2, q8
48062306a36Sopenharmony_ci	vadd.u32	q4, q4, q8
48162306a36Sopenharmony_ci	vadd.u32	q3, q3, q8
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_ci	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
48462306a36Sopenharmony_ci	vadd.u32	q1, q1, q9
48562306a36Sopenharmony_ci	vadd.u32	q6, q6, q9
48662306a36Sopenharmony_ci	vadd.u32	q5, q5, q9
48762306a36Sopenharmony_ci	vadd.u32	q7, q7, q9
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci	// XOR first 32 bytes using keystream from first two rows of first block
49062306a36Sopenharmony_ci	vld1.8		{q8-q9}, [r2]!
49162306a36Sopenharmony_ci	veor		q8, q8, q0
49262306a36Sopenharmony_ci	veor		q9, q9, q1
49362306a36Sopenharmony_ci	vst1.8		{q8-q9}, [r1]!
49462306a36Sopenharmony_ci
49562306a36Sopenharmony_ci	// Re-interleave the words in the last two rows of each block (x8..15).
49662306a36Sopenharmony_ci	vld1.32		{q8-q9}, [sp, :256]
49762306a36Sopenharmony_ci	  mov		sp, r4		// restore original stack pointer
49862306a36Sopenharmony_ci	  ldr		r4, [r4, #8]	// load number of bytes
49962306a36Sopenharmony_ci	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
50062306a36Sopenharmony_ci	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
50162306a36Sopenharmony_ci	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
50262306a36Sopenharmony_ci	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
50362306a36Sopenharmony_ci	  vld1.32	{q0-q1}, [r0]	// load s8..15
50462306a36Sopenharmony_ci	vswp		d25, d28
50562306a36Sopenharmony_ci	vswp		d27, d30
50662306a36Sopenharmony_ci	vswp		d17, d20
50762306a36Sopenharmony_ci	vswp		d19, d22
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
51062306a36Sopenharmony_ci
51162306a36Sopenharmony_ci	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
51262306a36Sopenharmony_ci	vadd.u32	q8,  q8,  q0
51362306a36Sopenharmony_ci	vadd.u32	q10, q10, q0
51462306a36Sopenharmony_ci	vadd.u32	q9,  q9,  q0
51562306a36Sopenharmony_ci	vadd.u32	q11, q11, q0
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
51862306a36Sopenharmony_ci	vadd.u32	q12, q12, q1
51962306a36Sopenharmony_ci	vadd.u32	q14, q14, q1
52062306a36Sopenharmony_ci	vadd.u32	q13, q13, q1
52162306a36Sopenharmony_ci	vadd.u32	q15, q15, q1
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_ci	// XOR the rest of the data with the keystream
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci	vld1.8		{q0-q1}, [r2]!
52662306a36Sopenharmony_ci	subs		r4, r4, #96
52762306a36Sopenharmony_ci	veor		q0, q0, q8
52862306a36Sopenharmony_ci	veor		q1, q1, q12
52962306a36Sopenharmony_ci	ble		.Lle96
53062306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]!
53162306a36Sopenharmony_ci
53262306a36Sopenharmony_ci	vld1.8		{q0-q1}, [r2]!
53362306a36Sopenharmony_ci	subs		r4, r4, #32
53462306a36Sopenharmony_ci	veor		q0, q0, q2
53562306a36Sopenharmony_ci	veor		q1, q1, q6
53662306a36Sopenharmony_ci	ble		.Lle128
53762306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]!
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_ci	vld1.8		{q0-q1}, [r2]!
54062306a36Sopenharmony_ci	subs		r4, r4, #32
54162306a36Sopenharmony_ci	veor		q0, q0, q10
54262306a36Sopenharmony_ci	veor		q1, q1, q14
54362306a36Sopenharmony_ci	ble		.Lle160
54462306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]!
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci	vld1.8		{q0-q1}, [r2]!
54762306a36Sopenharmony_ci	subs		r4, r4, #32
54862306a36Sopenharmony_ci	veor		q0, q0, q4
54962306a36Sopenharmony_ci	veor		q1, q1, q5
55062306a36Sopenharmony_ci	ble		.Lle192
55162306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]!
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci	vld1.8		{q0-q1}, [r2]!
55462306a36Sopenharmony_ci	subs		r4, r4, #32
55562306a36Sopenharmony_ci	veor		q0, q0, q9
55662306a36Sopenharmony_ci	veor		q1, q1, q13
55762306a36Sopenharmony_ci	ble		.Lle224
55862306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]!
55962306a36Sopenharmony_ci
56062306a36Sopenharmony_ci	vld1.8		{q0-q1}, [r2]!
56162306a36Sopenharmony_ci	subs		r4, r4, #32
56262306a36Sopenharmony_ci	veor		q0, q0, q3
56362306a36Sopenharmony_ci	veor		q1, q1, q7
56462306a36Sopenharmony_ci	blt		.Llt256
56562306a36Sopenharmony_ci.Lout:
56662306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]!
56762306a36Sopenharmony_ci
56862306a36Sopenharmony_ci	vld1.8		{q0-q1}, [r2]
56962306a36Sopenharmony_ci	veor		q0, q0, q11
57062306a36Sopenharmony_ci	veor		q1, q1, q15
57162306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]
57262306a36Sopenharmony_ci
57362306a36Sopenharmony_ci	pop		{r4, pc}
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci.Lle192:
57662306a36Sopenharmony_ci	vmov		q4, q9
57762306a36Sopenharmony_ci	vmov		q5, q13
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci.Lle160:
58062306a36Sopenharmony_ci	// nothing to do
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci.Lfinalblock:
58362306a36Sopenharmony_ci	// Process the final block if processing less than 4 full blocks.
58462306a36Sopenharmony_ci	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
58562306a36Sopenharmony_ci	// previous 32 byte output block that still needs to be written at
58662306a36Sopenharmony_ci	// [r1] in q0-q1.
58762306a36Sopenharmony_ci	beq		.Lfullblock
58862306a36Sopenharmony_ci
58962306a36Sopenharmony_ci.Lpartialblock:
59062306a36Sopenharmony_ci	adr		lr, .Lpermute + 32
59162306a36Sopenharmony_ci	add		r2, r2, r4
59262306a36Sopenharmony_ci	add		lr, lr, r4
59362306a36Sopenharmony_ci	add		r4, r4, r1
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ci	vld1.8		{q2-q3}, [lr]
59662306a36Sopenharmony_ci	vld1.8		{q6-q7}, [r2]
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci	add		r4, r4, #32
59962306a36Sopenharmony_ci
60062306a36Sopenharmony_ci	vtbl.8		d4, {q4-q5}, d4
60162306a36Sopenharmony_ci	vtbl.8		d5, {q4-q5}, d5
60262306a36Sopenharmony_ci	vtbl.8		d6, {q4-q5}, d6
60362306a36Sopenharmony_ci	vtbl.8		d7, {q4-q5}, d7
60462306a36Sopenharmony_ci
60562306a36Sopenharmony_ci	veor		q6, q6, q2
60662306a36Sopenharmony_ci	veor		q7, q7, q3
60762306a36Sopenharmony_ci
60862306a36Sopenharmony_ci	vst1.8		{q6-q7}, [r4]	// overlapping stores
60962306a36Sopenharmony_ci	vst1.8		{q0-q1}, [r1]
61062306a36Sopenharmony_ci	pop		{r4, pc}
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci.Lfullblock:
61362306a36Sopenharmony_ci	vmov		q11, q4
61462306a36Sopenharmony_ci	vmov		q15, q5
61562306a36Sopenharmony_ci	b		.Lout
61662306a36Sopenharmony_ci.Lle96:
61762306a36Sopenharmony_ci	vmov		q4, q2
61862306a36Sopenharmony_ci	vmov		q5, q6
61962306a36Sopenharmony_ci	b		.Lfinalblock
62062306a36Sopenharmony_ci.Lle128:
62162306a36Sopenharmony_ci	vmov		q4, q10
62262306a36Sopenharmony_ci	vmov		q5, q14
62362306a36Sopenharmony_ci	b		.Lfinalblock
62462306a36Sopenharmony_ci.Lle224:
62562306a36Sopenharmony_ci	vmov		q4, q3
62662306a36Sopenharmony_ci	vmov		q5, q7
62762306a36Sopenharmony_ci	b		.Lfinalblock
62862306a36Sopenharmony_ci.Llt256:
62962306a36Sopenharmony_ci	vmov		q4, q11
63062306a36Sopenharmony_ci	vmov		q5, q15
63162306a36Sopenharmony_ci	b		.Lpartialblock
63262306a36Sopenharmony_ciENDPROC(chacha_4block_xor_neon)
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	.align		L1_CACHE_SHIFT
63562306a36Sopenharmony_ci.Lpermute:
63662306a36Sopenharmony_ci	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
63762306a36Sopenharmony_ci	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
63862306a36Sopenharmony_ci	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
63962306a36Sopenharmony_ci	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
64062306a36Sopenharmony_ci	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
64162306a36Sopenharmony_ci	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
64262306a36Sopenharmony_ci	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
64362306a36Sopenharmony_ci	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
644