162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * BLAKE2b digest algorithm, NEON accelerated
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright 2020 Google LLC
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Author: Eric Biggers <ebiggers@google.com>
862306a36Sopenharmony_ci */
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include <linux/linkage.h>
1162306a36Sopenharmony_ci
	.text
	// Let the assembler accept the NEON (Advanced SIMD) instructions below.
	.fpu		neon

	// Symbolic register names (.req aliases) shared by the _blake2b_round
	// macro and blake2b_compress_neon().

	// The arguments to blake2b_compress_neon()
	//
	//   STATE   - pointer to the state; h[] is at offset 0 and t[]/f[] are
	//             read from offset 64 (see the loads in the function body).
	//   BLOCK   - pointer to the message data; post-incremented by 128 bytes
	//             for every block that is compressed.
	//   NBLOCKS - number of blocks left; decremented once per block.
	//   INC     - value added to the low 32 bits of t[0] for every block.
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	// (.Lror24_table and .Lror16_table; set up once with 'adr' and then
	// only read by _blake2b_round).
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	// (saved before sp is aligned down for the spill buffer, and copied
	// back into sp just before the function returns).
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	//
	// d16-d31 are the halves of q8-q15, so M_(2n) and M_(2n+1) together
	// form q(8+n).  _blake2b_round uses M_0 to hold a rotation table and
	// q8-q9 (M_0-M_3) as shift temporaries, reloading them from the copy
	// kept at [sp].
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31
4662306a36Sopenharmony_ci
	// Read-only constants, kept in .text so that they are reachable with
	// 'adr'.  On ARM, '.align 4' requests 2^4 = 16-byte alignment.  The
	// two tables are loaded with a ':64' alignment hint, so each must be
	// 8-byte aligned: the first sits on the 16-byte boundary and is
	// exactly 8 bytes long, which places the second at offset 8.
	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction.  This is the most efficient way to implement these
	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
	//
	// Each entry is a source byte index: result byte i is taken from
	// source byte table[i].  Byte 0 is the least significant byte of the
	// 64-bit lane, so starting at source byte 3 (resp. 2) rotates the lane
	// right by 3 bytes = 24 bits (resp. 2 bytes = 16 bits).
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
	//
	// Eight 64-bit words, IV[0..7].  blake2b_compress_neon() loads
	// IV[0..3] as v[8..11] and XORs IV[4..7] with t[0..1] and f[0..1] to
	// form v[12..15].
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
6262306a36Sopenharmony_ci
// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
//
// Register map (one 64-bit word per 'd' register, v[i] lives in d<i>):
//	a = v[0..3]   = q0-q1 (d0-d3)
//	b = v[4..7]   = q2-q3 (d4-d7)
//	c = v[8..11]  = q4-q5 (d8-d11)
//	d = v[12..15] = q6-q7 (d12-d15)
//
// Inputs read but not modified: ROR24_TABLE, ROR16_TABLE, sp, M_4-M_15.
// Clobbers: M_0-M_3 (q8-q9) are overwritten several times.  They are restored
// from [sp] before the macro ends unless 'final' is nonzero, in which case
// they are left holding temporaries.
//
// The instruction order matters: M_0 doubles as the vtbl index register, so
// every use of M_0 as a message word must come after a reload from [sp].
.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
	//
	// All four columns line up across the a/b/c/d rows, so the row-wide
	// operations can use full 128-bit 'q' registers.  Only the message
	// word additions are done per 'd' register, because each column gets
	// a different word.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	// (vrev64.32 swaps the two 32-bit halves of each 64-bit lane, which is
	// exactly a rotation by 32.)
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	// (M_0 is overwritten with the byte-permutation table used by vtbl.8;
	// the message words for this step were already consumed above.)
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
	// (The first 8 bytes of the stack buffer hold the saved M_0.)
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
	//
	// With x = b ^ c held in the temporary: vshr leaves x >> 63 in the
	// destination, then vsli inserts x << 1 above bit 0 while keeping
	// that low bit, giving (x << 1) | (x >> 63) = ror64(x, 63).
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.
	//
	// In every group of four instructions below, the lines handle the four
	// diagonals in the order listed above, i.e. the register tuples
	// (d0, d5, d10, d15), (d1, d6, d11, d12), (d2, d7, d8, d13) and
	// (d3, d4, d9, d14).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	// (Same conditional reload of M_0 as in the column step.)
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	//
	// The XOR results are written to d16-d19 in the register order of b
	// (d4, d5, d6, d7), each paired with its own diagonal's c word.  That
	// way q8-q9 line up with q2-q3 and the shifts can again be done on
	// whole 'q' registers.
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm
23562306a36Sopenharmony_ci
//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
// Compresses 'nblocks' consecutive 128-byte message blocks starting at
// 'block' into the chaining value h[].  Before each block, 'inc' is added to
// the 128-bit counter t[] and the new value is written back to the state.
//
// 'nblocks' must be at least 1: the count is only tested after a block has
// been processed, so a value of 0 would wrap around instead of returning.
//
// Register usage: r4-r10 are saved and restored here; ip (r12) and r1-r2 are
// modified.  All of q0-q15 are overwritten and none of them are saved.
// NOTE(review): the caller is presumably responsible for making NEON usable
// and preserving its contents (e.g. kernel_neon_begin/end) - confirm.
//
// Stack usage: the 7 pushed registers plus a 32-byte buffer aligned down to a
// 32-byte boundary.
//
// Lines indented by two extra spaces appear to mark instructions that are
// interleaved with, but logically separate from, the surrounding sequence.
//
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	// (The alignment is what allows the ':256' hints on the spill and
	// reload of q8-q9.)
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	// The two post-incremented loads leave ip = STATE + 64, i.e. pointing
	// at t[0], which is what .Lnext_block expects.
	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	// On entry: q0-q3 = h[0..7] (which is also v[0..7]), ip = &t[0].
	// q14-q15 temporarily hold t[] and f[]; they are overwritten with
	// message words M_12-M_15 once v[12..15] has been computed.
	  adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	  adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	// Fast path: no carry out of the low 32 bits, so only that word of
	// t[0] changes.  Patch it into d28 and write t[0] back.
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:
	// Here q14 holds the already incremented counter on both paths.

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	//
	// The two veor instructions must stay ahead of the final load below,
	// which replaces t[]/f[] in q14-q15 with message words.  ip is reset
	// to STATE here so that the old h[] can be reloaded after the rounds.
	vld1.8		{q8-q9}, [BLOCK]!
	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	  mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	// There are 12 rounds; the last two reuse the orderings of the first
	// two.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// The message registers q8-q11 are free now (the final round did not
	// reload q8-q9), so they are reused for the old h[].  The new h[]
	// stays in q0-q3 for the next block, and the two post-incremented
	// stores leave ip = &t[0] again.  None of the NEON instructions after
	// 'subs' modify the flags that 'bne' tests.
	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	  mov		ip, STATE
	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	// Release the aligned buffer, restore r4-r10 and return.
	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	//
	// Reached with the carry flag set by 'adds', r7 = new low word of
	// t[0] and r8 = old high word of t[0].  vmov does not touch the
	// flags, so the carry is still intact for the adcs chain, which
	// ripples it through the remaining three 32-bit words.
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)
348