// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 */
562306a36Sopenharmony_ci
662306a36Sopenharmony_ci#ifdef CONFIG_ARM64
762306a36Sopenharmony_ci#include <asm/neon-intrinsics.h>
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#define AES_ROUND	"aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
1062306a36Sopenharmony_ci#else
1162306a36Sopenharmony_ci#include <arm_neon.h>
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci#define AES_ROUND	"aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
1462306a36Sopenharmony_ci#endif
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci#define AEGIS_BLOCK_SIZE	16
1762306a36Sopenharmony_ci
1862306a36Sopenharmony_ci#include <stddef.h>
1962306a36Sopenharmony_ci#include "aegis-neon.h"
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ciextern int aegis128_have_aes_insn;
2262306a36Sopenharmony_ci
2362306a36Sopenharmony_civoid *memcpy(void *dest, const void *src, size_t n);
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_cistruct aegis128_state {
2662306a36Sopenharmony_ci	uint8x16_t v[5];
2762306a36Sopenharmony_ci};
2862306a36Sopenharmony_ci
2962306a36Sopenharmony_ciextern const uint8_t crypto_aes_sbox[];
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_cistatic struct aegis128_state aegis128_load_state_neon(const void *state)
3262306a36Sopenharmony_ci{
3362306a36Sopenharmony_ci	return (struct aegis128_state){ {
3462306a36Sopenharmony_ci		vld1q_u8(state),
3562306a36Sopenharmony_ci		vld1q_u8(state + 16),
3662306a36Sopenharmony_ci		vld1q_u8(state + 32),
3762306a36Sopenharmony_ci		vld1q_u8(state + 48),
3862306a36Sopenharmony_ci		vld1q_u8(state + 64)
3962306a36Sopenharmony_ci	} };
4062306a36Sopenharmony_ci}
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_cistatic void aegis128_save_state_neon(struct aegis128_state st, void *state)
4362306a36Sopenharmony_ci{
4462306a36Sopenharmony_ci	vst1q_u8(state, st.v[0]);
4562306a36Sopenharmony_ci	vst1q_u8(state + 16, st.v[1]);
4662306a36Sopenharmony_ci	vst1q_u8(state + 32, st.v[2]);
4762306a36Sopenharmony_ci	vst1q_u8(state + 48, st.v[3]);
4862306a36Sopenharmony_ci	vst1q_u8(state + 64, st.v[4]);
4962306a36Sopenharmony_ci}
5062306a36Sopenharmony_ci
5162306a36Sopenharmony_cistatic inline __attribute__((always_inline))
5262306a36Sopenharmony_ciuint8x16_t aegis_aes_round(uint8x16_t w)
5362306a36Sopenharmony_ci{
5462306a36Sopenharmony_ci	uint8x16_t z = {};
5562306a36Sopenharmony_ci
5662306a36Sopenharmony_ci#ifdef CONFIG_ARM64
5762306a36Sopenharmony_ci	if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
5862306a36Sopenharmony_ci		static const uint8_t shift_rows[] = {
5962306a36Sopenharmony_ci			0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
6062306a36Sopenharmony_ci			0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
6162306a36Sopenharmony_ci		};
6262306a36Sopenharmony_ci		static const uint8_t ror32by8[] = {
6362306a36Sopenharmony_ci			0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
6462306a36Sopenharmony_ci			0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
6562306a36Sopenharmony_ci		};
6662306a36Sopenharmony_ci		uint8x16_t v;
6762306a36Sopenharmony_ci
6862306a36Sopenharmony_ci		// shift rows
6962306a36Sopenharmony_ci		w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ci		// sub bytes
7262306a36Sopenharmony_ci#ifndef CONFIG_CC_IS_GCC
7362306a36Sopenharmony_ci		v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
7462306a36Sopenharmony_ci		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
7562306a36Sopenharmony_ci		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
7662306a36Sopenharmony_ci		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
7762306a36Sopenharmony_ci#else
7862306a36Sopenharmony_ci		asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
7962306a36Sopenharmony_ci		w -= 0x40;
8062306a36Sopenharmony_ci		asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
8162306a36Sopenharmony_ci		w -= 0x40;
8262306a36Sopenharmony_ci		asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
8362306a36Sopenharmony_ci		w -= 0x40;
8462306a36Sopenharmony_ci		asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
8562306a36Sopenharmony_ci#endif
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci		// mix columns
8862306a36Sopenharmony_ci		w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
8962306a36Sopenharmony_ci		w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
9062306a36Sopenharmony_ci		w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
9162306a36Sopenharmony_ci
9262306a36Sopenharmony_ci		return w;
9362306a36Sopenharmony_ci	}
9462306a36Sopenharmony_ci#endif
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci	/*
9762306a36Sopenharmony_ci	 * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
9862306a36Sopenharmony_ci	 * to force the compiler to issue the aese/aesmc instructions in pairs.
9962306a36Sopenharmony_ci	 * This is much faster on many cores, where the instruction pair can
10062306a36Sopenharmony_ci	 * execute in a single cycle.
10162306a36Sopenharmony_ci	 */
10262306a36Sopenharmony_ci	asm(AES_ROUND : "+w"(w) : "w"(z));
10362306a36Sopenharmony_ci	return w;
10462306a36Sopenharmony_ci}
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_cistatic inline __attribute__((always_inline))
10762306a36Sopenharmony_cistruct aegis128_state aegis128_update_neon(struct aegis128_state st,
10862306a36Sopenharmony_ci					   uint8x16_t m)
10962306a36Sopenharmony_ci{
11062306a36Sopenharmony_ci	m       ^= aegis_aes_round(st.v[4]);
11162306a36Sopenharmony_ci	st.v[4] ^= aegis_aes_round(st.v[3]);
11262306a36Sopenharmony_ci	st.v[3] ^= aegis_aes_round(st.v[2]);
11362306a36Sopenharmony_ci	st.v[2] ^= aegis_aes_round(st.v[1]);
11462306a36Sopenharmony_ci	st.v[1] ^= aegis_aes_round(st.v[0]);
11562306a36Sopenharmony_ci	st.v[0] ^= m;
11662306a36Sopenharmony_ci
11762306a36Sopenharmony_ci	return st;
11862306a36Sopenharmony_ci}
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_cistatic inline __attribute__((always_inline))
12162306a36Sopenharmony_civoid preload_sbox(void)
12262306a36Sopenharmony_ci{
12362306a36Sopenharmony_ci	if (!IS_ENABLED(CONFIG_ARM64) ||
12462306a36Sopenharmony_ci	    !IS_ENABLED(CONFIG_CC_IS_GCC) ||
12562306a36Sopenharmony_ci	    __builtin_expect(aegis128_have_aes_insn, 1))
12662306a36Sopenharmony_ci		return;
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci	asm("ld1	{v16.16b-v19.16b}, [%0], #64	\n\t"
12962306a36Sopenharmony_ci	    "ld1	{v20.16b-v23.16b}, [%0], #64	\n\t"
13062306a36Sopenharmony_ci	    "ld1	{v24.16b-v27.16b}, [%0], #64	\n\t"
13162306a36Sopenharmony_ci	    "ld1	{v28.16b-v31.16b}, [%0]		\n\t"
13262306a36Sopenharmony_ci	    :: "r"(crypto_aes_sbox));
13362306a36Sopenharmony_ci}
13462306a36Sopenharmony_ci
13562306a36Sopenharmony_civoid crypto_aegis128_init_neon(void *state, const void *key, const void *iv)
13662306a36Sopenharmony_ci{
13762306a36Sopenharmony_ci	static const uint8_t const0[] = {
13862306a36Sopenharmony_ci		0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
13962306a36Sopenharmony_ci		0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
14062306a36Sopenharmony_ci	};
14162306a36Sopenharmony_ci	static const uint8_t const1[] = {
14262306a36Sopenharmony_ci		0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
14362306a36Sopenharmony_ci		0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
14462306a36Sopenharmony_ci	};
14562306a36Sopenharmony_ci	uint8x16_t k = vld1q_u8(key);
14662306a36Sopenharmony_ci	uint8x16_t kiv = k ^ vld1q_u8(iv);
14762306a36Sopenharmony_ci	struct aegis128_state st = {{
14862306a36Sopenharmony_ci		kiv,
14962306a36Sopenharmony_ci		vld1q_u8(const1),
15062306a36Sopenharmony_ci		vld1q_u8(const0),
15162306a36Sopenharmony_ci		k ^ vld1q_u8(const0),
15262306a36Sopenharmony_ci		k ^ vld1q_u8(const1),
15362306a36Sopenharmony_ci	}};
15462306a36Sopenharmony_ci	int i;
15562306a36Sopenharmony_ci
15662306a36Sopenharmony_ci	preload_sbox();
15762306a36Sopenharmony_ci
15862306a36Sopenharmony_ci	for (i = 0; i < 5; i++) {
15962306a36Sopenharmony_ci		st = aegis128_update_neon(st, k);
16062306a36Sopenharmony_ci		st = aegis128_update_neon(st, kiv);
16162306a36Sopenharmony_ci	}
16262306a36Sopenharmony_ci	aegis128_save_state_neon(st, state);
16362306a36Sopenharmony_ci}
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_civoid crypto_aegis128_update_neon(void *state, const void *msg)
16662306a36Sopenharmony_ci{
16762306a36Sopenharmony_ci	struct aegis128_state st = aegis128_load_state_neon(state);
16862306a36Sopenharmony_ci
16962306a36Sopenharmony_ci	preload_sbox();
17062306a36Sopenharmony_ci
17162306a36Sopenharmony_ci	st = aegis128_update_neon(st, vld1q_u8(msg));
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	aegis128_save_state_neon(st, state);
17462306a36Sopenharmony_ci}
17562306a36Sopenharmony_ci
17662306a36Sopenharmony_ci#ifdef CONFIG_ARM
/*
 * AArch32 does not provide these intrinsics natively because it does not
 * implement the underlying instructions. AArch32 only provides the 64-bit
 * wide vtbl.8/vtbx.8 instructions, so use those instead.
 */
18262306a36Sopenharmony_cistatic uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
18362306a36Sopenharmony_ci{
18462306a36Sopenharmony_ci	union {
18562306a36Sopenharmony_ci		uint8x16_t	val;
18662306a36Sopenharmony_ci		uint8x8x2_t	pair;
18762306a36Sopenharmony_ci	} __a = { a };
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
19062306a36Sopenharmony_ci			   vtbl2_u8(__a.pair, vget_high_u8(b)));
19162306a36Sopenharmony_ci}
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_cistatic uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
19462306a36Sopenharmony_ci{
19562306a36Sopenharmony_ci	union {
19662306a36Sopenharmony_ci		uint8x16_t	val;
19762306a36Sopenharmony_ci		uint8x8x2_t	pair;
19862306a36Sopenharmony_ci	} __a = { a };
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci	return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
20162306a36Sopenharmony_ci			   vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
20262306a36Sopenharmony_ci}
20362306a36Sopenharmony_ci
20462306a36Sopenharmony_cistatic int8_t vminvq_s8(int8x16_t v)
20562306a36Sopenharmony_ci{
20662306a36Sopenharmony_ci	int8x8_t s = vpmin_s8(vget_low_s8(v), vget_high_s8(v));
20762306a36Sopenharmony_ci
20862306a36Sopenharmony_ci	s = vpmin_s8(s, s);
20962306a36Sopenharmony_ci	s = vpmin_s8(s, s);
21062306a36Sopenharmony_ci	s = vpmin_s8(s, s);
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	return vget_lane_s8(s, 0);
21362306a36Sopenharmony_ci}
21462306a36Sopenharmony_ci#endif
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_cistatic const uint8_t permute[] __aligned(64) = {
21762306a36Sopenharmony_ci	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
21862306a36Sopenharmony_ci	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
21962306a36Sopenharmony_ci	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
22062306a36Sopenharmony_ci};
22162306a36Sopenharmony_ci
22262306a36Sopenharmony_civoid crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
22362306a36Sopenharmony_ci					unsigned int size)
22462306a36Sopenharmony_ci{
22562306a36Sopenharmony_ci	struct aegis128_state st = aegis128_load_state_neon(state);
22662306a36Sopenharmony_ci	const int short_input = size < AEGIS_BLOCK_SIZE;
22762306a36Sopenharmony_ci	uint8x16_t msg;
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci	preload_sbox();
23062306a36Sopenharmony_ci
23162306a36Sopenharmony_ci	while (size >= AEGIS_BLOCK_SIZE) {
23262306a36Sopenharmony_ci		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_ci		msg = vld1q_u8(src);
23562306a36Sopenharmony_ci		st = aegis128_update_neon(st, msg);
23662306a36Sopenharmony_ci		msg ^= s;
23762306a36Sopenharmony_ci		vst1q_u8(dst, msg);
23862306a36Sopenharmony_ci
23962306a36Sopenharmony_ci		size -= AEGIS_BLOCK_SIZE;
24062306a36Sopenharmony_ci		src += AEGIS_BLOCK_SIZE;
24162306a36Sopenharmony_ci		dst += AEGIS_BLOCK_SIZE;
24262306a36Sopenharmony_ci	}
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci	if (size > 0) {
24562306a36Sopenharmony_ci		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
24662306a36Sopenharmony_ci		uint8_t buf[AEGIS_BLOCK_SIZE];
24762306a36Sopenharmony_ci		const void *in = src;
24862306a36Sopenharmony_ci		void *out = dst;
24962306a36Sopenharmony_ci		uint8x16_t m;
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci		if (__builtin_expect(short_input, 0))
25262306a36Sopenharmony_ci			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci		m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
25562306a36Sopenharmony_ci			       vld1q_u8(permute + 32 - size));
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci		st = aegis128_update_neon(st, m);
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
26062306a36Sopenharmony_ci			 vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));
26162306a36Sopenharmony_ci
26262306a36Sopenharmony_ci		if (__builtin_expect(short_input, 0))
26362306a36Sopenharmony_ci			memcpy(dst, out, size);
26462306a36Sopenharmony_ci		else
26562306a36Sopenharmony_ci			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
26662306a36Sopenharmony_ci	}
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_ci	aegis128_save_state_neon(st, state);
26962306a36Sopenharmony_ci}
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_civoid crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
27262306a36Sopenharmony_ci					unsigned int size)
27362306a36Sopenharmony_ci{
27462306a36Sopenharmony_ci	struct aegis128_state st = aegis128_load_state_neon(state);
27562306a36Sopenharmony_ci	const int short_input = size < AEGIS_BLOCK_SIZE;
27662306a36Sopenharmony_ci	uint8x16_t msg;
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci	preload_sbox();
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	while (size >= AEGIS_BLOCK_SIZE) {
28162306a36Sopenharmony_ci		msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
28262306a36Sopenharmony_ci		st = aegis128_update_neon(st, msg);
28362306a36Sopenharmony_ci		vst1q_u8(dst, msg);
28462306a36Sopenharmony_ci
28562306a36Sopenharmony_ci		size -= AEGIS_BLOCK_SIZE;
28662306a36Sopenharmony_ci		src += AEGIS_BLOCK_SIZE;
28762306a36Sopenharmony_ci		dst += AEGIS_BLOCK_SIZE;
28862306a36Sopenharmony_ci	}
28962306a36Sopenharmony_ci
29062306a36Sopenharmony_ci	if (size > 0) {
29162306a36Sopenharmony_ci		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
29262306a36Sopenharmony_ci		uint8_t buf[AEGIS_BLOCK_SIZE];
29362306a36Sopenharmony_ci		const void *in = src;
29462306a36Sopenharmony_ci		void *out = dst;
29562306a36Sopenharmony_ci		uint8x16_t m;
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_ci		if (__builtin_expect(short_input, 0))
29862306a36Sopenharmony_ci			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
29962306a36Sopenharmony_ci
30062306a36Sopenharmony_ci		m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
30162306a36Sopenharmony_ci				   vld1q_u8(permute + 32 - size));
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci		st = aegis128_update_neon(st, m);
30462306a36Sopenharmony_ci
30562306a36Sopenharmony_ci		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
30662306a36Sopenharmony_ci			 vqtbl1q_u8(m, vld1q_u8(permute + size)));
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci		if (__builtin_expect(short_input, 0))
30962306a36Sopenharmony_ci			memcpy(dst, out, size);
31062306a36Sopenharmony_ci		else
31162306a36Sopenharmony_ci			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
31262306a36Sopenharmony_ci	}
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_ci	aegis128_save_state_neon(st, state);
31562306a36Sopenharmony_ci}
31662306a36Sopenharmony_ci
31762306a36Sopenharmony_ciint crypto_aegis128_final_neon(void *state, void *tag_xor,
31862306a36Sopenharmony_ci			       unsigned int assoclen,
31962306a36Sopenharmony_ci			       unsigned int cryptlen,
32062306a36Sopenharmony_ci			       unsigned int authsize)
32162306a36Sopenharmony_ci{
32262306a36Sopenharmony_ci	struct aegis128_state st = aegis128_load_state_neon(state);
32362306a36Sopenharmony_ci	uint8x16_t v;
32462306a36Sopenharmony_ci	int i;
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_ci	preload_sbox();
32762306a36Sopenharmony_ci
32862306a36Sopenharmony_ci	v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
32962306a36Sopenharmony_ci					       vmov_n_u64(8ULL * cryptlen));
33062306a36Sopenharmony_ci
33162306a36Sopenharmony_ci	for (i = 0; i < 7; i++)
33262306a36Sopenharmony_ci		st = aegis128_update_neon(st, v);
33362306a36Sopenharmony_ci
33462306a36Sopenharmony_ci	v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4];
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_ci	if (authsize > 0) {
33762306a36Sopenharmony_ci		v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
33862306a36Sopenharmony_ci			       vld1q_u8(permute + authsize));
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci		return vminvq_s8((int8x16_t)v);
34162306a36Sopenharmony_ci	}
34262306a36Sopenharmony_ci
34362306a36Sopenharmony_ci	vst1q_u8(tag_xor, v);
34462306a36Sopenharmony_ci	return 0;
34562306a36Sopenharmony_ci}
346