// SPDX-License-Identifier: GPL-2.0
/*
 * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
 *
 * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */
78c2ecf20Sopenharmony_ci
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/jump_label.h>
#include <linux/module.h>
#include <linux/string.h>
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ciasmlinkage void poly1305_init_arm64(void *state, const u8 *key);
228c2ecf20Sopenharmony_ciasmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
238c2ecf20Sopenharmony_ciasmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
248c2ecf20Sopenharmony_ciasmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_cistatic __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
278c2ecf20Sopenharmony_ci
288c2ecf20Sopenharmony_civoid poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
298c2ecf20Sopenharmony_ci{
308c2ecf20Sopenharmony_ci	poly1305_init_arm64(&dctx->h, key);
318c2ecf20Sopenharmony_ci	dctx->s[0] = get_unaligned_le32(key + 16);
328c2ecf20Sopenharmony_ci	dctx->s[1] = get_unaligned_le32(key + 20);
338c2ecf20Sopenharmony_ci	dctx->s[2] = get_unaligned_le32(key + 24);
348c2ecf20Sopenharmony_ci	dctx->s[3] = get_unaligned_le32(key + 28);
358c2ecf20Sopenharmony_ci	dctx->buflen = 0;
368c2ecf20Sopenharmony_ci}
378c2ecf20Sopenharmony_ciEXPORT_SYMBOL(poly1305_init_arch);
388c2ecf20Sopenharmony_ci
398c2ecf20Sopenharmony_cistatic int neon_poly1305_init(struct shash_desc *desc)
408c2ecf20Sopenharmony_ci{
418c2ecf20Sopenharmony_ci	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
428c2ecf20Sopenharmony_ci
438c2ecf20Sopenharmony_ci	dctx->buflen = 0;
448c2ecf20Sopenharmony_ci	dctx->rset = 0;
458c2ecf20Sopenharmony_ci	dctx->sset = false;
468c2ecf20Sopenharmony_ci
478c2ecf20Sopenharmony_ci	return 0;
488c2ecf20Sopenharmony_ci}
498c2ecf20Sopenharmony_ci
508c2ecf20Sopenharmony_cistatic void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
518c2ecf20Sopenharmony_ci				 u32 len, u32 hibit, bool do_neon)
528c2ecf20Sopenharmony_ci{
538c2ecf20Sopenharmony_ci	if (unlikely(!dctx->sset)) {
548c2ecf20Sopenharmony_ci		if (!dctx->rset) {
558c2ecf20Sopenharmony_ci			poly1305_init_arm64(&dctx->h, src);
568c2ecf20Sopenharmony_ci			src += POLY1305_BLOCK_SIZE;
578c2ecf20Sopenharmony_ci			len -= POLY1305_BLOCK_SIZE;
588c2ecf20Sopenharmony_ci			dctx->rset = 1;
598c2ecf20Sopenharmony_ci		}
608c2ecf20Sopenharmony_ci		if (len >= POLY1305_BLOCK_SIZE) {
618c2ecf20Sopenharmony_ci			dctx->s[0] = get_unaligned_le32(src +  0);
628c2ecf20Sopenharmony_ci			dctx->s[1] = get_unaligned_le32(src +  4);
638c2ecf20Sopenharmony_ci			dctx->s[2] = get_unaligned_le32(src +  8);
648c2ecf20Sopenharmony_ci			dctx->s[3] = get_unaligned_le32(src + 12);
658c2ecf20Sopenharmony_ci			src += POLY1305_BLOCK_SIZE;
668c2ecf20Sopenharmony_ci			len -= POLY1305_BLOCK_SIZE;
678c2ecf20Sopenharmony_ci			dctx->sset = true;
688c2ecf20Sopenharmony_ci		}
698c2ecf20Sopenharmony_ci		if (len < POLY1305_BLOCK_SIZE)
708c2ecf20Sopenharmony_ci			return;
718c2ecf20Sopenharmony_ci	}
728c2ecf20Sopenharmony_ci
738c2ecf20Sopenharmony_ci	len &= ~(POLY1305_BLOCK_SIZE - 1);
748c2ecf20Sopenharmony_ci
758c2ecf20Sopenharmony_ci	if (static_branch_likely(&have_neon) && likely(do_neon))
768c2ecf20Sopenharmony_ci		poly1305_blocks_neon(&dctx->h, src, len, hibit);
778c2ecf20Sopenharmony_ci	else
788c2ecf20Sopenharmony_ci		poly1305_blocks(&dctx->h, src, len, hibit);
798c2ecf20Sopenharmony_ci}
808c2ecf20Sopenharmony_ci
818c2ecf20Sopenharmony_cistatic void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
828c2ecf20Sopenharmony_ci				    const u8 *src, u32 len, bool do_neon)
838c2ecf20Sopenharmony_ci{
848c2ecf20Sopenharmony_ci	if (unlikely(dctx->buflen)) {
858c2ecf20Sopenharmony_ci		u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
868c2ecf20Sopenharmony_ci
878c2ecf20Sopenharmony_ci		memcpy(dctx->buf + dctx->buflen, src, bytes);
888c2ecf20Sopenharmony_ci		src += bytes;
898c2ecf20Sopenharmony_ci		len -= bytes;
908c2ecf20Sopenharmony_ci		dctx->buflen += bytes;
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ci		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
938c2ecf20Sopenharmony_ci			neon_poly1305_blocks(dctx, dctx->buf,
948c2ecf20Sopenharmony_ci					     POLY1305_BLOCK_SIZE, 1, false);
958c2ecf20Sopenharmony_ci			dctx->buflen = 0;
968c2ecf20Sopenharmony_ci		}
978c2ecf20Sopenharmony_ci	}
988c2ecf20Sopenharmony_ci
998c2ecf20Sopenharmony_ci	if (likely(len >= POLY1305_BLOCK_SIZE)) {
1008c2ecf20Sopenharmony_ci		neon_poly1305_blocks(dctx, src, len, 1, do_neon);
1018c2ecf20Sopenharmony_ci		src += round_down(len, POLY1305_BLOCK_SIZE);
1028c2ecf20Sopenharmony_ci		len %= POLY1305_BLOCK_SIZE;
1038c2ecf20Sopenharmony_ci	}
1048c2ecf20Sopenharmony_ci
1058c2ecf20Sopenharmony_ci	if (unlikely(len)) {
1068c2ecf20Sopenharmony_ci		dctx->buflen = len;
1078c2ecf20Sopenharmony_ci		memcpy(dctx->buf, src, len);
1088c2ecf20Sopenharmony_ci	}
1098c2ecf20Sopenharmony_ci}
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_cistatic int neon_poly1305_update(struct shash_desc *desc,
1128c2ecf20Sopenharmony_ci				const u8 *src, unsigned int srclen)
1138c2ecf20Sopenharmony_ci{
1148c2ecf20Sopenharmony_ci	bool do_neon = crypto_simd_usable() && srclen > 128;
1158c2ecf20Sopenharmony_ci	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
1168c2ecf20Sopenharmony_ci
1178c2ecf20Sopenharmony_ci	if (static_branch_likely(&have_neon) && do_neon)
1188c2ecf20Sopenharmony_ci		kernel_neon_begin();
1198c2ecf20Sopenharmony_ci	neon_poly1305_do_update(dctx, src, srclen, do_neon);
1208c2ecf20Sopenharmony_ci	if (static_branch_likely(&have_neon) && do_neon)
1218c2ecf20Sopenharmony_ci		kernel_neon_end();
1228c2ecf20Sopenharmony_ci	return 0;
1238c2ecf20Sopenharmony_ci}
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_civoid poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
1268c2ecf20Sopenharmony_ci			  unsigned int nbytes)
1278c2ecf20Sopenharmony_ci{
1288c2ecf20Sopenharmony_ci	if (unlikely(dctx->buflen)) {
1298c2ecf20Sopenharmony_ci		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
1308c2ecf20Sopenharmony_ci
1318c2ecf20Sopenharmony_ci		memcpy(dctx->buf + dctx->buflen, src, bytes);
1328c2ecf20Sopenharmony_ci		src += bytes;
1338c2ecf20Sopenharmony_ci		nbytes -= bytes;
1348c2ecf20Sopenharmony_ci		dctx->buflen += bytes;
1358c2ecf20Sopenharmony_ci
1368c2ecf20Sopenharmony_ci		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
1378c2ecf20Sopenharmony_ci			poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
1388c2ecf20Sopenharmony_ci			dctx->buflen = 0;
1398c2ecf20Sopenharmony_ci		}
1408c2ecf20Sopenharmony_ci	}
1418c2ecf20Sopenharmony_ci
1428c2ecf20Sopenharmony_ci	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
1438c2ecf20Sopenharmony_ci		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
1448c2ecf20Sopenharmony_ci
1458c2ecf20Sopenharmony_ci		if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
1468c2ecf20Sopenharmony_ci			do {
1478c2ecf20Sopenharmony_ci				unsigned int todo = min_t(unsigned int, len, SZ_4K);
1488c2ecf20Sopenharmony_ci
1498c2ecf20Sopenharmony_ci				kernel_neon_begin();
1508c2ecf20Sopenharmony_ci				poly1305_blocks_neon(&dctx->h, src, todo, 1);
1518c2ecf20Sopenharmony_ci				kernel_neon_end();
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci				len -= todo;
1548c2ecf20Sopenharmony_ci				src += todo;
1558c2ecf20Sopenharmony_ci			} while (len);
1568c2ecf20Sopenharmony_ci		} else {
1578c2ecf20Sopenharmony_ci			poly1305_blocks(&dctx->h, src, len, 1);
1588c2ecf20Sopenharmony_ci			src += len;
1598c2ecf20Sopenharmony_ci		}
1608c2ecf20Sopenharmony_ci		nbytes %= POLY1305_BLOCK_SIZE;
1618c2ecf20Sopenharmony_ci	}
1628c2ecf20Sopenharmony_ci
1638c2ecf20Sopenharmony_ci	if (unlikely(nbytes)) {
1648c2ecf20Sopenharmony_ci		dctx->buflen = nbytes;
1658c2ecf20Sopenharmony_ci		memcpy(dctx->buf, src, nbytes);
1668c2ecf20Sopenharmony_ci	}
1678c2ecf20Sopenharmony_ci}
1688c2ecf20Sopenharmony_ciEXPORT_SYMBOL(poly1305_update_arch);
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_civoid poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
1718c2ecf20Sopenharmony_ci{
1728c2ecf20Sopenharmony_ci	if (unlikely(dctx->buflen)) {
1738c2ecf20Sopenharmony_ci		dctx->buf[dctx->buflen++] = 1;
1748c2ecf20Sopenharmony_ci		memset(dctx->buf + dctx->buflen, 0,
1758c2ecf20Sopenharmony_ci		       POLY1305_BLOCK_SIZE - dctx->buflen);
1768c2ecf20Sopenharmony_ci		poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
1778c2ecf20Sopenharmony_ci	}
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci	poly1305_emit(&dctx->h, dst, dctx->s);
1808c2ecf20Sopenharmony_ci	*dctx = (struct poly1305_desc_ctx){};
1818c2ecf20Sopenharmony_ci}
1828c2ecf20Sopenharmony_ciEXPORT_SYMBOL(poly1305_final_arch);
1838c2ecf20Sopenharmony_ci
1848c2ecf20Sopenharmony_cistatic int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
1858c2ecf20Sopenharmony_ci{
1868c2ecf20Sopenharmony_ci	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
1878c2ecf20Sopenharmony_ci
1888c2ecf20Sopenharmony_ci	if (unlikely(!dctx->sset))
1898c2ecf20Sopenharmony_ci		return -ENOKEY;
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci	poly1305_final_arch(dctx, dst);
1928c2ecf20Sopenharmony_ci	return 0;
1938c2ecf20Sopenharmony_ci}
1948c2ecf20Sopenharmony_ci
1958c2ecf20Sopenharmony_cistatic struct shash_alg neon_poly1305_alg = {
1968c2ecf20Sopenharmony_ci	.init			= neon_poly1305_init,
1978c2ecf20Sopenharmony_ci	.update			= neon_poly1305_update,
1988c2ecf20Sopenharmony_ci	.final			= neon_poly1305_final,
1998c2ecf20Sopenharmony_ci	.digestsize		= POLY1305_DIGEST_SIZE,
2008c2ecf20Sopenharmony_ci	.descsize		= sizeof(struct poly1305_desc_ctx),
2018c2ecf20Sopenharmony_ci
2028c2ecf20Sopenharmony_ci	.base.cra_name		= "poly1305",
2038c2ecf20Sopenharmony_ci	.base.cra_driver_name	= "poly1305-neon",
2048c2ecf20Sopenharmony_ci	.base.cra_priority	= 200,
2058c2ecf20Sopenharmony_ci	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
2068c2ecf20Sopenharmony_ci	.base.cra_module	= THIS_MODULE,
2078c2ecf20Sopenharmony_ci};
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_cistatic int __init neon_poly1305_mod_init(void)
2108c2ecf20Sopenharmony_ci{
2118c2ecf20Sopenharmony_ci	if (!cpu_have_named_feature(ASIMD))
2128c2ecf20Sopenharmony_ci		return 0;
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci	static_branch_enable(&have_neon);
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
2178c2ecf20Sopenharmony_ci		crypto_register_shash(&neon_poly1305_alg) : 0;
2188c2ecf20Sopenharmony_ci}
2198c2ecf20Sopenharmony_ci
2208c2ecf20Sopenharmony_cistatic void __exit neon_poly1305_mod_exit(void)
2218c2ecf20Sopenharmony_ci{
2228c2ecf20Sopenharmony_ci	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD))
2238c2ecf20Sopenharmony_ci		crypto_unregister_shash(&neon_poly1305_alg);
2248c2ecf20Sopenharmony_ci}
2258c2ecf20Sopenharmony_ci
2268c2ecf20Sopenharmony_cimodule_init(neon_poly1305_mod_init);
2278c2ecf20Sopenharmony_cimodule_exit(neon_poly1305_mod_exit);
2288c2ecf20Sopenharmony_ci
2298c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL v2");
2308c2ecf20Sopenharmony_ciMODULE_ALIAS_CRYPTO("poly1305");
2318c2ecf20Sopenharmony_ciMODULE_ALIAS_CRYPTO("poly1305-neon");
232