18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 OR MIT 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 48c2ecf20Sopenharmony_ci */ 58c2ecf20Sopenharmony_ci 68c2ecf20Sopenharmony_ci#include <crypto/algapi.h> 78c2ecf20Sopenharmony_ci#include <crypto/internal/hash.h> 88c2ecf20Sopenharmony_ci#include <crypto/internal/poly1305.h> 98c2ecf20Sopenharmony_ci#include <crypto/internal/simd.h> 108c2ecf20Sopenharmony_ci#include <linux/crypto.h> 118c2ecf20Sopenharmony_ci#include <linux/jump_label.h> 128c2ecf20Sopenharmony_ci#include <linux/kernel.h> 138c2ecf20Sopenharmony_ci#include <linux/module.h> 148c2ecf20Sopenharmony_ci#include <linux/sizes.h> 158c2ecf20Sopenharmony_ci#include <asm/intel-family.h> 168c2ecf20Sopenharmony_ci#include <asm/simd.h> 178c2ecf20Sopenharmony_ci 188c2ecf20Sopenharmony_ciasmlinkage void poly1305_init_x86_64(void *ctx, 198c2ecf20Sopenharmony_ci const u8 key[POLY1305_BLOCK_SIZE]); 208c2ecf20Sopenharmony_ciasmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, 218c2ecf20Sopenharmony_ci const size_t len, const u32 padbit); 228c2ecf20Sopenharmony_ciasmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], 238c2ecf20Sopenharmony_ci const u32 nonce[4]); 248c2ecf20Sopenharmony_ciasmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], 258c2ecf20Sopenharmony_ci const u32 nonce[4]); 268c2ecf20Sopenharmony_ciasmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, 278c2ecf20Sopenharmony_ci const u32 padbit); 288c2ecf20Sopenharmony_ciasmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, 298c2ecf20Sopenharmony_ci const u32 padbit); 308c2ecf20Sopenharmony_ciasmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, 318c2ecf20Sopenharmony_ci const size_t len, const u32 padbit); 328c2ecf20Sopenharmony_ci 338c2ecf20Sopenharmony_cistatic __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); 348c2ecf20Sopenharmony_cistatic __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); 358c2ecf20Sopenharmony_cistatic __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_cistruct poly1305_arch_internal { 388c2ecf20Sopenharmony_ci union { 398c2ecf20Sopenharmony_ci struct { 408c2ecf20Sopenharmony_ci u32 h[5]; 418c2ecf20Sopenharmony_ci u32 is_base2_26; 428c2ecf20Sopenharmony_ci }; 438c2ecf20Sopenharmony_ci u64 hs[3]; 448c2ecf20Sopenharmony_ci }; 458c2ecf20Sopenharmony_ci u64 r[2]; 468c2ecf20Sopenharmony_ci u64 pad; 478c2ecf20Sopenharmony_ci struct { u32 r2, r1, r4, r3; } rn[9]; 488c2ecf20Sopenharmony_ci}; 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit 518c2ecf20Sopenharmony_ci * the unfortunate situation of using AVX and then having to go back to scalar 528c2ecf20Sopenharmony_ci * -- because the user is silly and has called the update function from two 538c2ecf20Sopenharmony_ci * separate contexts -- then we need to convert back to the original base before 548c2ecf20Sopenharmony_ci * proceeding. It is possible to reason that the initial reduction below is 558c2ecf20Sopenharmony_ci * sufficient given the implementation invariants. However, for an avoidance of 568c2ecf20Sopenharmony_ci * doubt and because this is not performance critical, we do the full reduction 578c2ecf20Sopenharmony_ci * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py 588c2ecf20Sopenharmony_ci */ 598c2ecf20Sopenharmony_cistatic void convert_to_base2_64(void *ctx) 608c2ecf20Sopenharmony_ci{ 618c2ecf20Sopenharmony_ci struct poly1305_arch_internal *state = ctx; 628c2ecf20Sopenharmony_ci u32 cy; 638c2ecf20Sopenharmony_ci 648c2ecf20Sopenharmony_ci if (!state->is_base2_26) 658c2ecf20Sopenharmony_ci return; 668c2ecf20Sopenharmony_ci 678c2ecf20Sopenharmony_ci cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; 688c2ecf20Sopenharmony_ci cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; 698c2ecf20Sopenharmony_ci cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; 708c2ecf20Sopenharmony_ci cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; 718c2ecf20Sopenharmony_ci state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; 728c2ecf20Sopenharmony_ci state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); 738c2ecf20Sopenharmony_ci state->hs[2] = state->h[4] >> 24; 748c2ecf20Sopenharmony_ci#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) 758c2ecf20Sopenharmony_ci cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); 768c2ecf20Sopenharmony_ci state->hs[2] &= 3; 778c2ecf20Sopenharmony_ci state->hs[0] += cy; 788c2ecf20Sopenharmony_ci state->hs[1] += (cy = ULT(state->hs[0], cy)); 798c2ecf20Sopenharmony_ci state->hs[2] += ULT(state->hs[1], cy); 808c2ecf20Sopenharmony_ci#undef ULT 818c2ecf20Sopenharmony_ci state->is_base2_26 = 0; 828c2ecf20Sopenharmony_ci} 838c2ecf20Sopenharmony_ci 848c2ecf20Sopenharmony_cistatic void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) 858c2ecf20Sopenharmony_ci{ 868c2ecf20Sopenharmony_ci poly1305_init_x86_64(ctx, key); 878c2ecf20Sopenharmony_ci} 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_cistatic void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, 908c2ecf20Sopenharmony_ci const u32 padbit) 918c2ecf20Sopenharmony_ci{ 928c2ecf20Sopenharmony_ci struct poly1305_arch_internal *state = ctx; 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_ci /* SIMD disables preemption, so relax after processing each page. */ 958c2ecf20Sopenharmony_ci BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || 968c2ecf20Sopenharmony_ci SZ_4K % POLY1305_BLOCK_SIZE); 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_ci if (!static_branch_likely(&poly1305_use_avx) || 998c2ecf20Sopenharmony_ci (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || 1008c2ecf20Sopenharmony_ci !crypto_simd_usable()) { 1018c2ecf20Sopenharmony_ci convert_to_base2_64(ctx); 1028c2ecf20Sopenharmony_ci poly1305_blocks_x86_64(ctx, inp, len, padbit); 1038c2ecf20Sopenharmony_ci return; 1048c2ecf20Sopenharmony_ci } 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci do { 1078c2ecf20Sopenharmony_ci const size_t bytes = min_t(size_t, len, SZ_4K); 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1108c2ecf20Sopenharmony_ci if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) 1118c2ecf20Sopenharmony_ci poly1305_blocks_avx512(ctx, inp, bytes, padbit); 1128c2ecf20Sopenharmony_ci else if (static_branch_likely(&poly1305_use_avx2)) 1138c2ecf20Sopenharmony_ci poly1305_blocks_avx2(ctx, inp, bytes, padbit); 1148c2ecf20Sopenharmony_ci else 1158c2ecf20Sopenharmony_ci poly1305_blocks_avx(ctx, inp, bytes, padbit); 1168c2ecf20Sopenharmony_ci kernel_fpu_end(); 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_ci len -= bytes; 1198c2ecf20Sopenharmony_ci inp += bytes; 1208c2ecf20Sopenharmony_ci } while (len); 1218c2ecf20Sopenharmony_ci} 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_cistatic void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], 1248c2ecf20Sopenharmony_ci const u32 nonce[4]) 1258c2ecf20Sopenharmony_ci{ 1268c2ecf20Sopenharmony_ci if (!static_branch_likely(&poly1305_use_avx)) 1278c2ecf20Sopenharmony_ci poly1305_emit_x86_64(ctx, mac, nonce); 1288c2ecf20Sopenharmony_ci else 1298c2ecf20Sopenharmony_ci poly1305_emit_avx(ctx, mac, nonce); 1308c2ecf20Sopenharmony_ci} 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_civoid poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) 1338c2ecf20Sopenharmony_ci{ 1348c2ecf20Sopenharmony_ci poly1305_simd_init(&dctx->h, key); 1358c2ecf20Sopenharmony_ci dctx->s[0] = get_unaligned_le32(&key[16]); 1368c2ecf20Sopenharmony_ci dctx->s[1] = get_unaligned_le32(&key[20]); 1378c2ecf20Sopenharmony_ci dctx->s[2] = get_unaligned_le32(&key[24]); 1388c2ecf20Sopenharmony_ci dctx->s[3] = get_unaligned_le32(&key[28]); 1398c2ecf20Sopenharmony_ci dctx->buflen = 0; 1408c2ecf20Sopenharmony_ci dctx->sset = true; 1418c2ecf20Sopenharmony_ci} 1428c2ecf20Sopenharmony_ciEXPORT_SYMBOL(poly1305_init_arch); 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_cistatic unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, 1458c2ecf20Sopenharmony_ci const u8 *inp, unsigned int len) 1468c2ecf20Sopenharmony_ci{ 1478c2ecf20Sopenharmony_ci unsigned int acc = 0; 1488c2ecf20Sopenharmony_ci if (unlikely(!dctx->sset)) { 1498c2ecf20Sopenharmony_ci if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { 1508c2ecf20Sopenharmony_ci poly1305_simd_init(&dctx->h, inp); 1518c2ecf20Sopenharmony_ci inp += POLY1305_BLOCK_SIZE; 1528c2ecf20Sopenharmony_ci len -= POLY1305_BLOCK_SIZE; 1538c2ecf20Sopenharmony_ci acc += POLY1305_BLOCK_SIZE; 1548c2ecf20Sopenharmony_ci dctx->rset = 1; 1558c2ecf20Sopenharmony_ci } 1568c2ecf20Sopenharmony_ci if (len >= POLY1305_BLOCK_SIZE) { 1578c2ecf20Sopenharmony_ci dctx->s[0] = get_unaligned_le32(&inp[0]); 1588c2ecf20Sopenharmony_ci dctx->s[1] = get_unaligned_le32(&inp[4]); 1598c2ecf20Sopenharmony_ci dctx->s[2] = get_unaligned_le32(&inp[8]); 1608c2ecf20Sopenharmony_ci dctx->s[3] = get_unaligned_le32(&inp[12]); 1618c2ecf20Sopenharmony_ci acc += POLY1305_BLOCK_SIZE; 1628c2ecf20Sopenharmony_ci dctx->sset = true; 1638c2ecf20Sopenharmony_ci } 1648c2ecf20Sopenharmony_ci } 1658c2ecf20Sopenharmony_ci return acc; 1668c2ecf20Sopenharmony_ci} 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_civoid poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, 1698c2ecf20Sopenharmony_ci unsigned int srclen) 1708c2ecf20Sopenharmony_ci{ 1718c2ecf20Sopenharmony_ci unsigned int bytes, used; 1728c2ecf20Sopenharmony_ci 1738c2ecf20Sopenharmony_ci if (unlikely(dctx->buflen)) { 1748c2ecf20Sopenharmony_ci bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); 1758c2ecf20Sopenharmony_ci memcpy(dctx->buf + dctx->buflen, src, bytes); 1768c2ecf20Sopenharmony_ci src += bytes; 1778c2ecf20Sopenharmony_ci srclen -= bytes; 1788c2ecf20Sopenharmony_ci dctx->buflen += bytes; 1798c2ecf20Sopenharmony_ci 1808c2ecf20Sopenharmony_ci if (dctx->buflen == POLY1305_BLOCK_SIZE) { 1818c2ecf20Sopenharmony_ci if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) 1828c2ecf20Sopenharmony_ci poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); 1838c2ecf20Sopenharmony_ci dctx->buflen = 0; 1848c2ecf20Sopenharmony_ci } 1858c2ecf20Sopenharmony_ci } 1868c2ecf20Sopenharmony_ci 1878c2ecf20Sopenharmony_ci if (likely(srclen >= POLY1305_BLOCK_SIZE)) { 1888c2ecf20Sopenharmony_ci bytes = round_down(srclen, POLY1305_BLOCK_SIZE); 1898c2ecf20Sopenharmony_ci srclen -= bytes; 1908c2ecf20Sopenharmony_ci used = crypto_poly1305_setdctxkey(dctx, src, bytes); 1918c2ecf20Sopenharmony_ci if (likely(bytes - used)) 1928c2ecf20Sopenharmony_ci poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); 1938c2ecf20Sopenharmony_ci src += bytes; 1948c2ecf20Sopenharmony_ci } 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci if (unlikely(srclen)) { 1978c2ecf20Sopenharmony_ci dctx->buflen = srclen; 1988c2ecf20Sopenharmony_ci memcpy(dctx->buf, src, srclen); 1998c2ecf20Sopenharmony_ci } 2008c2ecf20Sopenharmony_ci} 2018c2ecf20Sopenharmony_ciEXPORT_SYMBOL(poly1305_update_arch); 2028c2ecf20Sopenharmony_ci 2038c2ecf20Sopenharmony_civoid poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) 2048c2ecf20Sopenharmony_ci{ 2058c2ecf20Sopenharmony_ci if (unlikely(dctx->buflen)) { 2068c2ecf20Sopenharmony_ci dctx->buf[dctx->buflen++] = 1; 2078c2ecf20Sopenharmony_ci memset(dctx->buf + dctx->buflen, 0, 2088c2ecf20Sopenharmony_ci POLY1305_BLOCK_SIZE - dctx->buflen); 2098c2ecf20Sopenharmony_ci poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); 2108c2ecf20Sopenharmony_ci } 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci poly1305_simd_emit(&dctx->h, dst, dctx->s); 2138c2ecf20Sopenharmony_ci *dctx = (struct poly1305_desc_ctx){}; 2148c2ecf20Sopenharmony_ci} 2158c2ecf20Sopenharmony_ciEXPORT_SYMBOL(poly1305_final_arch); 2168c2ecf20Sopenharmony_ci 2178c2ecf20Sopenharmony_cistatic int crypto_poly1305_init(struct shash_desc *desc) 2188c2ecf20Sopenharmony_ci{ 2198c2ecf20Sopenharmony_ci struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 2208c2ecf20Sopenharmony_ci 2218c2ecf20Sopenharmony_ci *dctx = (struct poly1305_desc_ctx){}; 2228c2ecf20Sopenharmony_ci return 0; 2238c2ecf20Sopenharmony_ci} 2248c2ecf20Sopenharmony_ci 2258c2ecf20Sopenharmony_cistatic int crypto_poly1305_update(struct shash_desc *desc, 2268c2ecf20Sopenharmony_ci const u8 *src, unsigned int srclen) 2278c2ecf20Sopenharmony_ci{ 2288c2ecf20Sopenharmony_ci struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 2298c2ecf20Sopenharmony_ci 2308c2ecf20Sopenharmony_ci poly1305_update_arch(dctx, src, srclen); 2318c2ecf20Sopenharmony_ci return 0; 2328c2ecf20Sopenharmony_ci} 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_cistatic int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) 2358c2ecf20Sopenharmony_ci{ 2368c2ecf20Sopenharmony_ci struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci if (unlikely(!dctx->sset)) 2398c2ecf20Sopenharmony_ci return -ENOKEY; 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_ci poly1305_final_arch(dctx, dst); 2428c2ecf20Sopenharmony_ci return 0; 2438c2ecf20Sopenharmony_ci} 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_cistatic struct shash_alg alg = { 2468c2ecf20Sopenharmony_ci .digestsize = POLY1305_DIGEST_SIZE, 2478c2ecf20Sopenharmony_ci .init = crypto_poly1305_init, 2488c2ecf20Sopenharmony_ci .update = crypto_poly1305_update, 2498c2ecf20Sopenharmony_ci .final = crypto_poly1305_final, 2508c2ecf20Sopenharmony_ci .descsize = sizeof(struct poly1305_desc_ctx), 2518c2ecf20Sopenharmony_ci .base = { 2528c2ecf20Sopenharmony_ci .cra_name = "poly1305", 2538c2ecf20Sopenharmony_ci .cra_driver_name = "poly1305-simd", 2548c2ecf20Sopenharmony_ci .cra_priority = 300, 2558c2ecf20Sopenharmony_ci .cra_blocksize = POLY1305_BLOCK_SIZE, 2568c2ecf20Sopenharmony_ci .cra_module = THIS_MODULE, 2578c2ecf20Sopenharmony_ci }, 2588c2ecf20Sopenharmony_ci}; 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_cistatic int __init poly1305_simd_mod_init(void) 2618c2ecf20Sopenharmony_ci{ 2628c2ecf20Sopenharmony_ci if (boot_cpu_has(X86_FEATURE_AVX) && 2638c2ecf20Sopenharmony_ci cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) 2648c2ecf20Sopenharmony_ci static_branch_enable(&poly1305_use_avx); 2658c2ecf20Sopenharmony_ci if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && 2668c2ecf20Sopenharmony_ci cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) 2678c2ecf20Sopenharmony_ci static_branch_enable(&poly1305_use_avx2); 2688c2ecf20Sopenharmony_ci if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && 2698c2ecf20Sopenharmony_ci boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && 2708c2ecf20Sopenharmony_ci cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && 2718c2ecf20Sopenharmony_ci /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ 2728c2ecf20Sopenharmony_ci boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) 2738c2ecf20Sopenharmony_ci static_branch_enable(&poly1305_use_avx512); 2748c2ecf20Sopenharmony_ci return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; 2758c2ecf20Sopenharmony_ci} 2768c2ecf20Sopenharmony_ci 2778c2ecf20Sopenharmony_cistatic void __exit poly1305_simd_mod_exit(void) 2788c2ecf20Sopenharmony_ci{ 2798c2ecf20Sopenharmony_ci if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) 2808c2ecf20Sopenharmony_ci crypto_unregister_shash(&alg); 2818c2ecf20Sopenharmony_ci} 2828c2ecf20Sopenharmony_ci 2838c2ecf20Sopenharmony_cimodule_init(poly1305_simd_mod_init); 2848c2ecf20Sopenharmony_cimodule_exit(poly1305_simd_mod_exit); 2858c2ecf20Sopenharmony_ci 2868c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL"); 2878c2ecf20Sopenharmony_ciMODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); 2888c2ecf20Sopenharmony_ciMODULE_DESCRIPTION("Poly1305 authenticator"); 2898c2ecf20Sopenharmony_ciMODULE_ALIAS_CRYPTO("poly1305"); 2908c2ecf20Sopenharmony_ciMODULE_ALIAS_CRYPTO("poly1305-simd"); 291