162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 OR MIT 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci 662306a36Sopenharmony_ci#include <crypto/algapi.h> 762306a36Sopenharmony_ci#include <crypto/internal/hash.h> 862306a36Sopenharmony_ci#include <crypto/internal/poly1305.h> 962306a36Sopenharmony_ci#include <crypto/internal/simd.h> 1062306a36Sopenharmony_ci#include <linux/crypto.h> 1162306a36Sopenharmony_ci#include <linux/jump_label.h> 1262306a36Sopenharmony_ci#include <linux/kernel.h> 1362306a36Sopenharmony_ci#include <linux/module.h> 1462306a36Sopenharmony_ci#include <linux/sizes.h> 1562306a36Sopenharmony_ci#include <asm/intel-family.h> 1662306a36Sopenharmony_ci#include <asm/simd.h> 1762306a36Sopenharmony_ci 1862306a36Sopenharmony_ciasmlinkage void poly1305_init_x86_64(void *ctx, 1962306a36Sopenharmony_ci const u8 key[POLY1305_BLOCK_SIZE]); 2062306a36Sopenharmony_ciasmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, 2162306a36Sopenharmony_ci const size_t len, const u32 padbit); 2262306a36Sopenharmony_ciasmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], 2362306a36Sopenharmony_ci const u32 nonce[4]); 2462306a36Sopenharmony_ciasmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], 2562306a36Sopenharmony_ci const u32 nonce[4]); 2662306a36Sopenharmony_ciasmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, 2762306a36Sopenharmony_ci const u32 padbit); 2862306a36Sopenharmony_ciasmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, 2962306a36Sopenharmony_ci const u32 padbit); 3062306a36Sopenharmony_ciasmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, 3162306a36Sopenharmony_ci const size_t len, const u32 padbit); 3262306a36Sopenharmony_ci 3362306a36Sopenharmony_cistatic __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); 3462306a36Sopenharmony_cistatic __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); 3562306a36Sopenharmony_cistatic __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_cistruct poly1305_arch_internal { 3862306a36Sopenharmony_ci union { 3962306a36Sopenharmony_ci struct { 4062306a36Sopenharmony_ci u32 h[5]; 4162306a36Sopenharmony_ci u32 is_base2_26; 4262306a36Sopenharmony_ci }; 4362306a36Sopenharmony_ci u64 hs[3]; 4462306a36Sopenharmony_ci }; 4562306a36Sopenharmony_ci u64 r[2]; 4662306a36Sopenharmony_ci u64 pad; 4762306a36Sopenharmony_ci struct { u32 r2, r1, r4, r3; } rn[9]; 4862306a36Sopenharmony_ci}; 4962306a36Sopenharmony_ci 5062306a36Sopenharmony_ci/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit 5162306a36Sopenharmony_ci * the unfortunate situation of using AVX and then having to go back to scalar 5262306a36Sopenharmony_ci * -- because the user is silly and has called the update function from two 5362306a36Sopenharmony_ci * separate contexts -- then we need to convert back to the original base before 5462306a36Sopenharmony_ci * proceeding. It is possible to reason that the initial reduction below is 5562306a36Sopenharmony_ci * sufficient given the implementation invariants. However, for an avoidance of 5662306a36Sopenharmony_ci * doubt and because this is not performance critical, we do the full reduction 5762306a36Sopenharmony_ci * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py 5862306a36Sopenharmony_ci */ 5962306a36Sopenharmony_cistatic void convert_to_base2_64(void *ctx) 6062306a36Sopenharmony_ci{ 6162306a36Sopenharmony_ci struct poly1305_arch_internal *state = ctx; 6262306a36Sopenharmony_ci u32 cy; 6362306a36Sopenharmony_ci 6462306a36Sopenharmony_ci if (!state->is_base2_26) 6562306a36Sopenharmony_ci return; 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_ci cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; 6862306a36Sopenharmony_ci cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; 6962306a36Sopenharmony_ci cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; 7062306a36Sopenharmony_ci cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; 7162306a36Sopenharmony_ci state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; 7262306a36Sopenharmony_ci state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); 7362306a36Sopenharmony_ci state->hs[2] = state->h[4] >> 24; 7462306a36Sopenharmony_ci#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) 7562306a36Sopenharmony_ci cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); 7662306a36Sopenharmony_ci state->hs[2] &= 3; 7762306a36Sopenharmony_ci state->hs[0] += cy; 7862306a36Sopenharmony_ci state->hs[1] += (cy = ULT(state->hs[0], cy)); 7962306a36Sopenharmony_ci state->hs[2] += ULT(state->hs[1], cy); 8062306a36Sopenharmony_ci#undef ULT 8162306a36Sopenharmony_ci state->is_base2_26 = 0; 8262306a36Sopenharmony_ci} 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_cistatic void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) 8562306a36Sopenharmony_ci{ 8662306a36Sopenharmony_ci poly1305_init_x86_64(ctx, key); 8762306a36Sopenharmony_ci} 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_cistatic void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, 9062306a36Sopenharmony_ci const u32 padbit) 9162306a36Sopenharmony_ci{ 9262306a36Sopenharmony_ci struct poly1305_arch_internal *state = ctx; 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci /* SIMD disables preemption, so relax after processing each page. */ 9562306a36Sopenharmony_ci BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || 9662306a36Sopenharmony_ci SZ_4K % POLY1305_BLOCK_SIZE); 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci if (!static_branch_likely(&poly1305_use_avx) || 9962306a36Sopenharmony_ci (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || 10062306a36Sopenharmony_ci !crypto_simd_usable()) { 10162306a36Sopenharmony_ci convert_to_base2_64(ctx); 10262306a36Sopenharmony_ci poly1305_blocks_x86_64(ctx, inp, len, padbit); 10362306a36Sopenharmony_ci return; 10462306a36Sopenharmony_ci } 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci do { 10762306a36Sopenharmony_ci const size_t bytes = min_t(size_t, len, SZ_4K); 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci kernel_fpu_begin(); 11062306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) 11162306a36Sopenharmony_ci poly1305_blocks_avx512(ctx, inp, bytes, padbit); 11262306a36Sopenharmony_ci else if (static_branch_likely(&poly1305_use_avx2)) 11362306a36Sopenharmony_ci poly1305_blocks_avx2(ctx, inp, bytes, padbit); 11462306a36Sopenharmony_ci else 11562306a36Sopenharmony_ci poly1305_blocks_avx(ctx, inp, bytes, padbit); 11662306a36Sopenharmony_ci kernel_fpu_end(); 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci len -= bytes; 11962306a36Sopenharmony_ci inp += bytes; 12062306a36Sopenharmony_ci } while (len); 12162306a36Sopenharmony_ci} 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_cistatic void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], 12462306a36Sopenharmony_ci const u32 nonce[4]) 12562306a36Sopenharmony_ci{ 12662306a36Sopenharmony_ci if (!static_branch_likely(&poly1305_use_avx)) 12762306a36Sopenharmony_ci poly1305_emit_x86_64(ctx, mac, nonce); 12862306a36Sopenharmony_ci else 12962306a36Sopenharmony_ci poly1305_emit_avx(ctx, mac, nonce); 13062306a36Sopenharmony_ci} 13162306a36Sopenharmony_ci 13262306a36Sopenharmony_civoid poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) 13362306a36Sopenharmony_ci{ 13462306a36Sopenharmony_ci poly1305_simd_init(&dctx->h, key); 13562306a36Sopenharmony_ci dctx->s[0] = get_unaligned_le32(&key[16]); 13662306a36Sopenharmony_ci dctx->s[1] = get_unaligned_le32(&key[20]); 13762306a36Sopenharmony_ci dctx->s[2] = get_unaligned_le32(&key[24]); 13862306a36Sopenharmony_ci dctx->s[3] = get_unaligned_le32(&key[28]); 13962306a36Sopenharmony_ci dctx->buflen = 0; 14062306a36Sopenharmony_ci dctx->sset = true; 14162306a36Sopenharmony_ci} 14262306a36Sopenharmony_ciEXPORT_SYMBOL(poly1305_init_arch); 14362306a36Sopenharmony_ci 14462306a36Sopenharmony_cistatic unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, 14562306a36Sopenharmony_ci const u8 *inp, unsigned int len) 14662306a36Sopenharmony_ci{ 14762306a36Sopenharmony_ci unsigned int acc = 0; 14862306a36Sopenharmony_ci if (unlikely(!dctx->sset)) { 14962306a36Sopenharmony_ci if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { 15062306a36Sopenharmony_ci poly1305_simd_init(&dctx->h, inp); 15162306a36Sopenharmony_ci inp += POLY1305_BLOCK_SIZE; 15262306a36Sopenharmony_ci len -= POLY1305_BLOCK_SIZE; 15362306a36Sopenharmony_ci acc += POLY1305_BLOCK_SIZE; 15462306a36Sopenharmony_ci dctx->rset = 1; 15562306a36Sopenharmony_ci } 15662306a36Sopenharmony_ci if (len >= POLY1305_BLOCK_SIZE) { 15762306a36Sopenharmony_ci dctx->s[0] = get_unaligned_le32(&inp[0]); 15862306a36Sopenharmony_ci dctx->s[1] = get_unaligned_le32(&inp[4]); 15962306a36Sopenharmony_ci dctx->s[2] = get_unaligned_le32(&inp[8]); 16062306a36Sopenharmony_ci dctx->s[3] = get_unaligned_le32(&inp[12]); 16162306a36Sopenharmony_ci acc += POLY1305_BLOCK_SIZE; 16262306a36Sopenharmony_ci dctx->sset = true; 16362306a36Sopenharmony_ci } 16462306a36Sopenharmony_ci } 16562306a36Sopenharmony_ci return acc; 16662306a36Sopenharmony_ci} 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_civoid poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, 16962306a36Sopenharmony_ci unsigned int srclen) 17062306a36Sopenharmony_ci{ 17162306a36Sopenharmony_ci unsigned int bytes, used; 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci if (unlikely(dctx->buflen)) { 17462306a36Sopenharmony_ci bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); 17562306a36Sopenharmony_ci memcpy(dctx->buf + dctx->buflen, src, bytes); 17662306a36Sopenharmony_ci src += bytes; 17762306a36Sopenharmony_ci srclen -= bytes; 17862306a36Sopenharmony_ci dctx->buflen += bytes; 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci if (dctx->buflen == POLY1305_BLOCK_SIZE) { 18162306a36Sopenharmony_ci if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) 18262306a36Sopenharmony_ci poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); 18362306a36Sopenharmony_ci dctx->buflen = 0; 18462306a36Sopenharmony_ci } 18562306a36Sopenharmony_ci } 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci if (likely(srclen >= POLY1305_BLOCK_SIZE)) { 18862306a36Sopenharmony_ci bytes = round_down(srclen, POLY1305_BLOCK_SIZE); 18962306a36Sopenharmony_ci srclen -= bytes; 19062306a36Sopenharmony_ci used = crypto_poly1305_setdctxkey(dctx, src, bytes); 19162306a36Sopenharmony_ci if (likely(bytes - used)) 19262306a36Sopenharmony_ci poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); 19362306a36Sopenharmony_ci src += bytes; 19462306a36Sopenharmony_ci } 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci if (unlikely(srclen)) { 19762306a36Sopenharmony_ci dctx->buflen = srclen; 19862306a36Sopenharmony_ci memcpy(dctx->buf, src, srclen); 19962306a36Sopenharmony_ci } 20062306a36Sopenharmony_ci} 20162306a36Sopenharmony_ciEXPORT_SYMBOL(poly1305_update_arch); 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_civoid poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) 20462306a36Sopenharmony_ci{ 20562306a36Sopenharmony_ci if (unlikely(dctx->buflen)) { 20662306a36Sopenharmony_ci dctx->buf[dctx->buflen++] = 1; 20762306a36Sopenharmony_ci memset(dctx->buf + dctx->buflen, 0, 20862306a36Sopenharmony_ci POLY1305_BLOCK_SIZE - dctx->buflen); 20962306a36Sopenharmony_ci poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); 21062306a36Sopenharmony_ci } 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci poly1305_simd_emit(&dctx->h, dst, dctx->s); 21362306a36Sopenharmony_ci memzero_explicit(dctx, sizeof(*dctx)); 21462306a36Sopenharmony_ci} 21562306a36Sopenharmony_ciEXPORT_SYMBOL(poly1305_final_arch); 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_cistatic int crypto_poly1305_init(struct shash_desc *desc) 21862306a36Sopenharmony_ci{ 21962306a36Sopenharmony_ci struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci *dctx = (struct poly1305_desc_ctx){}; 22262306a36Sopenharmony_ci return 0; 22362306a36Sopenharmony_ci} 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_cistatic int crypto_poly1305_update(struct shash_desc *desc, 22662306a36Sopenharmony_ci const u8 *src, unsigned int srclen) 22762306a36Sopenharmony_ci{ 22862306a36Sopenharmony_ci struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 22962306a36Sopenharmony_ci 23062306a36Sopenharmony_ci poly1305_update_arch(dctx, src, srclen); 23162306a36Sopenharmony_ci return 0; 23262306a36Sopenharmony_ci} 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_cistatic int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) 23562306a36Sopenharmony_ci{ 23662306a36Sopenharmony_ci struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci if (unlikely(!dctx->sset)) 23962306a36Sopenharmony_ci return -ENOKEY; 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_ci poly1305_final_arch(dctx, dst); 24262306a36Sopenharmony_ci return 0; 24362306a36Sopenharmony_ci} 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_cistatic struct shash_alg alg = { 24662306a36Sopenharmony_ci .digestsize = POLY1305_DIGEST_SIZE, 24762306a36Sopenharmony_ci .init = crypto_poly1305_init, 24862306a36Sopenharmony_ci .update = crypto_poly1305_update, 24962306a36Sopenharmony_ci .final = crypto_poly1305_final, 25062306a36Sopenharmony_ci .descsize = sizeof(struct poly1305_desc_ctx), 25162306a36Sopenharmony_ci .base = { 25262306a36Sopenharmony_ci .cra_name = "poly1305", 25362306a36Sopenharmony_ci .cra_driver_name = "poly1305-simd", 25462306a36Sopenharmony_ci .cra_priority = 300, 25562306a36Sopenharmony_ci .cra_blocksize = POLY1305_BLOCK_SIZE, 25662306a36Sopenharmony_ci .cra_module = THIS_MODULE, 25762306a36Sopenharmony_ci }, 25862306a36Sopenharmony_ci}; 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_cistatic int __init poly1305_simd_mod_init(void) 26162306a36Sopenharmony_ci{ 26262306a36Sopenharmony_ci if (boot_cpu_has(X86_FEATURE_AVX) && 26362306a36Sopenharmony_ci cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) 26462306a36Sopenharmony_ci static_branch_enable(&poly1305_use_avx); 26562306a36Sopenharmony_ci if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && 26662306a36Sopenharmony_ci cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) 26762306a36Sopenharmony_ci static_branch_enable(&poly1305_use_avx2); 26862306a36Sopenharmony_ci if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && 26962306a36Sopenharmony_ci boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && 27062306a36Sopenharmony_ci cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && 27162306a36Sopenharmony_ci /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ 27262306a36Sopenharmony_ci boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) 27362306a36Sopenharmony_ci static_branch_enable(&poly1305_use_avx512); 27462306a36Sopenharmony_ci return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; 27562306a36Sopenharmony_ci} 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_cistatic void __exit poly1305_simd_mod_exit(void) 27862306a36Sopenharmony_ci{ 27962306a36Sopenharmony_ci if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) 28062306a36Sopenharmony_ci crypto_unregister_shash(&alg); 28162306a36Sopenharmony_ci} 28262306a36Sopenharmony_ci 28362306a36Sopenharmony_cimodule_init(poly1305_simd_mod_init); 28462306a36Sopenharmony_cimodule_exit(poly1305_simd_mod_exit); 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 28762306a36Sopenharmony_ciMODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); 28862306a36Sopenharmony_ciMODULE_DESCRIPTION("Poly1305 authenticator"); 28962306a36Sopenharmony_ciMODULE_ALIAS_CRYPTO("poly1305"); 29062306a36Sopenharmony_ciMODULE_ALIAS_CRYPTO("poly1305-simd"); 291