1e1051a39Sopenharmony_ci/* 2e1051a39Sopenharmony_ci * Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved. 3e1051a39Sopenharmony_ci * 4e1051a39Sopenharmony_ci * Licensed under the Apache License 2.0 (the "License"). You may not use 5e1051a39Sopenharmony_ci * this file except in compliance with the License. You can obtain a copy 6e1051a39Sopenharmony_ci * in the file LICENSE in the source distribution or at 7e1051a39Sopenharmony_ci * https://www.openssl.org/source/license.html 8e1051a39Sopenharmony_ci */ 9e1051a39Sopenharmony_ci 10e1051a39Sopenharmony_ci#include <stdlib.h> 11e1051a39Sopenharmony_ci#include <string.h> 12e1051a39Sopenharmony_ci#include <openssl/crypto.h> 13e1051a39Sopenharmony_ci 14e1051a39Sopenharmony_ci#include "crypto/poly1305.h" 15e1051a39Sopenharmony_ci 16e1051a39Sopenharmony_cisize_t Poly1305_ctx_size(void) 17e1051a39Sopenharmony_ci{ 18e1051a39Sopenharmony_ci return sizeof(struct poly1305_context); 19e1051a39Sopenharmony_ci} 20e1051a39Sopenharmony_ci 21e1051a39Sopenharmony_ci/* pick 32-bit unsigned integer in little endian order */ 22e1051a39Sopenharmony_cistatic unsigned int U8TOU32(const unsigned char *p) 23e1051a39Sopenharmony_ci{ 24e1051a39Sopenharmony_ci return (((unsigned int)(p[0] & 0xff)) | 25e1051a39Sopenharmony_ci ((unsigned int)(p[1] & 0xff) << 8) | 26e1051a39Sopenharmony_ci ((unsigned int)(p[2] & 0xff) << 16) | 27e1051a39Sopenharmony_ci ((unsigned int)(p[3] & 0xff) << 24)); 28e1051a39Sopenharmony_ci} 29e1051a39Sopenharmony_ci 30e1051a39Sopenharmony_ci/* 31e1051a39Sopenharmony_ci * Implementations can be classified by amount of significant bits in 32e1051a39Sopenharmony_ci * words making up the multi-precision value, or in other words radix 33e1051a39Sopenharmony_ci * or base of numerical representation, e.g. base 2^64, base 2^32, 34e1051a39Sopenharmony_ci * base 2^26. Complementary characteristic is how wide is the result of 35e1051a39Sopenharmony_ci * multiplication of pair of digits, e.g. it would take 128 bits to 36e1051a39Sopenharmony_ci * accommodate multiplication result in base 2^64 case. These are used 37e1051a39Sopenharmony_ci * interchangeably. To describe implementation that is. But interface 38e1051a39Sopenharmony_ci * is designed to isolate this so that low-level primitives implemented 39e1051a39Sopenharmony_ci * in assembly can be self-contained/self-coherent. 40e1051a39Sopenharmony_ci */ 41e1051a39Sopenharmony_ci#ifndef POLY1305_ASM 42e1051a39Sopenharmony_ci/* 43e1051a39Sopenharmony_ci * Even though there is __int128 reference implementation targeting 44e1051a39Sopenharmony_ci * 64-bit platforms provided below, it's not obvious that it's optimal 45e1051a39Sopenharmony_ci * choice for every one of them. Depending on instruction set overall 46e1051a39Sopenharmony_ci * amount of instructions can be comparable to one in __int64 47e1051a39Sopenharmony_ci * implementation. Amount of multiplication instructions would be lower, 48e1051a39Sopenharmony_ci * but not necessarily overall. And in out-of-order execution context, 49e1051a39Sopenharmony_ci * it is the latter that can be crucial... 50e1051a39Sopenharmony_ci * 51e1051a39Sopenharmony_ci * On related note. Poly1305 author, D. J. Bernstein, discusses and 52e1051a39Sopenharmony_ci * provides floating-point implementations of the algorithm in question. 53e1051a39Sopenharmony_ci * It made a lot of sense by the time of introduction, because most 54e1051a39Sopenharmony_ci * then-modern processors didn't have pipelined integer multiplier. 55e1051a39Sopenharmony_ci * [Not to mention that some had non-constant timing for integer 56e1051a39Sopenharmony_ci * multiplications.] Floating-point instructions on the other hand could 57e1051a39Sopenharmony_ci * be issued every cycle, which allowed to achieve better performance. 58e1051a39Sopenharmony_ci * Nowadays, with SIMD and/or out-or-order execution, shared or 59e1051a39Sopenharmony_ci * even emulated FPU, it's more complicated, and floating-point 60e1051a39Sopenharmony_ci * implementation is not necessarily optimal choice in every situation, 61e1051a39Sopenharmony_ci * rather contrary... 62e1051a39Sopenharmony_ci * 63e1051a39Sopenharmony_ci * <appro@openssl.org> 64e1051a39Sopenharmony_ci */ 65e1051a39Sopenharmony_ci 66e1051a39Sopenharmony_citypedef unsigned int u32; 67e1051a39Sopenharmony_ci 68e1051a39Sopenharmony_ci/* 69e1051a39Sopenharmony_ci * poly1305_blocks processes a multiple of POLY1305_BLOCK_SIZE blocks 70e1051a39Sopenharmony_ci * of |inp| no longer than |len|. Behaviour for |len| not divisible by 71e1051a39Sopenharmony_ci * block size is unspecified in general case, even though in reference 72e1051a39Sopenharmony_ci * implementation the trailing chunk is simply ignored. Per algorithm 73e1051a39Sopenharmony_ci * specification, every input block, complete or last partial, is to be 74e1051a39Sopenharmony_ci * padded with a bit past most significant byte. The latter kind is then 75e1051a39Sopenharmony_ci * padded with zeros till block size. This last partial block padding 76e1051a39Sopenharmony_ci * is caller(*)'s responsibility, and because of this the last partial 77e1051a39Sopenharmony_ci * block is always processed with separate call with |len| set to 78e1051a39Sopenharmony_ci * POLY1305_BLOCK_SIZE and |padbit| to 0. In all other cases |padbit| 79e1051a39Sopenharmony_ci * should be set to 1 to perform implicit padding with 128th bit. 80e1051a39Sopenharmony_ci * poly1305_blocks does not actually check for this constraint though, 81e1051a39Sopenharmony_ci * it's caller(*)'s responsibility to comply. 82e1051a39Sopenharmony_ci * 83e1051a39Sopenharmony_ci * (*) In the context "caller" is not application code, but higher 84e1051a39Sopenharmony_ci * level Poly1305_* from this very module, so that quirks are 85e1051a39Sopenharmony_ci * handled locally. 86e1051a39Sopenharmony_ci */ 87e1051a39Sopenharmony_cistatic void 88e1051a39Sopenharmony_cipoly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit); 89e1051a39Sopenharmony_ci 90e1051a39Sopenharmony_ci/* 91e1051a39Sopenharmony_ci * Type-agnostic "rip-off" from constant_time.h 92e1051a39Sopenharmony_ci */ 93e1051a39Sopenharmony_ci# define CONSTANT_TIME_CARRY(a,b) ( \ 94e1051a39Sopenharmony_ci (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \ 95e1051a39Sopenharmony_ci ) 96e1051a39Sopenharmony_ci 97e1051a39Sopenharmony_ci# if defined(INT64_MAX) && defined(INT128_MAX) 98e1051a39Sopenharmony_ci 99e1051a39Sopenharmony_citypedef unsigned long u64; 100e1051a39Sopenharmony_citypedef uint128_t u128; 101e1051a39Sopenharmony_ci 102e1051a39Sopenharmony_citypedef struct { 103e1051a39Sopenharmony_ci u64 h[3]; 104e1051a39Sopenharmony_ci u64 r[2]; 105e1051a39Sopenharmony_ci} poly1305_internal; 106e1051a39Sopenharmony_ci 107e1051a39Sopenharmony_ci/* pick 32-bit unsigned integer in little endian order */ 108e1051a39Sopenharmony_cistatic u64 U8TOU64(const unsigned char *p) 109e1051a39Sopenharmony_ci{ 110e1051a39Sopenharmony_ci return (((u64)(p[0] & 0xff)) | 111e1051a39Sopenharmony_ci ((u64)(p[1] & 0xff) << 8) | 112e1051a39Sopenharmony_ci ((u64)(p[2] & 0xff) << 16) | 113e1051a39Sopenharmony_ci ((u64)(p[3] & 0xff) << 24) | 114e1051a39Sopenharmony_ci ((u64)(p[4] & 0xff) << 32) | 115e1051a39Sopenharmony_ci ((u64)(p[5] & 0xff) << 40) | 116e1051a39Sopenharmony_ci ((u64)(p[6] & 0xff) << 48) | 117e1051a39Sopenharmony_ci ((u64)(p[7] & 0xff) << 56)); 118e1051a39Sopenharmony_ci} 119e1051a39Sopenharmony_ci 120e1051a39Sopenharmony_ci/* store a 32-bit unsigned integer in little endian */ 121e1051a39Sopenharmony_cistatic void U64TO8(unsigned char *p, u64 v) 122e1051a39Sopenharmony_ci{ 123e1051a39Sopenharmony_ci p[0] = (unsigned char)((v) & 0xff); 124e1051a39Sopenharmony_ci p[1] = (unsigned char)((v >> 8) & 0xff); 125e1051a39Sopenharmony_ci p[2] = (unsigned char)((v >> 16) & 0xff); 126e1051a39Sopenharmony_ci p[3] = (unsigned char)((v >> 24) & 0xff); 127e1051a39Sopenharmony_ci p[4] = (unsigned char)((v >> 32) & 0xff); 128e1051a39Sopenharmony_ci p[5] = (unsigned char)((v >> 40) & 0xff); 129e1051a39Sopenharmony_ci p[6] = (unsigned char)((v >> 48) & 0xff); 130e1051a39Sopenharmony_ci p[7] = (unsigned char)((v >> 56) & 0xff); 131e1051a39Sopenharmony_ci} 132e1051a39Sopenharmony_ci 133e1051a39Sopenharmony_cistatic void poly1305_init(void *ctx, const unsigned char key[16]) 134e1051a39Sopenharmony_ci{ 135e1051a39Sopenharmony_ci poly1305_internal *st = (poly1305_internal *) ctx; 136e1051a39Sopenharmony_ci 137e1051a39Sopenharmony_ci /* h = 0 */ 138e1051a39Sopenharmony_ci st->h[0] = 0; 139e1051a39Sopenharmony_ci st->h[1] = 0; 140e1051a39Sopenharmony_ci st->h[2] = 0; 141e1051a39Sopenharmony_ci 142e1051a39Sopenharmony_ci /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 143e1051a39Sopenharmony_ci st->r[0] = U8TOU64(&key[0]) & 0x0ffffffc0fffffff; 144e1051a39Sopenharmony_ci st->r[1] = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc; 145e1051a39Sopenharmony_ci} 146e1051a39Sopenharmony_ci 147e1051a39Sopenharmony_cistatic void 148e1051a39Sopenharmony_cipoly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) 149e1051a39Sopenharmony_ci{ 150e1051a39Sopenharmony_ci poly1305_internal *st = (poly1305_internal *)ctx; 151e1051a39Sopenharmony_ci u64 r0, r1; 152e1051a39Sopenharmony_ci u64 s1; 153e1051a39Sopenharmony_ci u64 h0, h1, h2, c; 154e1051a39Sopenharmony_ci u128 d0, d1; 155e1051a39Sopenharmony_ci 156e1051a39Sopenharmony_ci r0 = st->r[0]; 157e1051a39Sopenharmony_ci r1 = st->r[1]; 158e1051a39Sopenharmony_ci 159e1051a39Sopenharmony_ci s1 = r1 + (r1 >> 2); 160e1051a39Sopenharmony_ci 161e1051a39Sopenharmony_ci h0 = st->h[0]; 162e1051a39Sopenharmony_ci h1 = st->h[1]; 163e1051a39Sopenharmony_ci h2 = st->h[2]; 164e1051a39Sopenharmony_ci 165e1051a39Sopenharmony_ci while (len >= POLY1305_BLOCK_SIZE) { 166e1051a39Sopenharmony_ci /* h += m[i] */ 167e1051a39Sopenharmony_ci h0 = (u64)(d0 = (u128)h0 + U8TOU64(inp + 0)); 168e1051a39Sopenharmony_ci h1 = (u64)(d1 = (u128)h1 + (d0 >> 64) + U8TOU64(inp + 8)); 169e1051a39Sopenharmony_ci /* 170e1051a39Sopenharmony_ci * padbit can be zero only when original len was 171e1051a39Sopenharmony_ci * POLY1306_BLOCK_SIZE, but we don't check 172e1051a39Sopenharmony_ci */ 173e1051a39Sopenharmony_ci h2 += (u64)(d1 >> 64) + padbit; 174e1051a39Sopenharmony_ci 175e1051a39Sopenharmony_ci /* h *= r "%" p, where "%" stands for "partial remainder" */ 176e1051a39Sopenharmony_ci d0 = ((u128)h0 * r0) + 177e1051a39Sopenharmony_ci ((u128)h1 * s1); 178e1051a39Sopenharmony_ci d1 = ((u128)h0 * r1) + 179e1051a39Sopenharmony_ci ((u128)h1 * r0) + 180e1051a39Sopenharmony_ci (h2 * s1); 181e1051a39Sopenharmony_ci h2 = (h2 * r0); 182e1051a39Sopenharmony_ci 183e1051a39Sopenharmony_ci /* last reduction step: */ 184e1051a39Sopenharmony_ci /* a) h2:h0 = h2<<128 + d1<<64 + d0 */ 185e1051a39Sopenharmony_ci h0 = (u64)d0; 186e1051a39Sopenharmony_ci h1 = (u64)(d1 += d0 >> 64); 187e1051a39Sopenharmony_ci h2 += (u64)(d1 >> 64); 188e1051a39Sopenharmony_ci /* b) (h2:h0 += (h2:h0>>130) * 5) %= 2^130 */ 189e1051a39Sopenharmony_ci c = (h2 >> 2) + (h2 & ~3UL); 190e1051a39Sopenharmony_ci h2 &= 3; 191e1051a39Sopenharmony_ci h0 += c; 192e1051a39Sopenharmony_ci h1 += (c = CONSTANT_TIME_CARRY(h0,c)); 193e1051a39Sopenharmony_ci h2 += CONSTANT_TIME_CARRY(h1,c); 194e1051a39Sopenharmony_ci /* 195e1051a39Sopenharmony_ci * Occasional overflows to 3rd bit of h2 are taken care of 196e1051a39Sopenharmony_ci * "naturally". If after this point we end up at the top of 197e1051a39Sopenharmony_ci * this loop, then the overflow bit will be accounted for 198e1051a39Sopenharmony_ci * in next iteration. If we end up in poly1305_emit, then 199e1051a39Sopenharmony_ci * comparison to modulus below will still count as "carry 200e1051a39Sopenharmony_ci * into 131st bit", so that properly reduced value will be 201e1051a39Sopenharmony_ci * picked in conditional move. 202e1051a39Sopenharmony_ci */ 203e1051a39Sopenharmony_ci 204e1051a39Sopenharmony_ci inp += POLY1305_BLOCK_SIZE; 205e1051a39Sopenharmony_ci len -= POLY1305_BLOCK_SIZE; 206e1051a39Sopenharmony_ci } 207e1051a39Sopenharmony_ci 208e1051a39Sopenharmony_ci st->h[0] = h0; 209e1051a39Sopenharmony_ci st->h[1] = h1; 210e1051a39Sopenharmony_ci st->h[2] = h2; 211e1051a39Sopenharmony_ci} 212e1051a39Sopenharmony_ci 213e1051a39Sopenharmony_cistatic void poly1305_emit(void *ctx, unsigned char mac[16], 214e1051a39Sopenharmony_ci const u32 nonce[4]) 215e1051a39Sopenharmony_ci{ 216e1051a39Sopenharmony_ci poly1305_internal *st = (poly1305_internal *) ctx; 217e1051a39Sopenharmony_ci u64 h0, h1, h2; 218e1051a39Sopenharmony_ci u64 g0, g1, g2; 219e1051a39Sopenharmony_ci u128 t; 220e1051a39Sopenharmony_ci u64 mask; 221e1051a39Sopenharmony_ci 222e1051a39Sopenharmony_ci h0 = st->h[0]; 223e1051a39Sopenharmony_ci h1 = st->h[1]; 224e1051a39Sopenharmony_ci h2 = st->h[2]; 225e1051a39Sopenharmony_ci 226e1051a39Sopenharmony_ci /* compare to modulus by computing h + -p */ 227e1051a39Sopenharmony_ci g0 = (u64)(t = (u128)h0 + 5); 228e1051a39Sopenharmony_ci g1 = (u64)(t = (u128)h1 + (t >> 64)); 229e1051a39Sopenharmony_ci g2 = h2 + (u64)(t >> 64); 230e1051a39Sopenharmony_ci 231e1051a39Sopenharmony_ci /* if there was carry into 131st bit, h1:h0 = g1:g0 */ 232e1051a39Sopenharmony_ci mask = 0 - (g2 >> 2); 233e1051a39Sopenharmony_ci g0 &= mask; 234e1051a39Sopenharmony_ci g1 &= mask; 235e1051a39Sopenharmony_ci mask = ~mask; 236e1051a39Sopenharmony_ci h0 = (h0 & mask) | g0; 237e1051a39Sopenharmony_ci h1 = (h1 & mask) | g1; 238e1051a39Sopenharmony_ci 239e1051a39Sopenharmony_ci /* mac = (h + nonce) % (2^128) */ 240e1051a39Sopenharmony_ci h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32)); 241e1051a39Sopenharmony_ci h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64)); 242e1051a39Sopenharmony_ci 243e1051a39Sopenharmony_ci U64TO8(mac + 0, h0); 244e1051a39Sopenharmony_ci U64TO8(mac + 8, h1); 245e1051a39Sopenharmony_ci} 246e1051a39Sopenharmony_ci 247e1051a39Sopenharmony_ci# else 248e1051a39Sopenharmony_ci 249e1051a39Sopenharmony_ci# if defined(_WIN32) && !defined(__MINGW32__) 250e1051a39Sopenharmony_citypedef unsigned __int64 u64; 251e1051a39Sopenharmony_ci# elif defined(__arch64__) 252e1051a39Sopenharmony_citypedef unsigned long u64; 253e1051a39Sopenharmony_ci# else 254e1051a39Sopenharmony_citypedef unsigned long long u64; 255e1051a39Sopenharmony_ci# endif 256e1051a39Sopenharmony_ci 257e1051a39Sopenharmony_citypedef struct { 258e1051a39Sopenharmony_ci u32 h[5]; 259e1051a39Sopenharmony_ci u32 r[4]; 260e1051a39Sopenharmony_ci} poly1305_internal; 261e1051a39Sopenharmony_ci 262e1051a39Sopenharmony_ci/* store a 32-bit unsigned integer in little endian */ 263e1051a39Sopenharmony_cistatic void U32TO8(unsigned char *p, unsigned int v) 264e1051a39Sopenharmony_ci{ 265e1051a39Sopenharmony_ci p[0] = (unsigned char)((v) & 0xff); 266e1051a39Sopenharmony_ci p[1] = (unsigned char)((v >> 8) & 0xff); 267e1051a39Sopenharmony_ci p[2] = (unsigned char)((v >> 16) & 0xff); 268e1051a39Sopenharmony_ci p[3] = (unsigned char)((v >> 24) & 0xff); 269e1051a39Sopenharmony_ci} 270e1051a39Sopenharmony_ci 271e1051a39Sopenharmony_cistatic void poly1305_init(void *ctx, const unsigned char key[16]) 272e1051a39Sopenharmony_ci{ 273e1051a39Sopenharmony_ci poly1305_internal *st = (poly1305_internal *) ctx; 274e1051a39Sopenharmony_ci 275e1051a39Sopenharmony_ci /* h = 0 */ 276e1051a39Sopenharmony_ci st->h[0] = 0; 277e1051a39Sopenharmony_ci st->h[1] = 0; 278e1051a39Sopenharmony_ci st->h[2] = 0; 279e1051a39Sopenharmony_ci st->h[3] = 0; 280e1051a39Sopenharmony_ci st->h[4] = 0; 281e1051a39Sopenharmony_ci 282e1051a39Sopenharmony_ci /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 283e1051a39Sopenharmony_ci st->r[0] = U8TOU32(&key[0]) & 0x0fffffff; 284e1051a39Sopenharmony_ci st->r[1] = U8TOU32(&key[4]) & 0x0ffffffc; 285e1051a39Sopenharmony_ci st->r[2] = U8TOU32(&key[8]) & 0x0ffffffc; 286e1051a39Sopenharmony_ci st->r[3] = U8TOU32(&key[12]) & 0x0ffffffc; 287e1051a39Sopenharmony_ci} 288e1051a39Sopenharmony_ci 289e1051a39Sopenharmony_cistatic void 290e1051a39Sopenharmony_cipoly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) 291e1051a39Sopenharmony_ci{ 292e1051a39Sopenharmony_ci poly1305_internal *st = (poly1305_internal *)ctx; 293e1051a39Sopenharmony_ci u32 r0, r1, r2, r3; 294e1051a39Sopenharmony_ci u32 s1, s2, s3; 295e1051a39Sopenharmony_ci u32 h0, h1, h2, h3, h4, c; 296e1051a39Sopenharmony_ci u64 d0, d1, d2, d3; 297e1051a39Sopenharmony_ci 298e1051a39Sopenharmony_ci r0 = st->r[0]; 299e1051a39Sopenharmony_ci r1 = st->r[1]; 300e1051a39Sopenharmony_ci r2 = st->r[2]; 301e1051a39Sopenharmony_ci r3 = st->r[3]; 302e1051a39Sopenharmony_ci 303e1051a39Sopenharmony_ci s1 = r1 + (r1 >> 2); 304e1051a39Sopenharmony_ci s2 = r2 + (r2 >> 2); 305e1051a39Sopenharmony_ci s3 = r3 + (r3 >> 2); 306e1051a39Sopenharmony_ci 307e1051a39Sopenharmony_ci h0 = st->h[0]; 308e1051a39Sopenharmony_ci h1 = st->h[1]; 309e1051a39Sopenharmony_ci h2 = st->h[2]; 310e1051a39Sopenharmony_ci h3 = st->h[3]; 311e1051a39Sopenharmony_ci h4 = st->h[4]; 312e1051a39Sopenharmony_ci 313e1051a39Sopenharmony_ci while (len >= POLY1305_BLOCK_SIZE) { 314e1051a39Sopenharmony_ci /* h += m[i] */ 315e1051a39Sopenharmony_ci h0 = (u32)(d0 = (u64)h0 + U8TOU32(inp + 0)); 316e1051a39Sopenharmony_ci h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + U8TOU32(inp + 4)); 317e1051a39Sopenharmony_ci h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + U8TOU32(inp + 8)); 318e1051a39Sopenharmony_ci h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + U8TOU32(inp + 12)); 319e1051a39Sopenharmony_ci h4 += (u32)(d3 >> 32) + padbit; 320e1051a39Sopenharmony_ci 321e1051a39Sopenharmony_ci /* h *= r "%" p, where "%" stands for "partial remainder" */ 322e1051a39Sopenharmony_ci d0 = ((u64)h0 * r0) + 323e1051a39Sopenharmony_ci ((u64)h1 * s3) + 324e1051a39Sopenharmony_ci ((u64)h2 * s2) + 325e1051a39Sopenharmony_ci ((u64)h3 * s1); 326e1051a39Sopenharmony_ci d1 = ((u64)h0 * r1) + 327e1051a39Sopenharmony_ci ((u64)h1 * r0) + 328e1051a39Sopenharmony_ci ((u64)h2 * s3) + 329e1051a39Sopenharmony_ci ((u64)h3 * s2) + 330e1051a39Sopenharmony_ci (h4 * s1); 331e1051a39Sopenharmony_ci d2 = ((u64)h0 * r2) + 332e1051a39Sopenharmony_ci ((u64)h1 * r1) + 333e1051a39Sopenharmony_ci ((u64)h2 * r0) + 334e1051a39Sopenharmony_ci ((u64)h3 * s3) + 335e1051a39Sopenharmony_ci (h4 * s2); 336e1051a39Sopenharmony_ci d3 = ((u64)h0 * r3) + 337e1051a39Sopenharmony_ci ((u64)h1 * r2) + 338e1051a39Sopenharmony_ci ((u64)h2 * r1) + 339e1051a39Sopenharmony_ci ((u64)h3 * r0) + 340e1051a39Sopenharmony_ci (h4 * s3); 341e1051a39Sopenharmony_ci h4 = (h4 * r0); 342e1051a39Sopenharmony_ci 343e1051a39Sopenharmony_ci /* last reduction step: */ 344e1051a39Sopenharmony_ci /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */ 345e1051a39Sopenharmony_ci h0 = (u32)d0; 346e1051a39Sopenharmony_ci h1 = (u32)(d1 += d0 >> 32); 347e1051a39Sopenharmony_ci h2 = (u32)(d2 += d1 >> 32); 348e1051a39Sopenharmony_ci h3 = (u32)(d3 += d2 >> 32); 349e1051a39Sopenharmony_ci h4 += (u32)(d3 >> 32); 350e1051a39Sopenharmony_ci /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */ 351e1051a39Sopenharmony_ci c = (h4 >> 2) + (h4 & ~3U); 352e1051a39Sopenharmony_ci h4 &= 3; 353e1051a39Sopenharmony_ci h0 += c; 354e1051a39Sopenharmony_ci h1 += (c = CONSTANT_TIME_CARRY(h0,c)); 355e1051a39Sopenharmony_ci h2 += (c = CONSTANT_TIME_CARRY(h1,c)); 356e1051a39Sopenharmony_ci h3 += (c = CONSTANT_TIME_CARRY(h2,c)); 357e1051a39Sopenharmony_ci h4 += CONSTANT_TIME_CARRY(h3,c); 358e1051a39Sopenharmony_ci /* 359e1051a39Sopenharmony_ci * Occasional overflows to 3rd bit of h4 are taken care of 360e1051a39Sopenharmony_ci * "naturally". If after this point we end up at the top of 361e1051a39Sopenharmony_ci * this loop, then the overflow bit will be accounted for 362e1051a39Sopenharmony_ci * in next iteration. If we end up in poly1305_emit, then 363e1051a39Sopenharmony_ci * comparison to modulus below will still count as "carry 364e1051a39Sopenharmony_ci * into 131st bit", so that properly reduced value will be 365e1051a39Sopenharmony_ci * picked in conditional move. 366e1051a39Sopenharmony_ci */ 367e1051a39Sopenharmony_ci 368e1051a39Sopenharmony_ci inp += POLY1305_BLOCK_SIZE; 369e1051a39Sopenharmony_ci len -= POLY1305_BLOCK_SIZE; 370e1051a39Sopenharmony_ci } 371e1051a39Sopenharmony_ci 372e1051a39Sopenharmony_ci st->h[0] = h0; 373e1051a39Sopenharmony_ci st->h[1] = h1; 374e1051a39Sopenharmony_ci st->h[2] = h2; 375e1051a39Sopenharmony_ci st->h[3] = h3; 376e1051a39Sopenharmony_ci st->h[4] = h4; 377e1051a39Sopenharmony_ci} 378e1051a39Sopenharmony_ci 379e1051a39Sopenharmony_cistatic void poly1305_emit(void *ctx, unsigned char mac[16], 380e1051a39Sopenharmony_ci const u32 nonce[4]) 381e1051a39Sopenharmony_ci{ 382e1051a39Sopenharmony_ci poly1305_internal *st = (poly1305_internal *) ctx; 383e1051a39Sopenharmony_ci u32 h0, h1, h2, h3, h4; 384e1051a39Sopenharmony_ci u32 g0, g1, g2, g3, g4; 385e1051a39Sopenharmony_ci u64 t; 386e1051a39Sopenharmony_ci u32 mask; 387e1051a39Sopenharmony_ci 388e1051a39Sopenharmony_ci h0 = st->h[0]; 389e1051a39Sopenharmony_ci h1 = st->h[1]; 390e1051a39Sopenharmony_ci h2 = st->h[2]; 391e1051a39Sopenharmony_ci h3 = st->h[3]; 392e1051a39Sopenharmony_ci h4 = st->h[4]; 393e1051a39Sopenharmony_ci 394e1051a39Sopenharmony_ci /* compare to modulus by computing h + -p */ 395e1051a39Sopenharmony_ci g0 = (u32)(t = (u64)h0 + 5); 396e1051a39Sopenharmony_ci g1 = (u32)(t = (u64)h1 + (t >> 32)); 397e1051a39Sopenharmony_ci g2 = (u32)(t = (u64)h2 + (t >> 32)); 398e1051a39Sopenharmony_ci g3 = (u32)(t = (u64)h3 + (t >> 32)); 399e1051a39Sopenharmony_ci g4 = h4 + (u32)(t >> 32); 400e1051a39Sopenharmony_ci 401e1051a39Sopenharmony_ci /* if there was carry into 131st bit, h3:h0 = g3:g0 */ 402e1051a39Sopenharmony_ci mask = 0 - (g4 >> 2); 403e1051a39Sopenharmony_ci g0 &= mask; 404e1051a39Sopenharmony_ci g1 &= mask; 405e1051a39Sopenharmony_ci g2 &= mask; 406e1051a39Sopenharmony_ci g3 &= mask; 407e1051a39Sopenharmony_ci mask = ~mask; 408e1051a39Sopenharmony_ci h0 = (h0 & mask) | g0; 409e1051a39Sopenharmony_ci h1 = (h1 & mask) | g1; 410e1051a39Sopenharmony_ci h2 = (h2 & mask) | g2; 411e1051a39Sopenharmony_ci h3 = (h3 & mask) | g3; 412e1051a39Sopenharmony_ci 413e1051a39Sopenharmony_ci /* mac = (h + nonce) % (2^128) */ 414e1051a39Sopenharmony_ci h0 = (u32)(t = (u64)h0 + nonce[0]); 415e1051a39Sopenharmony_ci h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]); 416e1051a39Sopenharmony_ci h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]); 417e1051a39Sopenharmony_ci h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]); 418e1051a39Sopenharmony_ci 419e1051a39Sopenharmony_ci U32TO8(mac + 0, h0); 420e1051a39Sopenharmony_ci U32TO8(mac + 4, h1); 421e1051a39Sopenharmony_ci U32TO8(mac + 8, h2); 422e1051a39Sopenharmony_ci U32TO8(mac + 12, h3); 423e1051a39Sopenharmony_ci} 424e1051a39Sopenharmony_ci# endif 425e1051a39Sopenharmony_ci#else 426e1051a39Sopenharmony_ciint poly1305_init(void *ctx, const unsigned char key[16], void *func); 427e1051a39Sopenharmony_civoid poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, 428e1051a39Sopenharmony_ci unsigned int padbit); 429e1051a39Sopenharmony_civoid poly1305_emit(void *ctx, unsigned char mac[16], 430e1051a39Sopenharmony_ci const unsigned int nonce[4]); 431e1051a39Sopenharmony_ci#endif 432e1051a39Sopenharmony_ci 433e1051a39Sopenharmony_civoid Poly1305_Init(POLY1305 *ctx, const unsigned char key[32]) 434e1051a39Sopenharmony_ci{ 435e1051a39Sopenharmony_ci ctx->nonce[0] = U8TOU32(&key[16]); 436e1051a39Sopenharmony_ci ctx->nonce[1] = U8TOU32(&key[20]); 437e1051a39Sopenharmony_ci ctx->nonce[2] = U8TOU32(&key[24]); 438e1051a39Sopenharmony_ci ctx->nonce[3] = U8TOU32(&key[28]); 439e1051a39Sopenharmony_ci 440e1051a39Sopenharmony_ci#ifndef POLY1305_ASM 441e1051a39Sopenharmony_ci poly1305_init(ctx->opaque, key); 442e1051a39Sopenharmony_ci#else 443e1051a39Sopenharmony_ci /* 444e1051a39Sopenharmony_ci * Unlike reference poly1305_init assembly counterpart is expected 445e1051a39Sopenharmony_ci * to return a value: non-zero if it initializes ctx->func, and zero 446e1051a39Sopenharmony_ci * otherwise. Latter is to simplify assembly in cases when there no 447e1051a39Sopenharmony_ci * multiple code paths to switch between. 448e1051a39Sopenharmony_ci */ 449e1051a39Sopenharmony_ci if (!poly1305_init(ctx->opaque, key, &ctx->func)) { 450e1051a39Sopenharmony_ci ctx->func.blocks = poly1305_blocks; 451e1051a39Sopenharmony_ci ctx->func.emit = poly1305_emit; 452e1051a39Sopenharmony_ci } 453e1051a39Sopenharmony_ci#endif 454e1051a39Sopenharmony_ci 455e1051a39Sopenharmony_ci ctx->num = 0; 456e1051a39Sopenharmony_ci 457e1051a39Sopenharmony_ci} 458e1051a39Sopenharmony_ci 459e1051a39Sopenharmony_ci#ifdef POLY1305_ASM 460e1051a39Sopenharmony_ci/* 461e1051a39Sopenharmony_ci * This "eclipses" poly1305_blocks and poly1305_emit, but it's 462e1051a39Sopenharmony_ci * conscious choice imposed by -Wshadow compiler warnings. 463e1051a39Sopenharmony_ci */ 464e1051a39Sopenharmony_ci# define poly1305_blocks (*poly1305_blocks_p) 465e1051a39Sopenharmony_ci# define poly1305_emit (*poly1305_emit_p) 466e1051a39Sopenharmony_ci#endif 467e1051a39Sopenharmony_ci 468e1051a39Sopenharmony_civoid Poly1305_Update(POLY1305 *ctx, const unsigned char *inp, size_t len) 469e1051a39Sopenharmony_ci{ 470e1051a39Sopenharmony_ci#ifdef POLY1305_ASM 471e1051a39Sopenharmony_ci /* 472e1051a39Sopenharmony_ci * As documented, poly1305_blocks is never called with input 473e1051a39Sopenharmony_ci * longer than single block and padbit argument set to 0. This 474e1051a39Sopenharmony_ci * property is fluently used in assembly modules to optimize 475e1051a39Sopenharmony_ci * padbit handling on loop boundary. 476e1051a39Sopenharmony_ci */ 477e1051a39Sopenharmony_ci poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks; 478e1051a39Sopenharmony_ci#endif 479e1051a39Sopenharmony_ci size_t rem, num; 480e1051a39Sopenharmony_ci 481e1051a39Sopenharmony_ci if ((num = ctx->num)) { 482e1051a39Sopenharmony_ci rem = POLY1305_BLOCK_SIZE - num; 483e1051a39Sopenharmony_ci if (len >= rem) { 484e1051a39Sopenharmony_ci memcpy(ctx->data + num, inp, rem); 485e1051a39Sopenharmony_ci poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1); 486e1051a39Sopenharmony_ci inp += rem; 487e1051a39Sopenharmony_ci len -= rem; 488e1051a39Sopenharmony_ci } else { 489e1051a39Sopenharmony_ci /* Still not enough data to process a block. */ 490e1051a39Sopenharmony_ci memcpy(ctx->data + num, inp, len); 491e1051a39Sopenharmony_ci ctx->num = num + len; 492e1051a39Sopenharmony_ci return; 493e1051a39Sopenharmony_ci } 494e1051a39Sopenharmony_ci } 495e1051a39Sopenharmony_ci 496e1051a39Sopenharmony_ci rem = len % POLY1305_BLOCK_SIZE; 497e1051a39Sopenharmony_ci len -= rem; 498e1051a39Sopenharmony_ci 499e1051a39Sopenharmony_ci if (len >= POLY1305_BLOCK_SIZE) { 500e1051a39Sopenharmony_ci poly1305_blocks(ctx->opaque, inp, len, 1); 501e1051a39Sopenharmony_ci inp += len; 502e1051a39Sopenharmony_ci } 503e1051a39Sopenharmony_ci 504e1051a39Sopenharmony_ci if (rem) 505e1051a39Sopenharmony_ci memcpy(ctx->data, inp, rem); 506e1051a39Sopenharmony_ci 507e1051a39Sopenharmony_ci ctx->num = rem; 508e1051a39Sopenharmony_ci} 509e1051a39Sopenharmony_ci 510e1051a39Sopenharmony_civoid Poly1305_Final(POLY1305 *ctx, unsigned char mac[16]) 511e1051a39Sopenharmony_ci{ 512e1051a39Sopenharmony_ci#ifdef POLY1305_ASM 513e1051a39Sopenharmony_ci poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks; 514e1051a39Sopenharmony_ci poly1305_emit_f poly1305_emit_p = ctx->func.emit; 515e1051a39Sopenharmony_ci#endif 516e1051a39Sopenharmony_ci size_t num; 517e1051a39Sopenharmony_ci 518e1051a39Sopenharmony_ci if ((num = ctx->num)) { 519e1051a39Sopenharmony_ci ctx->data[num++] = 1; /* pad bit */ 520e1051a39Sopenharmony_ci while (num < POLY1305_BLOCK_SIZE) 521e1051a39Sopenharmony_ci ctx->data[num++] = 0; 522e1051a39Sopenharmony_ci poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0); 523e1051a39Sopenharmony_ci } 524e1051a39Sopenharmony_ci 525e1051a39Sopenharmony_ci poly1305_emit(ctx->opaque, mac, ctx->nonce); 526e1051a39Sopenharmony_ci 527e1051a39Sopenharmony_ci /* zero out the state */ 528e1051a39Sopenharmony_ci OPENSSL_cleanse(ctx, sizeof(*ctx)); 529e1051a39Sopenharmony_ci} 530