1e1051a39Sopenharmony_ci/* 2e1051a39Sopenharmony_ci * Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved. 3e1051a39Sopenharmony_ci * Copyright 2014 Cryptography Research, Inc. 4e1051a39Sopenharmony_ci * 5e1051a39Sopenharmony_ci * Licensed under the Apache License 2.0 (the "License"). You may not use 6e1051a39Sopenharmony_ci * this file except in compliance with the License. You can obtain a copy 7e1051a39Sopenharmony_ci * in the file LICENSE in the source distribution or at 8e1051a39Sopenharmony_ci * https://www.openssl.org/source/license.html 9e1051a39Sopenharmony_ci * 10e1051a39Sopenharmony_ci * Originally written by Mike Hamburg 11e1051a39Sopenharmony_ci */ 12e1051a39Sopenharmony_ci 13e1051a39Sopenharmony_ci#include "e_os.h" 14e1051a39Sopenharmony_ci#include <openssl/macros.h> 15e1051a39Sopenharmony_ci#include "internal/numbers.h" 16e1051a39Sopenharmony_ci 17e1051a39Sopenharmony_ci#ifndef UINT128_MAX 18e1051a39Sopenharmony_ci/* No support for 128 bit ints, so do nothing here */ 19e1051a39Sopenharmony_ciNON_EMPTY_TRANSLATION_UNIT 20e1051a39Sopenharmony_ci#else 21e1051a39Sopenharmony_ci 22e1051a39Sopenharmony_ci# include "../field.h" 23e1051a39Sopenharmony_ci 24e1051a39Sopenharmony_civoid gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) 25e1051a39Sopenharmony_ci{ 26e1051a39Sopenharmony_ci const uint64_t *a = as->limb, *b = bs->limb; 27e1051a39Sopenharmony_ci uint64_t *c = cs->limb; 28e1051a39Sopenharmony_ci uint128_t accum0 = 0, accum1 = 0, accum2; 29e1051a39Sopenharmony_ci uint64_t mask = (1ULL << 56) - 1; 30e1051a39Sopenharmony_ci uint64_t aa[4], bb[4], bbb[4]; 31e1051a39Sopenharmony_ci unsigned int i, j; 32e1051a39Sopenharmony_ci 33e1051a39Sopenharmony_ci for (i = 0; i < 4; i++) { 34e1051a39Sopenharmony_ci aa[i] = a[i] + a[i + 4]; 35e1051a39Sopenharmony_ci bb[i] = b[i] + b[i + 4]; 36e1051a39Sopenharmony_ci bbb[i] = bb[i] + b[i + 4]; 37e1051a39Sopenharmony_ci } 38e1051a39Sopenharmony_ci 39e1051a39Sopenharmony_ci for (i = 0; i < 4; i++) { 40e1051a39Sopenharmony_ci accum2 = 0; 41e1051a39Sopenharmony_ci 42e1051a39Sopenharmony_ci for (j = 0; j <= i; j++) { 43e1051a39Sopenharmony_ci accum2 += widemul(a[j], b[i - j]); 44e1051a39Sopenharmony_ci accum1 += widemul(aa[j], bb[i - j]); 45e1051a39Sopenharmony_ci accum0 += widemul(a[j + 4], b[i - j + 4]); 46e1051a39Sopenharmony_ci } 47e1051a39Sopenharmony_ci for (; j < 4; j++) { 48e1051a39Sopenharmony_ci accum2 += widemul(a[j], b[i - j + 8]); 49e1051a39Sopenharmony_ci accum1 += widemul(aa[j], bbb[i - j + 4]); 50e1051a39Sopenharmony_ci accum0 += widemul(a[j + 4], bb[i - j + 4]); 51e1051a39Sopenharmony_ci } 52e1051a39Sopenharmony_ci 53e1051a39Sopenharmony_ci accum1 -= accum2; 54e1051a39Sopenharmony_ci accum0 += accum2; 55e1051a39Sopenharmony_ci 56e1051a39Sopenharmony_ci c[i] = ((uint64_t)(accum0)) & mask; 57e1051a39Sopenharmony_ci c[i + 4] = ((uint64_t)(accum1)) & mask; 58e1051a39Sopenharmony_ci 59e1051a39Sopenharmony_ci accum0 >>= 56; 60e1051a39Sopenharmony_ci accum1 >>= 56; 61e1051a39Sopenharmony_ci } 62e1051a39Sopenharmony_ci 63e1051a39Sopenharmony_ci accum0 += accum1; 64e1051a39Sopenharmony_ci accum0 += c[4]; 65e1051a39Sopenharmony_ci accum1 += c[0]; 66e1051a39Sopenharmony_ci c[4] = ((uint64_t)(accum0)) & mask; 67e1051a39Sopenharmony_ci c[0] = ((uint64_t)(accum1)) & mask; 68e1051a39Sopenharmony_ci 69e1051a39Sopenharmony_ci accum0 >>= 56; 70e1051a39Sopenharmony_ci accum1 >>= 56; 71e1051a39Sopenharmony_ci 72e1051a39Sopenharmony_ci c[5] += ((uint64_t)(accum0)); 73e1051a39Sopenharmony_ci c[1] += ((uint64_t)(accum1)); 74e1051a39Sopenharmony_ci} 75e1051a39Sopenharmony_ci 76e1051a39Sopenharmony_civoid gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) 77e1051a39Sopenharmony_ci{ 78e1051a39Sopenharmony_ci const uint64_t *a = as->limb; 79e1051a39Sopenharmony_ci uint64_t *c = cs->limb; 80e1051a39Sopenharmony_ci uint128_t accum0 = 0, accum4 = 0; 81e1051a39Sopenharmony_ci uint64_t mask = (1ULL << 56) - 1; 82e1051a39Sopenharmony_ci int i; 83e1051a39Sopenharmony_ci 84e1051a39Sopenharmony_ci for (i = 0; i < 4; i++) { 85e1051a39Sopenharmony_ci accum0 += widemul(b, a[i]); 86e1051a39Sopenharmony_ci accum4 += widemul(b, a[i + 4]); 87e1051a39Sopenharmony_ci c[i] = accum0 & mask; 88e1051a39Sopenharmony_ci accum0 >>= 56; 89e1051a39Sopenharmony_ci c[i + 4] = accum4 & mask; 90e1051a39Sopenharmony_ci accum4 >>= 56; 91e1051a39Sopenharmony_ci } 92e1051a39Sopenharmony_ci 93e1051a39Sopenharmony_ci accum0 += accum4 + c[4]; 94e1051a39Sopenharmony_ci c[4] = accum0 & mask; 95e1051a39Sopenharmony_ci c[5] += accum0 >> 56; 96e1051a39Sopenharmony_ci 97e1051a39Sopenharmony_ci accum4 += c[0]; 98e1051a39Sopenharmony_ci c[0] = accum4 & mask; 99e1051a39Sopenharmony_ci c[1] += accum4 >> 56; 100e1051a39Sopenharmony_ci} 101e1051a39Sopenharmony_ci 102e1051a39Sopenharmony_civoid gf_sqr(gf_s * RESTRICT cs, const gf as) 103e1051a39Sopenharmony_ci{ 104e1051a39Sopenharmony_ci const uint64_t *a = as->limb; 105e1051a39Sopenharmony_ci uint64_t *c = cs->limb; 106e1051a39Sopenharmony_ci uint128_t accum0 = 0, accum1 = 0, accum2; 107e1051a39Sopenharmony_ci uint64_t mask = (1ULL << 56) - 1; 108e1051a39Sopenharmony_ci uint64_t aa[4]; 109e1051a39Sopenharmony_ci unsigned int i; 110e1051a39Sopenharmony_ci 111e1051a39Sopenharmony_ci /* For some reason clang doesn't vectorize this without prompting? */ 112e1051a39Sopenharmony_ci for (i = 0; i < 4; i++) 113e1051a39Sopenharmony_ci aa[i] = a[i] + a[i + 4]; 114e1051a39Sopenharmony_ci 115e1051a39Sopenharmony_ci accum2 = widemul(a[0], a[3]); 116e1051a39Sopenharmony_ci accum0 = widemul(aa[0], aa[3]); 117e1051a39Sopenharmony_ci accum1 = widemul(a[4], a[7]); 118e1051a39Sopenharmony_ci 119e1051a39Sopenharmony_ci accum2 += widemul(a[1], a[2]); 120e1051a39Sopenharmony_ci accum0 += widemul(aa[1], aa[2]); 121e1051a39Sopenharmony_ci accum1 += widemul(a[5], a[6]); 122e1051a39Sopenharmony_ci 123e1051a39Sopenharmony_ci accum0 -= accum2; 124e1051a39Sopenharmony_ci accum1 += accum2; 125e1051a39Sopenharmony_ci 126e1051a39Sopenharmony_ci c[3] = ((uint64_t)(accum1)) << 1 & mask; 127e1051a39Sopenharmony_ci c[7] = ((uint64_t)(accum0)) << 1 & mask; 128e1051a39Sopenharmony_ci 129e1051a39Sopenharmony_ci accum0 >>= 55; 130e1051a39Sopenharmony_ci accum1 >>= 55; 131e1051a39Sopenharmony_ci 132e1051a39Sopenharmony_ci accum0 += widemul(2 * aa[1], aa[3]); 133e1051a39Sopenharmony_ci accum1 += widemul(2 * a[5], a[7]); 134e1051a39Sopenharmony_ci accum0 += widemul(aa[2], aa[2]); 135e1051a39Sopenharmony_ci accum1 += accum0; 136e1051a39Sopenharmony_ci 137e1051a39Sopenharmony_ci accum0 -= widemul(2 * a[1], a[3]); 138e1051a39Sopenharmony_ci accum1 += widemul(a[6], a[6]); 139e1051a39Sopenharmony_ci 140e1051a39Sopenharmony_ci accum2 = widemul(a[0], a[0]); 141e1051a39Sopenharmony_ci accum1 -= accum2; 142e1051a39Sopenharmony_ci accum0 += accum2; 143e1051a39Sopenharmony_ci 144e1051a39Sopenharmony_ci accum0 -= widemul(a[2], a[2]); 145e1051a39Sopenharmony_ci accum1 += widemul(aa[0], aa[0]); 146e1051a39Sopenharmony_ci accum0 += widemul(a[4], a[4]); 147e1051a39Sopenharmony_ci 148e1051a39Sopenharmony_ci c[0] = ((uint64_t)(accum0)) & mask; 149e1051a39Sopenharmony_ci c[4] = ((uint64_t)(accum1)) & mask; 150e1051a39Sopenharmony_ci 151e1051a39Sopenharmony_ci accum0 >>= 56; 152e1051a39Sopenharmony_ci accum1 >>= 56; 153e1051a39Sopenharmony_ci 154e1051a39Sopenharmony_ci accum2 = widemul(2 * aa[2], aa[3]); 155e1051a39Sopenharmony_ci accum0 -= widemul(2 * a[2], a[3]); 156e1051a39Sopenharmony_ci accum1 += widemul(2 * a[6], a[7]); 157e1051a39Sopenharmony_ci 158e1051a39Sopenharmony_ci accum1 += accum2; 159e1051a39Sopenharmony_ci accum0 += accum2; 160e1051a39Sopenharmony_ci 161e1051a39Sopenharmony_ci accum2 = widemul(2 * a[0], a[1]); 162e1051a39Sopenharmony_ci accum1 += widemul(2 * aa[0], aa[1]); 163e1051a39Sopenharmony_ci accum0 += widemul(2 * a[4], a[5]); 164e1051a39Sopenharmony_ci 165e1051a39Sopenharmony_ci accum1 -= accum2; 166e1051a39Sopenharmony_ci accum0 += accum2; 167e1051a39Sopenharmony_ci 168e1051a39Sopenharmony_ci c[1] = ((uint64_t)(accum0)) & mask; 169e1051a39Sopenharmony_ci c[5] = ((uint64_t)(accum1)) & mask; 170e1051a39Sopenharmony_ci 171e1051a39Sopenharmony_ci accum0 >>= 56; 172e1051a39Sopenharmony_ci accum1 >>= 56; 173e1051a39Sopenharmony_ci 174e1051a39Sopenharmony_ci accum2 = widemul(aa[3], aa[3]); 175e1051a39Sopenharmony_ci accum0 -= widemul(a[3], a[3]); 176e1051a39Sopenharmony_ci accum1 += widemul(a[7], a[7]); 177e1051a39Sopenharmony_ci 178e1051a39Sopenharmony_ci accum1 += accum2; 179e1051a39Sopenharmony_ci accum0 += accum2; 180e1051a39Sopenharmony_ci 181e1051a39Sopenharmony_ci accum2 = widemul(2 * a[0], a[2]); 182e1051a39Sopenharmony_ci accum1 += widemul(2 * aa[0], aa[2]); 183e1051a39Sopenharmony_ci accum0 += widemul(2 * a[4], a[6]); 184e1051a39Sopenharmony_ci 185e1051a39Sopenharmony_ci accum2 += widemul(a[1], a[1]); 186e1051a39Sopenharmony_ci accum1 += widemul(aa[1], aa[1]); 187e1051a39Sopenharmony_ci accum0 += widemul(a[5], a[5]); 188e1051a39Sopenharmony_ci 189e1051a39Sopenharmony_ci accum1 -= accum2; 190e1051a39Sopenharmony_ci accum0 += accum2; 191e1051a39Sopenharmony_ci 192e1051a39Sopenharmony_ci c[2] = ((uint64_t)(accum0)) & mask; 193e1051a39Sopenharmony_ci c[6] = ((uint64_t)(accum1)) & mask; 194e1051a39Sopenharmony_ci 195e1051a39Sopenharmony_ci accum0 >>= 56; 196e1051a39Sopenharmony_ci accum1 >>= 56; 197e1051a39Sopenharmony_ci 198e1051a39Sopenharmony_ci accum0 += c[3]; 199e1051a39Sopenharmony_ci accum1 += c[7]; 200e1051a39Sopenharmony_ci c[3] = ((uint64_t)(accum0)) & mask; 201e1051a39Sopenharmony_ci c[7] = ((uint64_t)(accum1)) & mask; 202e1051a39Sopenharmony_ci 203e1051a39Sopenharmony_ci /* we could almost stop here, but it wouldn't be stable, so... */ 204e1051a39Sopenharmony_ci 205e1051a39Sopenharmony_ci accum0 >>= 56; 206e1051a39Sopenharmony_ci accum1 >>= 56; 207e1051a39Sopenharmony_ci c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); 208e1051a39Sopenharmony_ci c[0] += ((uint64_t)(accum1)); 209e1051a39Sopenharmony_ci} 210e1051a39Sopenharmony_ci#endif 211