/*
 * Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the Apache License 2.0 (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */

13e1051a39Sopenharmony_ci#include "e_os.h"
14e1051a39Sopenharmony_ci#include <openssl/macros.h>
15e1051a39Sopenharmony_ci#include "internal/numbers.h"
16e1051a39Sopenharmony_ci
17e1051a39Sopenharmony_ci#ifndef UINT128_MAX
18e1051a39Sopenharmony_ci/* No support for 128 bit ints, so do nothing here */
19e1051a39Sopenharmony_ciNON_EMPTY_TRANSLATION_UNIT
20e1051a39Sopenharmony_ci#else
21e1051a39Sopenharmony_ci
22e1051a39Sopenharmony_ci# include "../field.h"
23e1051a39Sopenharmony_ci
24e1051a39Sopenharmony_civoid gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
25e1051a39Sopenharmony_ci{
26e1051a39Sopenharmony_ci    const uint64_t *a = as->limb, *b = bs->limb;
27e1051a39Sopenharmony_ci    uint64_t *c = cs->limb;
28e1051a39Sopenharmony_ci    uint128_t accum0 = 0, accum1 = 0, accum2;
29e1051a39Sopenharmony_ci    uint64_t mask = (1ULL << 56) - 1;
30e1051a39Sopenharmony_ci    uint64_t aa[4], bb[4], bbb[4];
31e1051a39Sopenharmony_ci    unsigned int i, j;
32e1051a39Sopenharmony_ci
33e1051a39Sopenharmony_ci    for (i = 0; i < 4; i++) {
34e1051a39Sopenharmony_ci        aa[i] = a[i] + a[i + 4];
35e1051a39Sopenharmony_ci        bb[i] = b[i] + b[i + 4];
36e1051a39Sopenharmony_ci        bbb[i] = bb[i] + b[i + 4];
37e1051a39Sopenharmony_ci    }
38e1051a39Sopenharmony_ci
39e1051a39Sopenharmony_ci    for (i = 0; i < 4; i++) {
40e1051a39Sopenharmony_ci        accum2 = 0;
41e1051a39Sopenharmony_ci
42e1051a39Sopenharmony_ci        for (j = 0; j <= i; j++) {
43e1051a39Sopenharmony_ci            accum2 += widemul(a[j], b[i - j]);
44e1051a39Sopenharmony_ci            accum1 += widemul(aa[j], bb[i - j]);
45e1051a39Sopenharmony_ci            accum0 += widemul(a[j + 4], b[i - j + 4]);
46e1051a39Sopenharmony_ci        }
47e1051a39Sopenharmony_ci        for (; j < 4; j++) {
48e1051a39Sopenharmony_ci            accum2 += widemul(a[j], b[i - j + 8]);
49e1051a39Sopenharmony_ci            accum1 += widemul(aa[j], bbb[i - j + 4]);
50e1051a39Sopenharmony_ci            accum0 += widemul(a[j + 4], bb[i - j + 4]);
51e1051a39Sopenharmony_ci        }
52e1051a39Sopenharmony_ci
53e1051a39Sopenharmony_ci        accum1 -= accum2;
54e1051a39Sopenharmony_ci        accum0 += accum2;
55e1051a39Sopenharmony_ci
56e1051a39Sopenharmony_ci        c[i] = ((uint64_t)(accum0)) & mask;
57e1051a39Sopenharmony_ci        c[i + 4] = ((uint64_t)(accum1)) & mask;
58e1051a39Sopenharmony_ci
59e1051a39Sopenharmony_ci        accum0 >>= 56;
60e1051a39Sopenharmony_ci        accum1 >>= 56;
61e1051a39Sopenharmony_ci    }
62e1051a39Sopenharmony_ci
63e1051a39Sopenharmony_ci    accum0 += accum1;
64e1051a39Sopenharmony_ci    accum0 += c[4];
65e1051a39Sopenharmony_ci    accum1 += c[0];
66e1051a39Sopenharmony_ci    c[4] = ((uint64_t)(accum0)) & mask;
67e1051a39Sopenharmony_ci    c[0] = ((uint64_t)(accum1)) & mask;
68e1051a39Sopenharmony_ci
69e1051a39Sopenharmony_ci    accum0 >>= 56;
70e1051a39Sopenharmony_ci    accum1 >>= 56;
71e1051a39Sopenharmony_ci
72e1051a39Sopenharmony_ci    c[5] += ((uint64_t)(accum0));
73e1051a39Sopenharmony_ci    c[1] += ((uint64_t)(accum1));
74e1051a39Sopenharmony_ci}
75e1051a39Sopenharmony_ci
76e1051a39Sopenharmony_civoid gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
77e1051a39Sopenharmony_ci{
78e1051a39Sopenharmony_ci    const uint64_t *a = as->limb;
79e1051a39Sopenharmony_ci    uint64_t *c = cs->limb;
80e1051a39Sopenharmony_ci    uint128_t accum0 = 0, accum4 = 0;
81e1051a39Sopenharmony_ci    uint64_t mask = (1ULL << 56) - 1;
82e1051a39Sopenharmony_ci    int i;
83e1051a39Sopenharmony_ci
84e1051a39Sopenharmony_ci    for (i = 0; i < 4; i++) {
85e1051a39Sopenharmony_ci        accum0 += widemul(b, a[i]);
86e1051a39Sopenharmony_ci        accum4 += widemul(b, a[i + 4]);
87e1051a39Sopenharmony_ci        c[i] = accum0 & mask;
88e1051a39Sopenharmony_ci        accum0 >>= 56;
89e1051a39Sopenharmony_ci        c[i + 4] = accum4 & mask;
90e1051a39Sopenharmony_ci        accum4 >>= 56;
91e1051a39Sopenharmony_ci    }
92e1051a39Sopenharmony_ci
93e1051a39Sopenharmony_ci    accum0 += accum4 + c[4];
94e1051a39Sopenharmony_ci    c[4] = accum0 & mask;
95e1051a39Sopenharmony_ci    c[5] += accum0 >> 56;
96e1051a39Sopenharmony_ci
97e1051a39Sopenharmony_ci    accum4 += c[0];
98e1051a39Sopenharmony_ci    c[0] = accum4 & mask;
99e1051a39Sopenharmony_ci    c[1] += accum4 >> 56;
100e1051a39Sopenharmony_ci}
101e1051a39Sopenharmony_ci
102e1051a39Sopenharmony_civoid gf_sqr(gf_s * RESTRICT cs, const gf as)
103e1051a39Sopenharmony_ci{
104e1051a39Sopenharmony_ci    const uint64_t *a = as->limb;
105e1051a39Sopenharmony_ci    uint64_t *c = cs->limb;
106e1051a39Sopenharmony_ci    uint128_t accum0 = 0, accum1 = 0, accum2;
107e1051a39Sopenharmony_ci    uint64_t mask = (1ULL << 56) - 1;
108e1051a39Sopenharmony_ci    uint64_t aa[4];
109e1051a39Sopenharmony_ci    unsigned int i;
110e1051a39Sopenharmony_ci
111e1051a39Sopenharmony_ci    /* For some reason clang doesn't vectorize this without prompting? */
112e1051a39Sopenharmony_ci    for (i = 0; i < 4; i++)
113e1051a39Sopenharmony_ci        aa[i] = a[i] + a[i + 4];
114e1051a39Sopenharmony_ci
115e1051a39Sopenharmony_ci    accum2 = widemul(a[0], a[3]);
116e1051a39Sopenharmony_ci    accum0 = widemul(aa[0], aa[3]);
117e1051a39Sopenharmony_ci    accum1 = widemul(a[4], a[7]);
118e1051a39Sopenharmony_ci
119e1051a39Sopenharmony_ci    accum2 += widemul(a[1], a[2]);
120e1051a39Sopenharmony_ci    accum0 += widemul(aa[1], aa[2]);
121e1051a39Sopenharmony_ci    accum1 += widemul(a[5], a[6]);
122e1051a39Sopenharmony_ci
123e1051a39Sopenharmony_ci    accum0 -= accum2;
124e1051a39Sopenharmony_ci    accum1 += accum2;
125e1051a39Sopenharmony_ci
126e1051a39Sopenharmony_ci    c[3] = ((uint64_t)(accum1)) << 1 & mask;
127e1051a39Sopenharmony_ci    c[7] = ((uint64_t)(accum0)) << 1 & mask;
128e1051a39Sopenharmony_ci
129e1051a39Sopenharmony_ci    accum0 >>= 55;
130e1051a39Sopenharmony_ci    accum1 >>= 55;
131e1051a39Sopenharmony_ci
132e1051a39Sopenharmony_ci    accum0 += widemul(2 * aa[1], aa[3]);
133e1051a39Sopenharmony_ci    accum1 += widemul(2 * a[5], a[7]);
134e1051a39Sopenharmony_ci    accum0 += widemul(aa[2], aa[2]);
135e1051a39Sopenharmony_ci    accum1 += accum0;
136e1051a39Sopenharmony_ci
137e1051a39Sopenharmony_ci    accum0 -= widemul(2 * a[1], a[3]);
138e1051a39Sopenharmony_ci    accum1 += widemul(a[6], a[6]);
139e1051a39Sopenharmony_ci
140e1051a39Sopenharmony_ci    accum2 = widemul(a[0], a[0]);
141e1051a39Sopenharmony_ci    accum1 -= accum2;
142e1051a39Sopenharmony_ci    accum0 += accum2;
143e1051a39Sopenharmony_ci
144e1051a39Sopenharmony_ci    accum0 -= widemul(a[2], a[2]);
145e1051a39Sopenharmony_ci    accum1 += widemul(aa[0], aa[0]);
146e1051a39Sopenharmony_ci    accum0 += widemul(a[4], a[4]);
147e1051a39Sopenharmony_ci
148e1051a39Sopenharmony_ci    c[0] = ((uint64_t)(accum0)) & mask;
149e1051a39Sopenharmony_ci    c[4] = ((uint64_t)(accum1)) & mask;
150e1051a39Sopenharmony_ci
151e1051a39Sopenharmony_ci    accum0 >>= 56;
152e1051a39Sopenharmony_ci    accum1 >>= 56;
153e1051a39Sopenharmony_ci
154e1051a39Sopenharmony_ci    accum2 = widemul(2 * aa[2], aa[3]);
155e1051a39Sopenharmony_ci    accum0 -= widemul(2 * a[2], a[3]);
156e1051a39Sopenharmony_ci    accum1 += widemul(2 * a[6], a[7]);
157e1051a39Sopenharmony_ci
158e1051a39Sopenharmony_ci    accum1 += accum2;
159e1051a39Sopenharmony_ci    accum0 += accum2;
160e1051a39Sopenharmony_ci
161e1051a39Sopenharmony_ci    accum2 = widemul(2 * a[0], a[1]);
162e1051a39Sopenharmony_ci    accum1 += widemul(2 * aa[0], aa[1]);
163e1051a39Sopenharmony_ci    accum0 += widemul(2 * a[4], a[5]);
164e1051a39Sopenharmony_ci
165e1051a39Sopenharmony_ci    accum1 -= accum2;
166e1051a39Sopenharmony_ci    accum0 += accum2;
167e1051a39Sopenharmony_ci
168e1051a39Sopenharmony_ci    c[1] = ((uint64_t)(accum0)) & mask;
169e1051a39Sopenharmony_ci    c[5] = ((uint64_t)(accum1)) & mask;
170e1051a39Sopenharmony_ci
171e1051a39Sopenharmony_ci    accum0 >>= 56;
172e1051a39Sopenharmony_ci    accum1 >>= 56;
173e1051a39Sopenharmony_ci
174e1051a39Sopenharmony_ci    accum2 = widemul(aa[3], aa[3]);
175e1051a39Sopenharmony_ci    accum0 -= widemul(a[3], a[3]);
176e1051a39Sopenharmony_ci    accum1 += widemul(a[7], a[7]);
177e1051a39Sopenharmony_ci
178e1051a39Sopenharmony_ci    accum1 += accum2;
179e1051a39Sopenharmony_ci    accum0 += accum2;
180e1051a39Sopenharmony_ci
181e1051a39Sopenharmony_ci    accum2 = widemul(2 * a[0], a[2]);
182e1051a39Sopenharmony_ci    accum1 += widemul(2 * aa[0], aa[2]);
183e1051a39Sopenharmony_ci    accum0 += widemul(2 * a[4], a[6]);
184e1051a39Sopenharmony_ci
185e1051a39Sopenharmony_ci    accum2 += widemul(a[1], a[1]);
186e1051a39Sopenharmony_ci    accum1 += widemul(aa[1], aa[1]);
187e1051a39Sopenharmony_ci    accum0 += widemul(a[5], a[5]);
188e1051a39Sopenharmony_ci
189e1051a39Sopenharmony_ci    accum1 -= accum2;
190e1051a39Sopenharmony_ci    accum0 += accum2;
191e1051a39Sopenharmony_ci
192e1051a39Sopenharmony_ci    c[2] = ((uint64_t)(accum0)) & mask;
193e1051a39Sopenharmony_ci    c[6] = ((uint64_t)(accum1)) & mask;
194e1051a39Sopenharmony_ci
195e1051a39Sopenharmony_ci    accum0 >>= 56;
196e1051a39Sopenharmony_ci    accum1 >>= 56;
197e1051a39Sopenharmony_ci
198e1051a39Sopenharmony_ci    accum0 += c[3];
199e1051a39Sopenharmony_ci    accum1 += c[7];
200e1051a39Sopenharmony_ci    c[3] = ((uint64_t)(accum0)) & mask;
201e1051a39Sopenharmony_ci    c[7] = ((uint64_t)(accum1)) & mask;
202e1051a39Sopenharmony_ci
203e1051a39Sopenharmony_ci    /* we could almost stop here, but it wouldn't be stable, so... */
204e1051a39Sopenharmony_ci
205e1051a39Sopenharmony_ci    accum0 >>= 56;
206e1051a39Sopenharmony_ci    accum1 >>= 56;
207e1051a39Sopenharmony_ci    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
208e1051a39Sopenharmony_ci    c[0] += ((uint64_t)(accum1));
209e1051a39Sopenharmony_ci}
210e1051a39Sopenharmony_ci#endif
211