1e1051a39Sopenharmony_ci/*
2e1051a39Sopenharmony_ci * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci *
4e1051a39Sopenharmony_ci * Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci * this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci * in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci * https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci */
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci/*
 * This module is meant to be used as a template for non-x87 floating-
 * point assembly modules. The template itself is x86_64-specific,
 * though, as it was debugged on x86_64, so an implementor has to
 * recognize the platform-specific parts, UxTOy and inline asm, and
 * act accordingly.
16e1051a39Sopenharmony_ci *
 * Huh? x86_64-specific code as a template for non-x87? Note the seven,
 * which is not a typo, but a reference to 80-bit precision. This module,
 * on the other hand, relies on 64-bit precision operations, which are
 * the default for x86_64 code. And since we are at it, just to give a
 * sense of it, large-block performance in cycles per processed byte for
 * *this* code is:
23e1051a39Sopenharmony_ci *                      gcc-4.8         icc-15.0        clang-3.4(*)
24e1051a39Sopenharmony_ci *
25e1051a39Sopenharmony_ci * Westmere             4.96            5.09            4.37
26e1051a39Sopenharmony_ci * Sandy Bridge         4.95            4.90            4.17
27e1051a39Sopenharmony_ci * Haswell              4.92            4.87            3.78
28e1051a39Sopenharmony_ci * Bulldozer            4.67            4.49            4.68
29e1051a39Sopenharmony_ci * VIA Nano             7.07            7.05            5.98
30e1051a39Sopenharmony_ci * Silvermont           10.6            9.61            12.6
31e1051a39Sopenharmony_ci *
32e1051a39Sopenharmony_ci * (*)  clang managed to discover parallelism and deployed SIMD;
33e1051a39Sopenharmony_ci *
34e1051a39Sopenharmony_ci * And for range of other platforms with unspecified gcc versions:
35e1051a39Sopenharmony_ci *
36e1051a39Sopenharmony_ci * Freescale e300       12.5
37e1051a39Sopenharmony_ci * PPC74x0              10.8
38e1051a39Sopenharmony_ci * POWER6               4.92
39e1051a39Sopenharmony_ci * POWER7               4.50
40e1051a39Sopenharmony_ci * POWER8               4.10
41e1051a39Sopenharmony_ci *
42e1051a39Sopenharmony_ci * z10                  11.2
43e1051a39Sopenharmony_ci * z196+                7.30
44e1051a39Sopenharmony_ci *
45e1051a39Sopenharmony_ci * UltraSPARC III       16.0
46e1051a39Sopenharmony_ci * SPARC T4             16.1
47e1051a39Sopenharmony_ci */
48e1051a39Sopenharmony_ci
49e1051a39Sopenharmony_ci#if !(defined(__GNUC__) && __GNUC__>=2)
50e1051a39Sopenharmony_ci# error "this is gcc-specific template"
51e1051a39Sopenharmony_ci#endif
52e1051a39Sopenharmony_ci
53e1051a39Sopenharmony_ci#include <stdlib.h>
54e1051a39Sopenharmony_ci
/*
 * Short integer aliases used throughout this file. The names reflect the
 * widths the code below relies on (it masks u32 values with 0xffffffff
 * and shifts u64 values by 52).
 */
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
/*
 * A double overlaid with a 64-bit integer, so that the raw bit pattern of
 * a floating-point value can be written (.u) and then read back as a
 * number (.d), or the other way round.
 */
typedef union { double d; u64 u; } elem64;
59e1051a39Sopenharmony_ci
/*
 * TWO(p) is 2^p as a double. It is built from a 64-bit integer shift, so
 * p must stay below 64; the larger powers are therefore composed as
 * products of smaller ones (every factor is an exact power of two, so the
 * products are exact as well).
 */
#define TWO(p)          ((double)(1ULL << (p)))
#define TWO0            (TWO(0))
#define TWO32           (TWO(32))
#define TWO64           (TWO(32) * TWO(32))
#define TWO96           (TWO(32) * TWO(32) * TWO(32))
#define TWO130          (TWO(32) * TWO(32) * TWO(32) * TWO(34))

/*
 * EXP(p) is the 64-bit pattern with the value 1023+p placed at bit 52 and
 * all lower bits zero; read back through elem64.d it is the double 2^p.
 */
#define EXP(p)          ((0x3ffULL + (p)) << 52)
68e1051a39Sopenharmony_ci
/*
 * Little-endian 32-bit load/store helpers.
 *
 *   U8TOU32(p)   - value of the four bytes at p, p[0] being least significant
 *   U32TO8(p,v)  - store v into the four bytes at p, least significant first
 *
 * Platform-specific fast paths are selected first; any platform without
 * one falls back to the portable byte-by-byte definitions at the bottom.
 */
#if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__))
/*
 * Little-endian hosts: one native 32-bit access is enough. The access is
 * expressed as a fixed-size __builtin_memcpy instead of a u32 pointer
 * cast, because the byte pointers handed in carry no alignment guarantee
 * and reading unsigned char storage through a u32 lvalue violates the
 * strict-aliasing rules.
 */
# define U8TOU32(p)     ({ u32 u8tou32_v_;                                      \
                           __builtin_memcpy(&u8tou32_v_, (p),                   \
                                            sizeof(u8tou32_v_));                \
                           u8tou32_v_; })
# define U32TO8(p,v)    ({ u32 u32to8_v_ = (v);                                 \
                           __builtin_memcpy((p), &u32to8_v_,                    \
                                            sizeof(u32to8_v_));                 \
                           (void)0; })
#elif defined(__PPC__)
/* big-endian PowerPC: byte-reversed load/store instructions */
# define U8TOU32(p)     ({u32 ret; __asm__ ("lwbrx	%0,0,%1":"=r"(ret):"b"(p)); ret; })
# define U32TO8(p,v)    __asm__ ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory")
#elif defined(__s390x__)
/* s390x: load/store reversed */
# define U8TOU32(p)     ({u32 ret; __asm__ ("lrv	%0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; })
# define U32TO8(p,v)    __asm__ ("strv	%1,%0":"=m"(*(u32 *)(p)):"d"(v))
#endif

/* portable fallbacks: assemble / split the value one byte at a time */
#ifndef U8TOU32
# define U8TOU32(p)     ((u32)(p)[0]     | (u32)(p)[1]<<8 |     \
                         (u32)(p)[2]<<16 | (u32)(p)[3]<<24  )
#endif
#ifndef U32TO8
# define U32TO8(p,v)    ((p)[0] = (u8)(v),       (p)[1] = (u8)((v)>>8), \
                         (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) )
#endif
88e1051a39Sopenharmony_ci
/*
 * Layout that the functions below impose on the opaque context pointer
 * they are handed.
 */
typedef struct {
    /* accumulator: four base 2^32 limbs, limb n stored with a 2^(52+32n) bias */
    elem64 h[4];
    /* key r: four base 2^32 limbs, each split into a lo (even index) and hi (odd index) part */
    double r[8];
    /* r limbs 1..3 multiplied by 5/2^130, split into lo/hi parts the same way */
    double s[6];
} poly1305_internal;
94e1051a39Sopenharmony_ci
/* "round toward zero (truncate), mask all exceptions" */
/*
 * Per-platform value that poly1305_init() and poly1305_blocks() load into
 * the floating-point control register while they do their arithmetic;
 * both save the previous register contents first and put them back before
 * returning. Compilation is refused on any platform not listed here.
 */
#if defined(__x86_64__)
static const u32 mxcsr = 0x7f80;        /* loaded with ldmxcsr */
#elif defined(__PPC__)
static const u64 one = 1;               /* reinterpreted as a double and loaded with mtfsf */
#elif defined(__s390x__)
static const u32 fpc = 1;               /* loaded with lfpc */
#elif defined(__sparc__)
static const u64 fsr = 1ULL<<30;        /* loaded with ldx ...,%fsr */
#elif defined(__mips__)
static const u32 fcsr = 1;              /* loaded with ctc1 ...,$31 */
#else
#error "unrecognized platform"
#endif
109e1051a39Sopenharmony_ci
110e1051a39Sopenharmony_ciint poly1305_init(void *ctx, const unsigned char key[16])
111e1051a39Sopenharmony_ci{
112e1051a39Sopenharmony_ci    poly1305_internal *st = (poly1305_internal *) ctx;
113e1051a39Sopenharmony_ci    elem64 r0, r1, r2, r3;
114e1051a39Sopenharmony_ci
115e1051a39Sopenharmony_ci    /* h = 0, biased */
116e1051a39Sopenharmony_ci#if 0
117e1051a39Sopenharmony_ci    st->h[0].d = TWO(52)*TWO0;
118e1051a39Sopenharmony_ci    st->h[1].d = TWO(52)*TWO32;
119e1051a39Sopenharmony_ci    st->h[2].d = TWO(52)*TWO64;
120e1051a39Sopenharmony_ci    st->h[3].d = TWO(52)*TWO96;
121e1051a39Sopenharmony_ci#else
122e1051a39Sopenharmony_ci    st->h[0].u = EXP(52+0);
123e1051a39Sopenharmony_ci    st->h[1].u = EXP(52+32);
124e1051a39Sopenharmony_ci    st->h[2].u = EXP(52+64);
125e1051a39Sopenharmony_ci    st->h[3].u = EXP(52+96);
126e1051a39Sopenharmony_ci#endif
127e1051a39Sopenharmony_ci
128e1051a39Sopenharmony_ci    if (key) {
129e1051a39Sopenharmony_ci        /*
130e1051a39Sopenharmony_ci         * set "truncate" rounding mode
131e1051a39Sopenharmony_ci         */
132e1051a39Sopenharmony_ci#if defined(__x86_64__)
133e1051a39Sopenharmony_ci        u32 mxcsr_orig;
134e1051a39Sopenharmony_ci
135e1051a39Sopenharmony_ci        asm volatile ("stmxcsr	%0":"=m"(mxcsr_orig));
136e1051a39Sopenharmony_ci        asm volatile ("ldmxcsr	%0"::"m"(mxcsr));
137e1051a39Sopenharmony_ci#elif defined(__PPC__)
138e1051a39Sopenharmony_ci        double fpscr_orig, fpscr = *(double *)&one;
139e1051a39Sopenharmony_ci
140e1051a39Sopenharmony_ci        asm volatile ("mffs	%0":"=f"(fpscr_orig));
141e1051a39Sopenharmony_ci        asm volatile ("mtfsf	255,%0"::"f"(fpscr));
142e1051a39Sopenharmony_ci#elif defined(__s390x__)
143e1051a39Sopenharmony_ci        u32 fpc_orig;
144e1051a39Sopenharmony_ci
145e1051a39Sopenharmony_ci        asm volatile ("stfpc	%0":"=m"(fpc_orig));
146e1051a39Sopenharmony_ci        asm volatile ("lfpc	%0"::"m"(fpc));
147e1051a39Sopenharmony_ci#elif defined(__sparc__)
148e1051a39Sopenharmony_ci        u64 fsr_orig;
149e1051a39Sopenharmony_ci
150e1051a39Sopenharmony_ci        asm volatile ("stx	%%fsr,%0":"=m"(fsr_orig));
151e1051a39Sopenharmony_ci        asm volatile ("ldx	%0,%%fsr"::"m"(fsr));
152e1051a39Sopenharmony_ci#elif defined(__mips__)
153e1051a39Sopenharmony_ci        u32 fcsr_orig;
154e1051a39Sopenharmony_ci
155e1051a39Sopenharmony_ci        asm volatile ("cfc1	%0,$31":"=r"(fcsr_orig));
156e1051a39Sopenharmony_ci        asm volatile ("ctc1	%0,$31"::"r"(fcsr));
157e1051a39Sopenharmony_ci#endif
158e1051a39Sopenharmony_ci
159e1051a39Sopenharmony_ci        /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
160e1051a39Sopenharmony_ci        r0.u = EXP(52+0)  | (U8TOU32(&key[0])  & 0x0fffffff);
161e1051a39Sopenharmony_ci        r1.u = EXP(52+32) | (U8TOU32(&key[4])  & 0x0ffffffc);
162e1051a39Sopenharmony_ci        r2.u = EXP(52+64) | (U8TOU32(&key[8])  & 0x0ffffffc);
163e1051a39Sopenharmony_ci        r3.u = EXP(52+96) | (U8TOU32(&key[12]) & 0x0ffffffc);
164e1051a39Sopenharmony_ci
165e1051a39Sopenharmony_ci        st->r[0] = r0.d - TWO(52)*TWO0;
166e1051a39Sopenharmony_ci        st->r[2] = r1.d - TWO(52)*TWO32;
167e1051a39Sopenharmony_ci        st->r[4] = r2.d - TWO(52)*TWO64;
168e1051a39Sopenharmony_ci        st->r[6] = r3.d - TWO(52)*TWO96;
169e1051a39Sopenharmony_ci
170e1051a39Sopenharmony_ci        st->s[0] = st->r[2] * (5.0/TWO130);
171e1051a39Sopenharmony_ci        st->s[2] = st->r[4] * (5.0/TWO130);
172e1051a39Sopenharmony_ci        st->s[4] = st->r[6] * (5.0/TWO130);
173e1051a39Sopenharmony_ci
174e1051a39Sopenharmony_ci        /*
175e1051a39Sopenharmony_ci         * base 2^32 -> base 2^16
176e1051a39Sopenharmony_ci         */
177e1051a39Sopenharmony_ci        st->r[1] = (st->r[0] + TWO(52)*TWO(16)*TWO0) -
178e1051a39Sopenharmony_ci                               TWO(52)*TWO(16)*TWO0;
179e1051a39Sopenharmony_ci        st->r[0] -= st->r[1];
180e1051a39Sopenharmony_ci
181e1051a39Sopenharmony_ci        st->r[3] = (st->r[2] + TWO(52)*TWO(16)*TWO32) -
182e1051a39Sopenharmony_ci                               TWO(52)*TWO(16)*TWO32;
183e1051a39Sopenharmony_ci        st->r[2] -= st->r[3];
184e1051a39Sopenharmony_ci
185e1051a39Sopenharmony_ci        st->r[5] = (st->r[4] + TWO(52)*TWO(16)*TWO64) -
186e1051a39Sopenharmony_ci                               TWO(52)*TWO(16)*TWO64;
187e1051a39Sopenharmony_ci        st->r[4] -= st->r[5];
188e1051a39Sopenharmony_ci
189e1051a39Sopenharmony_ci        st->r[7] = (st->r[6] + TWO(52)*TWO(16)*TWO96) -
190e1051a39Sopenharmony_ci                               TWO(52)*TWO(16)*TWO96;
191e1051a39Sopenharmony_ci        st->r[6] -= st->r[7];
192e1051a39Sopenharmony_ci
193e1051a39Sopenharmony_ci        st->s[1] = (st->s[0] + TWO(52)*TWO(16)*TWO0/TWO96) -
194e1051a39Sopenharmony_ci                               TWO(52)*TWO(16)*TWO0/TWO96;
195e1051a39Sopenharmony_ci        st->s[0] -= st->s[1];
196e1051a39Sopenharmony_ci
197e1051a39Sopenharmony_ci        st->s[3] = (st->s[2] + TWO(52)*TWO(16)*TWO32/TWO96) -
198e1051a39Sopenharmony_ci                               TWO(52)*TWO(16)*TWO32/TWO96;
199e1051a39Sopenharmony_ci        st->s[2] -= st->s[3];
200e1051a39Sopenharmony_ci
201e1051a39Sopenharmony_ci        st->s[5] = (st->s[4] + TWO(52)*TWO(16)*TWO64/TWO96) -
202e1051a39Sopenharmony_ci                               TWO(52)*TWO(16)*TWO64/TWO96;
203e1051a39Sopenharmony_ci        st->s[4] -= st->s[5];
204e1051a39Sopenharmony_ci
205e1051a39Sopenharmony_ci        /*
206e1051a39Sopenharmony_ci         * restore original FPU control register
207e1051a39Sopenharmony_ci         */
208e1051a39Sopenharmony_ci#if defined(__x86_64__)
209e1051a39Sopenharmony_ci        asm volatile ("ldmxcsr	%0"::"m"(mxcsr_orig));
210e1051a39Sopenharmony_ci#elif defined(__PPC__)
211e1051a39Sopenharmony_ci        asm volatile ("mtfsf	255,%0"::"f"(fpscr_orig));
212e1051a39Sopenharmony_ci#elif defined(__s390x__)
213e1051a39Sopenharmony_ci        asm volatile ("lfpc	%0"::"m"(fpc_orig));
214e1051a39Sopenharmony_ci#elif defined(__sparc__)
215e1051a39Sopenharmony_ci        asm volatile ("ldx	%0,%%fsr"::"m"(fsr_orig));
216e1051a39Sopenharmony_ci#elif defined(__mips__)
217e1051a39Sopenharmony_ci        asm volatile ("ctc1	%0,$31"::"r"(fcsr_orig));
218e1051a39Sopenharmony_ci#endif
219e1051a39Sopenharmony_ci    }
220e1051a39Sopenharmony_ci
221e1051a39Sopenharmony_ci    return 0;
222e1051a39Sopenharmony_ci}
223e1051a39Sopenharmony_ci
224e1051a39Sopenharmony_civoid poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
225e1051a39Sopenharmony_ci                     int padbit)
226e1051a39Sopenharmony_ci{
227e1051a39Sopenharmony_ci    poly1305_internal *st = (poly1305_internal *)ctx;
228e1051a39Sopenharmony_ci    elem64 in0, in1, in2, in3;
229e1051a39Sopenharmony_ci    u64 pad = (u64)padbit<<32;
230e1051a39Sopenharmony_ci
231e1051a39Sopenharmony_ci    double x0, x1, x2, x3;
232e1051a39Sopenharmony_ci    double h0lo, h0hi, h1lo, h1hi, h2lo, h2hi, h3lo, h3hi;
233e1051a39Sopenharmony_ci    double c0lo, c0hi, c1lo, c1hi, c2lo, c2hi, c3lo, c3hi;
234e1051a39Sopenharmony_ci
235e1051a39Sopenharmony_ci    const double r0lo = st->r[0];
236e1051a39Sopenharmony_ci    const double r0hi = st->r[1];
237e1051a39Sopenharmony_ci    const double r1lo = st->r[2];
238e1051a39Sopenharmony_ci    const double r1hi = st->r[3];
239e1051a39Sopenharmony_ci    const double r2lo = st->r[4];
240e1051a39Sopenharmony_ci    const double r2hi = st->r[5];
241e1051a39Sopenharmony_ci    const double r3lo = st->r[6];
242e1051a39Sopenharmony_ci    const double r3hi = st->r[7];
243e1051a39Sopenharmony_ci
244e1051a39Sopenharmony_ci    const double s1lo = st->s[0];
245e1051a39Sopenharmony_ci    const double s1hi = st->s[1];
246e1051a39Sopenharmony_ci    const double s2lo = st->s[2];
247e1051a39Sopenharmony_ci    const double s2hi = st->s[3];
248e1051a39Sopenharmony_ci    const double s3lo = st->s[4];
249e1051a39Sopenharmony_ci    const double s3hi = st->s[5];
250e1051a39Sopenharmony_ci
251e1051a39Sopenharmony_ci    /*
252e1051a39Sopenharmony_ci     * set "truncate" rounding mode
253e1051a39Sopenharmony_ci     */
254e1051a39Sopenharmony_ci#if defined(__x86_64__)
255e1051a39Sopenharmony_ci    u32 mxcsr_orig;
256e1051a39Sopenharmony_ci
257e1051a39Sopenharmony_ci    asm volatile ("stmxcsr	%0":"=m"(mxcsr_orig));
258e1051a39Sopenharmony_ci    asm volatile ("ldmxcsr	%0"::"m"(mxcsr));
259e1051a39Sopenharmony_ci#elif defined(__PPC__)
260e1051a39Sopenharmony_ci    double fpscr_orig, fpscr = *(double *)&one;
261e1051a39Sopenharmony_ci
262e1051a39Sopenharmony_ci    asm volatile ("mffs		%0":"=f"(fpscr_orig));
263e1051a39Sopenharmony_ci    asm volatile ("mtfsf	255,%0"::"f"(fpscr));
264e1051a39Sopenharmony_ci#elif defined(__s390x__)
265e1051a39Sopenharmony_ci    u32 fpc_orig;
266e1051a39Sopenharmony_ci
267e1051a39Sopenharmony_ci    asm volatile ("stfpc	%0":"=m"(fpc_orig));
268e1051a39Sopenharmony_ci    asm volatile ("lfpc		%0"::"m"(fpc));
269e1051a39Sopenharmony_ci#elif defined(__sparc__)
270e1051a39Sopenharmony_ci    u64 fsr_orig;
271e1051a39Sopenharmony_ci
272e1051a39Sopenharmony_ci    asm volatile ("stx		%%fsr,%0":"=m"(fsr_orig));
273e1051a39Sopenharmony_ci    asm volatile ("ldx		%0,%%fsr"::"m"(fsr));
274e1051a39Sopenharmony_ci#elif defined(__mips__)
275e1051a39Sopenharmony_ci    u32 fcsr_orig;
276e1051a39Sopenharmony_ci
277e1051a39Sopenharmony_ci    asm volatile ("cfc1		%0,$31":"=r"(fcsr_orig));
278e1051a39Sopenharmony_ci    asm volatile ("ctc1		%0,$31"::"r"(fcsr));
279e1051a39Sopenharmony_ci#endif
280e1051a39Sopenharmony_ci
281e1051a39Sopenharmony_ci    /*
282e1051a39Sopenharmony_ci     * load base 2^32 and de-bias
283e1051a39Sopenharmony_ci     */
284e1051a39Sopenharmony_ci    h0lo = st->h[0].d - TWO(52)*TWO0;
285e1051a39Sopenharmony_ci    h1lo = st->h[1].d - TWO(52)*TWO32;
286e1051a39Sopenharmony_ci    h2lo = st->h[2].d - TWO(52)*TWO64;
287e1051a39Sopenharmony_ci    h3lo = st->h[3].d - TWO(52)*TWO96;
288e1051a39Sopenharmony_ci
289e1051a39Sopenharmony_ci#ifdef __clang__
290e1051a39Sopenharmony_ci    h0hi = 0;
291e1051a39Sopenharmony_ci    h1hi = 0;
292e1051a39Sopenharmony_ci    h2hi = 0;
293e1051a39Sopenharmony_ci    h3hi = 0;
294e1051a39Sopenharmony_ci#else
295e1051a39Sopenharmony_ci    in0.u = EXP(52+0)  | U8TOU32(&inp[0]);
296e1051a39Sopenharmony_ci    in1.u = EXP(52+32) | U8TOU32(&inp[4]);
297e1051a39Sopenharmony_ci    in2.u = EXP(52+64) | U8TOU32(&inp[8]);
298e1051a39Sopenharmony_ci    in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;
299e1051a39Sopenharmony_ci
300e1051a39Sopenharmony_ci    x0 = in0.d - TWO(52)*TWO0;
301e1051a39Sopenharmony_ci    x1 = in1.d - TWO(52)*TWO32;
302e1051a39Sopenharmony_ci    x2 = in2.d - TWO(52)*TWO64;
303e1051a39Sopenharmony_ci    x3 = in3.d - TWO(52)*TWO96;
304e1051a39Sopenharmony_ci
305e1051a39Sopenharmony_ci    x0 += h0lo;
306e1051a39Sopenharmony_ci    x1 += h1lo;
307e1051a39Sopenharmony_ci    x2 += h2lo;
308e1051a39Sopenharmony_ci    x3 += h3lo;
309e1051a39Sopenharmony_ci
310e1051a39Sopenharmony_ci    goto fast_entry;
311e1051a39Sopenharmony_ci#endif
312e1051a39Sopenharmony_ci
313e1051a39Sopenharmony_ci    do {
314e1051a39Sopenharmony_ci        in0.u = EXP(52+0)  | U8TOU32(&inp[0]);
315e1051a39Sopenharmony_ci        in1.u = EXP(52+32) | U8TOU32(&inp[4]);
316e1051a39Sopenharmony_ci        in2.u = EXP(52+64) | U8TOU32(&inp[8]);
317e1051a39Sopenharmony_ci        in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;
318e1051a39Sopenharmony_ci
319e1051a39Sopenharmony_ci        x0 = in0.d - TWO(52)*TWO0;
320e1051a39Sopenharmony_ci        x1 = in1.d - TWO(52)*TWO32;
321e1051a39Sopenharmony_ci        x2 = in2.d - TWO(52)*TWO64;
322e1051a39Sopenharmony_ci        x3 = in3.d - TWO(52)*TWO96;
323e1051a39Sopenharmony_ci
324e1051a39Sopenharmony_ci        /*
325e1051a39Sopenharmony_ci         * note that there are multiple ways to accumulate input, e.g.
326e1051a39Sopenharmony_ci         * one can as well accumulate to h0lo-h1lo-h1hi-h2hi...
327e1051a39Sopenharmony_ci         */
328e1051a39Sopenharmony_ci        h0lo += x0;
329e1051a39Sopenharmony_ci        h0hi += x1;
330e1051a39Sopenharmony_ci        h2lo += x2;
331e1051a39Sopenharmony_ci        h2hi += x3;
332e1051a39Sopenharmony_ci
333e1051a39Sopenharmony_ci        /*
334e1051a39Sopenharmony_ci         * carries that cross 32n-bit (and 130-bit) boundaries
335e1051a39Sopenharmony_ci         */
336e1051a39Sopenharmony_ci        c0lo = (h0lo + TWO(52)*TWO32)  - TWO(52)*TWO32;
337e1051a39Sopenharmony_ci        c1lo = (h1lo + TWO(52)*TWO64)  - TWO(52)*TWO64;
338e1051a39Sopenharmony_ci        c2lo = (h2lo + TWO(52)*TWO96)  - TWO(52)*TWO96;
339e1051a39Sopenharmony_ci        c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;
340e1051a39Sopenharmony_ci
341e1051a39Sopenharmony_ci        c0hi = (h0hi + TWO(52)*TWO32)  - TWO(52)*TWO32;
342e1051a39Sopenharmony_ci        c1hi = (h1hi + TWO(52)*TWO64)  - TWO(52)*TWO64;
343e1051a39Sopenharmony_ci        c2hi = (h2hi + TWO(52)*TWO96)  - TWO(52)*TWO96;
344e1051a39Sopenharmony_ci        c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;
345e1051a39Sopenharmony_ci
346e1051a39Sopenharmony_ci        /*
347e1051a39Sopenharmony_ci         * base 2^48 -> base 2^32 with last reduction step
348e1051a39Sopenharmony_ci         */
349e1051a39Sopenharmony_ci        x1 =  (h1lo - c1lo) + c0lo;
350e1051a39Sopenharmony_ci        x2 =  (h2lo - c2lo) + c1lo;
351e1051a39Sopenharmony_ci        x3 =  (h3lo - c3lo) + c2lo;
352e1051a39Sopenharmony_ci        x0 =  (h0lo - c0lo) + c3lo * (5.0/TWO130);
353e1051a39Sopenharmony_ci
354e1051a39Sopenharmony_ci        x1 += (h1hi - c1hi) + c0hi;
355e1051a39Sopenharmony_ci        x2 += (h2hi - c2hi) + c1hi;
356e1051a39Sopenharmony_ci        x3 += (h3hi - c3hi) + c2hi;
357e1051a39Sopenharmony_ci        x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);
358e1051a39Sopenharmony_ci
359e1051a39Sopenharmony_ci#ifndef __clang__
360e1051a39Sopenharmony_ci    fast_entry:
361e1051a39Sopenharmony_ci#endif
362e1051a39Sopenharmony_ci        /*
363e1051a39Sopenharmony_ci         * base 2^32 * base 2^16 = base 2^48
364e1051a39Sopenharmony_ci         */
365e1051a39Sopenharmony_ci        h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0;
366e1051a39Sopenharmony_ci        h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0;
367e1051a39Sopenharmony_ci        h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0;
368e1051a39Sopenharmony_ci        h3lo = r2lo * x1 + r1lo * x2 + r0lo * x3 + r3lo * x0;
369e1051a39Sopenharmony_ci
370e1051a39Sopenharmony_ci        h0hi = s3hi * x1 + s2hi * x2 + s1hi * x3 + r0hi * x0;
371e1051a39Sopenharmony_ci        h1hi = r0hi * x1 + s3hi * x2 + s2hi * x3 + r1hi * x0;
372e1051a39Sopenharmony_ci        h2hi = r1hi * x1 + r0hi * x2 + s3hi * x3 + r2hi * x0;
373e1051a39Sopenharmony_ci        h3hi = r2hi * x1 + r1hi * x2 + r0hi * x3 + r3hi * x0;
374e1051a39Sopenharmony_ci
375e1051a39Sopenharmony_ci        inp += 16;
376e1051a39Sopenharmony_ci        len -= 16;
377e1051a39Sopenharmony_ci
378e1051a39Sopenharmony_ci    } while (len >= 16);
379e1051a39Sopenharmony_ci
380e1051a39Sopenharmony_ci    /*
381e1051a39Sopenharmony_ci     * carries that cross 32n-bit (and 130-bit) boundaries
382e1051a39Sopenharmony_ci     */
383e1051a39Sopenharmony_ci    c0lo = (h0lo + TWO(52)*TWO32)  - TWO(52)*TWO32;
384e1051a39Sopenharmony_ci    c1lo = (h1lo + TWO(52)*TWO64)  - TWO(52)*TWO64;
385e1051a39Sopenharmony_ci    c2lo = (h2lo + TWO(52)*TWO96)  - TWO(52)*TWO96;
386e1051a39Sopenharmony_ci    c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;
387e1051a39Sopenharmony_ci
388e1051a39Sopenharmony_ci    c0hi = (h0hi + TWO(52)*TWO32)  - TWO(52)*TWO32;
389e1051a39Sopenharmony_ci    c1hi = (h1hi + TWO(52)*TWO64)  - TWO(52)*TWO64;
390e1051a39Sopenharmony_ci    c2hi = (h2hi + TWO(52)*TWO96)  - TWO(52)*TWO96;
391e1051a39Sopenharmony_ci    c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;
392e1051a39Sopenharmony_ci
393e1051a39Sopenharmony_ci    /*
394e1051a39Sopenharmony_ci     * base 2^48 -> base 2^32 with last reduction step
395e1051a39Sopenharmony_ci     */
396e1051a39Sopenharmony_ci    x1 =  (h1lo - c1lo) + c0lo;
397e1051a39Sopenharmony_ci    x2 =  (h2lo - c2lo) + c1lo;
398e1051a39Sopenharmony_ci    x3 =  (h3lo - c3lo) + c2lo;
399e1051a39Sopenharmony_ci    x0 =  (h0lo - c0lo) + c3lo * (5.0/TWO130);
400e1051a39Sopenharmony_ci
401e1051a39Sopenharmony_ci    x1 += (h1hi - c1hi) + c0hi;
402e1051a39Sopenharmony_ci    x2 += (h2hi - c2hi) + c1hi;
403e1051a39Sopenharmony_ci    x3 += (h3hi - c3hi) + c2hi;
404e1051a39Sopenharmony_ci    x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);
405e1051a39Sopenharmony_ci
406e1051a39Sopenharmony_ci    /*
407e1051a39Sopenharmony_ci     * store base 2^32, with bias
408e1051a39Sopenharmony_ci     */
409e1051a39Sopenharmony_ci    st->h[1].d = x1 + TWO(52)*TWO32;
410e1051a39Sopenharmony_ci    st->h[2].d = x2 + TWO(52)*TWO64;
411e1051a39Sopenharmony_ci    st->h[3].d = x3 + TWO(52)*TWO96;
412e1051a39Sopenharmony_ci    st->h[0].d = x0 + TWO(52)*TWO0;
413e1051a39Sopenharmony_ci
414e1051a39Sopenharmony_ci    /*
415e1051a39Sopenharmony_ci     * restore original FPU control register
416e1051a39Sopenharmony_ci     */
417e1051a39Sopenharmony_ci#if defined(__x86_64__)
418e1051a39Sopenharmony_ci    asm volatile ("ldmxcsr	%0"::"m"(mxcsr_orig));
419e1051a39Sopenharmony_ci#elif defined(__PPC__)
420e1051a39Sopenharmony_ci    asm volatile ("mtfsf	255,%0"::"f"(fpscr_orig));
421e1051a39Sopenharmony_ci#elif defined(__s390x__)
422e1051a39Sopenharmony_ci    asm volatile ("lfpc		%0"::"m"(fpc_orig));
423e1051a39Sopenharmony_ci#elif defined(__sparc__)
424e1051a39Sopenharmony_ci    asm volatile ("ldx		%0,%%fsr"::"m"(fsr_orig));
425e1051a39Sopenharmony_ci#elif defined(__mips__)
426e1051a39Sopenharmony_ci    asm volatile ("ctc1		%0,$31"::"r"(fcsr_orig));
427e1051a39Sopenharmony_ci#endif
428e1051a39Sopenharmony_ci}
429e1051a39Sopenharmony_ci
430e1051a39Sopenharmony_civoid poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4])
431e1051a39Sopenharmony_ci{
432e1051a39Sopenharmony_ci    poly1305_internal *st = (poly1305_internal *) ctx;
433e1051a39Sopenharmony_ci    u64 h0, h1, h2, h3, h4;
434e1051a39Sopenharmony_ci    u32 g0, g1, g2, g3, g4;
435e1051a39Sopenharmony_ci    u64 t;
436e1051a39Sopenharmony_ci    u32 mask;
437e1051a39Sopenharmony_ci
438e1051a39Sopenharmony_ci    /*
439e1051a39Sopenharmony_ci     * thanks to bias masking exponent gives integer result
440e1051a39Sopenharmony_ci     */
441e1051a39Sopenharmony_ci    h0 = st->h[0].u & 0x000fffffffffffffULL;
442e1051a39Sopenharmony_ci    h1 = st->h[1].u & 0x000fffffffffffffULL;
443e1051a39Sopenharmony_ci    h2 = st->h[2].u & 0x000fffffffffffffULL;
444e1051a39Sopenharmony_ci    h3 = st->h[3].u & 0x000fffffffffffffULL;
445e1051a39Sopenharmony_ci
446e1051a39Sopenharmony_ci    /*
447e1051a39Sopenharmony_ci     * can be partially reduced, so reduce...
448e1051a39Sopenharmony_ci     */
449e1051a39Sopenharmony_ci    h4 = h3>>32; h3 &= 0xffffffffU;
450e1051a39Sopenharmony_ci    g4 = h4&-4;
451e1051a39Sopenharmony_ci    h4 &= 3;
452e1051a39Sopenharmony_ci    g4 += g4>>2;
453e1051a39Sopenharmony_ci
454e1051a39Sopenharmony_ci    h0 += g4;
455e1051a39Sopenharmony_ci    h1 += h0>>32; h0 &= 0xffffffffU;
456e1051a39Sopenharmony_ci    h2 += h1>>32; h1 &= 0xffffffffU;
457e1051a39Sopenharmony_ci    h3 += h2>>32; h2 &= 0xffffffffU;
458e1051a39Sopenharmony_ci
459e1051a39Sopenharmony_ci    /* compute h + -p */
460e1051a39Sopenharmony_ci    g0 = (u32)(t = h0 + 5);
461e1051a39Sopenharmony_ci    g1 = (u32)(t = h1 + (t >> 32));
462e1051a39Sopenharmony_ci    g2 = (u32)(t = h2 + (t >> 32));
463e1051a39Sopenharmony_ci    g3 = (u32)(t = h3 + (t >> 32));
464e1051a39Sopenharmony_ci    g4 = h4 + (u32)(t >> 32);
465e1051a39Sopenharmony_ci
466e1051a39Sopenharmony_ci    /* if there was carry, select g0-g3 */
467e1051a39Sopenharmony_ci    mask = 0 - (g4 >> 2);
468e1051a39Sopenharmony_ci    g0 &= mask;
469e1051a39Sopenharmony_ci    g1 &= mask;
470e1051a39Sopenharmony_ci    g2 &= mask;
471e1051a39Sopenharmony_ci    g3 &= mask;
472e1051a39Sopenharmony_ci    mask = ~mask;
473e1051a39Sopenharmony_ci    g0 |= (h0 & mask);
474e1051a39Sopenharmony_ci    g1 |= (h1 & mask);
475e1051a39Sopenharmony_ci    g2 |= (h2 & mask);
476e1051a39Sopenharmony_ci    g3 |= (h3 & mask);
477e1051a39Sopenharmony_ci
478e1051a39Sopenharmony_ci    /* mac = (h + nonce) % (2^128) */
479e1051a39Sopenharmony_ci    g0 = (u32)(t = (u64)g0 + nonce[0]);
480e1051a39Sopenharmony_ci    g1 = (u32)(t = (u64)g1 + (t >> 32) + nonce[1]);
481e1051a39Sopenharmony_ci    g2 = (u32)(t = (u64)g2 + (t >> 32) + nonce[2]);
482e1051a39Sopenharmony_ci    g3 = (u32)(t = (u64)g3 + (t >> 32) + nonce[3]);
483e1051a39Sopenharmony_ci
484e1051a39Sopenharmony_ci    U32TO8(mac + 0, g0);
485e1051a39Sopenharmony_ci    U32TO8(mac + 4, g1);
486e1051a39Sopenharmony_ci    U32TO8(mac + 8, g2);
487e1051a39Sopenharmony_ci    U32TO8(mac + 12, g3);
488e1051a39Sopenharmony_ci}
489