/*
 * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

/*
 * This module is meant to be used as a template for non-x87 floating-
 * point assembly modules. The template itself is x86_64-specific
 * though, as it was debugged on x86_64. So an implementor would
 * have to recognize the platform-specific parts, UxTOy and inline asm,
 * and act accordingly.
 *
 * Huh? x86_64-specific code as template for non-x87? Note seven, which
 * is not a typo, but a reference to 80-bit precision. This module on the
 * other hand relies on 64-bit precision operations, which are the default
 * for x86_64 code. And since we are at it, just for a sense of it,
 * large-block performance in cycles per processed byte for *this* code
 * is:
 *                      gcc-4.8         icc-15.0        clang-3.4(*)
 *
 * Westmere             4.96            5.09            4.37
 * Sandy Bridge         4.95            4.90            4.17
 * Haswell              4.92            4.87            3.78
 * Bulldozer            4.67            4.49            4.68
 * VIA Nano             7.07            7.05            5.98
 * Silvermont           10.6            9.61            12.6
 *
 * (*)  clang managed to discover parallelism and deployed SIMD;
 *
 * And for a range of other platforms with unspecified gcc versions:
 *
 * Freescale e300       12.5
 * PPC74x0              10.8
 * POWER6               4.92
 * POWER7               4.50
 * POWER8               4.10
 *
 * z10                  11.2
 * z196+                7.30
 *
 * UltraSPARC III       16.0
 * SPARC T4             16.1
 */

#if !(defined(__GNUC__) && __GNUC__>=2)
# error "this is gcc-specific template"
#endif

#include <stdlib.h>

typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
/* A double and its raw 64-bit pattern; used to build/inspect biased values. */
typedef union { double d; u64 u; } elem64;

#define TWO(p)          ((double)(1ULL<<(p)))
#define TWO0            TWO(0)
#define TWO32           TWO(32)
#define TWO64           (TWO32*TWO(32))
#define TWO96           (TWO64*TWO(32))
#define TWO130          (TWO96*TWO(34))

/* Bit pattern of the double 2^p: biased exponent placed above the mantissa. */
#define EXP(p)          ((1023ULL+(p))<<52)

/*
 * Little-endian 32-bit load/store helpers. Inline assembly is spelled
 * __asm__ rather than asm, because the plain keyword is not recognized
 * when compiling in strict ISO modes (-std=c99, -std=c11).
 */
#if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__))
# define U8TOU32(p)     (*(const u32 *)(p))
# define U32TO8(p,v)    (*(u32 *)(p) = (v))
#elif defined(__PPC__)
# define U8TOU32(p)     ({u32 ret; __asm__ ("lwbrx	%0,0,%1":"=r"(ret):"b"(p)); ret; })
# define U32TO8(p,v)    __asm__ ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory")
#elif defined(__s390x__)
# define U8TOU32(p)     ({u32 ret; __asm__ ("lrv	%0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; })
# define U32TO8(p,v)    __asm__ ("strv	%1,%0":"=m"(*(u32 *)(p)):"d"(v))
#endif

#ifndef U8TOU32
# define U8TOU32(p)     ((u32)(p)[0]     | (u32)(p)[1]<<8 |     \
                         (u32)(p)[2]<<16 | (u32)(p)[3]<<24  )
#endif
#ifndef U32TO8
# define U32TO8(p,v)    ((p)[0] = (u8)(v),       (p)[1] = (u8)((v)>>8),         \
                         (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24)  )
#endif

/*
 * Internal state, overlaid on the caller-supplied opaque context:
 *   h[] - accumulator, four base 2^32 limbs stored as biased doubles;
 *   r[] - clamped key, each 32-bit limb split into a low 16-bit part
 *         (even index) and the remaining high part (odd index);
 *   s[] - r[1..3] pre-multiplied by 5/2^130, split the same way.
 */
typedef struct {
    elem64 h[4];
    double r[8];
    double s[6];
} poly1305_internal;

/* "round toward zero (truncate), mask all exceptions" */
#if defined(__x86_64__)
static const u32 mxcsr = 0x7f80;
#elif defined(__PPC__)
static const u64 one = 1;
#elif defined(__s390x__)
static const u32 fpc = 1;
#elif defined(__sparc__)
static const u64 fsr = 1ULL<<30;
#elif defined(__mips__)
static const u32 fcsr = 1;
#else
#error "unrecognized platform"
#endif

/*
 * Reset the accumulator to (biased) zero and, when |key| is non-NULL,
 * load and clamp the 16-byte multiplier r and precompute the r and s
 * tables consumed by poly1305_blocks().
 *
 * ctx - storage that is used as a poly1305_internal;
 * key - 16 bytes of r in little-endian order, or NULL to only reset h.
 *
 * Always returns 0. The FPU control register is switched to truncation
 * while the tables are computed and restored before returning.
 */
int poly1305_init(void *ctx, const unsigned char key[16])
{
    poly1305_internal *st = (poly1305_internal *) ctx;
    elem64 r0, r1, r2, r3;

    /* h = 0, biased */
#if 0
    st->h[0].d = TWO(52)*TWO0;
    st->h[1].d = TWO(52)*TWO32;
    st->h[2].d = TWO(52)*TWO64;
    st->h[3].d = TWO(52)*TWO96;
#else
    st->h[0].u = EXP(52+0);
    st->h[1].u = EXP(52+32);
    st->h[2].u = EXP(52+64);
    st->h[3].u = EXP(52+96);
#endif

    if (key) {
        /*
         * set "truncate" rounding mode
         */
#if defined(__x86_64__)
        u32 mxcsr_orig;

        __asm__ volatile ("stmxcsr	%0":"=m"(mxcsr_orig));
        __asm__ volatile ("ldmxcsr	%0"::"m"(mxcsr));
#elif defined(__PPC__)
        double fpscr_orig, fpscr = *(double *)&one;

        __asm__ volatile ("mffs	%0":"=f"(fpscr_orig));
        __asm__ volatile ("mtfsf	255,%0"::"f"(fpscr));
#elif defined(__s390x__)
        u32 fpc_orig;

        __asm__ volatile ("stfpc	%0":"=m"(fpc_orig));
        __asm__ volatile ("lfpc	%0"::"m"(fpc));
#elif defined(__sparc__)
        u64 fsr_orig;

        __asm__ volatile ("stx	%%fsr,%0":"=m"(fsr_orig));
        __asm__ volatile ("ldx	%0,%%fsr"::"m"(fsr));
#elif defined(__mips__)
        u32 fcsr_orig;

        __asm__ volatile ("cfc1	%0,$31":"=r"(fcsr_orig));
        __asm__ volatile ("ctc1	%0,$31"::"r"(fcsr));
#endif

        /* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
        r0.u = EXP(52+0)  | (U8TOU32(&key[0])  & 0x0fffffff);
        r1.u = EXP(52+32) | (U8TOU32(&key[4])  & 0x0ffffffc);
        r2.u = EXP(52+64) | (U8TOU32(&key[8])  & 0x0ffffffc);
        r3.u = EXP(52+96) | (U8TOU32(&key[12]) & 0x0ffffffc);

        /* de-bias: subtracting 2^(52+n) leaves the limb scaled by 2^n */
        st->r[0] = r0.d - TWO(52)*TWO0;
        st->r[2] = r1.d - TWO(52)*TWO32;
        st->r[4] = r2.d - TWO(52)*TWO64;
        st->r[6] = r3.d - TWO(52)*TWO96;

        st->s[0] = st->r[2] * (5.0/TWO130);
        st->s[2] = st->r[4] * (5.0/TWO130);
        st->s[4] = st->r[6] * (5.0/TWO130);

        /*
         * base 2^32 -> base 2^16
         *
         * With truncation in effect, adding and then subtracting
         * 2^(52+16+n) discards the low 16 bits of a limb, leaving the
         * high part in the odd slot; the even slot keeps the remainder.
         */
        st->r[1] = (st->r[0] + TWO(52)*TWO(16)*TWO0) -
                               TWO(52)*TWO(16)*TWO0;
        st->r[0] -= st->r[1];

        st->r[3] = (st->r[2] + TWO(52)*TWO(16)*TWO32) -
                               TWO(52)*TWO(16)*TWO32;
        st->r[2] -= st->r[3];

        st->r[5] = (st->r[4] + TWO(52)*TWO(16)*TWO64) -
                               TWO(52)*TWO(16)*TWO64;
        st->r[4] -= st->r[5];

        st->r[7] = (st->r[6] + TWO(52)*TWO(16)*TWO96) -
                               TWO(52)*TWO(16)*TWO96;
        st->r[6] -= st->r[7];

        st->s[1] = (st->s[0] + TWO(52)*TWO(16)*TWO0/TWO96) -
                               TWO(52)*TWO(16)*TWO0/TWO96;
        st->s[0] -= st->s[1];

        st->s[3] = (st->s[2] + TWO(52)*TWO(16)*TWO32/TWO96) -
                               TWO(52)*TWO(16)*TWO32/TWO96;
        st->s[2] -= st->s[3];

        st->s[5] = (st->s[4] + TWO(52)*TWO(16)*TWO64/TWO96) -
                               TWO(52)*TWO(16)*TWO64/TWO96;
        st->s[4] -= st->s[5];

        /*
         * restore original FPU control register
         */
#if defined(__x86_64__)
        __asm__ volatile ("ldmxcsr	%0"::"m"(mxcsr_orig));
#elif defined(__PPC__)
        __asm__ volatile ("mtfsf	255,%0"::"f"(fpscr_orig));
#elif defined(__s390x__)
        __asm__ volatile ("lfpc	%0"::"m"(fpc_orig));
#elif defined(__sparc__)
        __asm__ volatile ("ldx	%0,%%fsr"::"m"(fsr_orig));
#elif defined(__mips__)
        __asm__ volatile ("ctc1	%0,$31"::"r"(fcsr_orig));
#endif
    }

    return 0;
}

/*
 * Absorb whole 16-byte blocks of |inp| into the accumulator.
 *
 * ctx    - state previously set up by poly1305_init();
 * inp    - input data, read 16 bytes at a time;
 * len    - number of bytes available; trailing bytes beyond the last
 *          whole block are ignored, and nothing at all is done when
 *          fewer than 16 bytes are supplied;
 * padbit - OR-ed in as bit 128 of every block (see |pad| below).
 *
 * The FPU control register is switched to truncation for the duration
 * of the call and restored before returning.
 */
void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
                     int padbit)
{
    poly1305_internal *st = (poly1305_internal *)ctx;
    elem64 in0, in1, in2, in3;
    u64 pad = (u64)padbit<<32;

    double x0, x1, x2, x3;
    double h0lo, h0hi, h1lo, h1hi, h2lo, h2hi, h3lo, h3hi;
    double c0lo, c0hi, c1lo, c1hi, c2lo, c2hi, c3lo, c3hi;

    const double r0lo = st->r[0];
    const double r0hi = st->r[1];
    const double r1lo = st->r[2];
    const double r1hi = st->r[3];
    const double r2lo = st->r[4];
    const double r2hi = st->r[5];
    const double r3lo = st->r[6];
    const double r3hi = st->r[7];

    const double s1lo = st->s[0];
    const double s1hi = st->s[1];
    const double s2lo = st->s[2];
    const double s2hi = st->s[3];
    const double s3lo = st->s[4];
    const double s3hi = st->s[5];

    /*
     * The loop below is a do/while that always consumes one block before
     * testing |len|. Without this check a short input would be read out
     * of bounds and "len -= 16" would wrap around.
     */
    if (len < 16)
        return;

    /*
     * set "truncate" rounding mode
     */
#if defined(__x86_64__)
    u32 mxcsr_orig;

    __asm__ volatile ("stmxcsr	%0":"=m"(mxcsr_orig));
    __asm__ volatile ("ldmxcsr	%0"::"m"(mxcsr));
#elif defined(__PPC__)
    double fpscr_orig, fpscr = *(double *)&one;

    __asm__ volatile ("mffs	%0":"=f"(fpscr_orig));
    __asm__ volatile ("mtfsf	255,%0"::"f"(fpscr));
#elif defined(__s390x__)
    u32 fpc_orig;

    __asm__ volatile ("stfpc	%0":"=m"(fpc_orig));
    __asm__ volatile ("lfpc	%0"::"m"(fpc));
#elif defined(__sparc__)
    u64 fsr_orig;

    __asm__ volatile ("stx	%%fsr,%0":"=m"(fsr_orig));
    __asm__ volatile ("ldx	%0,%%fsr"::"m"(fsr));
#elif defined(__mips__)
    u32 fcsr_orig;

    __asm__ volatile ("cfc1	%0,$31":"=r"(fcsr_orig));
    __asm__ volatile ("ctc1	%0,$31"::"r"(fcsr));
#endif

    /*
     * load base 2^32 and de-bias
     */
    h0lo = st->h[0].d - TWO(52)*TWO0;
    h1lo = st->h[1].d - TWO(52)*TWO32;
    h2lo = st->h[2].d - TWO(52)*TWO64;
    h3lo = st->h[3].d - TWO(52)*TWO96;

#ifdef __clang__
    h0hi = 0;
    h1hi = 0;
    h2hi = 0;
    h3hi = 0;
#else
    /*
     * First block: the state is already in base 2^32, so add the input
     * directly and jump past the carry/reduction step of the loop.
     */
    in0.u = EXP(52+0)  | U8TOU32(&inp[0]);
    in1.u = EXP(52+32) | U8TOU32(&inp[4]);
    in2.u = EXP(52+64) | U8TOU32(&inp[8]);
    in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;

    x0 = in0.d - TWO(52)*TWO0;
    x1 = in1.d - TWO(52)*TWO32;
    x2 = in2.d - TWO(52)*TWO64;
    x3 = in3.d - TWO(52)*TWO96;

    x0 += h0lo;
    x1 += h1lo;
    x2 += h2lo;
    x3 += h3lo;

    goto fast_entry;
#endif

    do {
        in0.u = EXP(52+0)  | U8TOU32(&inp[0]);
        in1.u = EXP(52+32) | U8TOU32(&inp[4]);
        in2.u = EXP(52+64) | U8TOU32(&inp[8]);
        in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;

        x0 = in0.d - TWO(52)*TWO0;
        x1 = in1.d - TWO(52)*TWO32;
        x2 = in2.d - TWO(52)*TWO64;
        x3 = in3.d - TWO(52)*TWO96;

        /*
         * note that there are multiple ways to accumulate input, e.g.
         * one can as well accumulate to h0lo-h1lo-h1hi-h2hi...
         */
        h0lo += x0;
        h0hi += x1;
        h2lo += x2;
        h2hi += x3;

        /*
         * carries that cross 32n-bit (and 130-bit) boundaries
         */
        c0lo = (h0lo + TWO(52)*TWO32)  - TWO(52)*TWO32;
        c1lo = (h1lo + TWO(52)*TWO64)  - TWO(52)*TWO64;
        c2lo = (h2lo + TWO(52)*TWO96)  - TWO(52)*TWO96;
        c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;

        c0hi = (h0hi + TWO(52)*TWO32)  - TWO(52)*TWO32;
        c1hi = (h1hi + TWO(52)*TWO64)  - TWO(52)*TWO64;
        c2hi = (h2hi + TWO(52)*TWO96)  - TWO(52)*TWO96;
        c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;

        /*
         * base 2^48 -> base 2^32 with last reduction step
         */
        x1 =  (h1lo - c1lo) + c0lo;
        x2 =  (h2lo - c2lo) + c1lo;
        x3 =  (h3lo - c3lo) + c2lo;
        x0 =  (h0lo - c0lo) + c3lo * (5.0/TWO130);

        x1 += (h1hi - c1hi) + c0hi;
        x2 += (h2hi - c2hi) + c1hi;
        x3 += (h3hi - c3hi) + c2hi;
        x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);

#ifndef __clang__
    fast_entry:
#endif
        /*
         * base 2^32 * base 2^16 = base 2^48
         */
        h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0;
        h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0;
        h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0;
        h3lo = r2lo * x1 + r1lo * x2 + r0lo * x3 + r3lo * x0;

        h0hi = s3hi * x1 + s2hi * x2 + s1hi * x3 + r0hi * x0;
        h1hi = r0hi * x1 + s3hi * x2 + s2hi * x3 + r1hi * x0;
        h2hi = r1hi * x1 + r0hi * x2 + s3hi * x3 + r2hi * x0;
        h3hi = r2hi * x1 + r1hi * x2 + r0hi * x3 + r3hi * x0;

        inp += 16;
        len -= 16;

    } while (len >= 16);

    /*
     * carries that cross 32n-bit (and 130-bit) boundaries
     */
    c0lo = (h0lo + TWO(52)*TWO32)  - TWO(52)*TWO32;
    c1lo = (h1lo + TWO(52)*TWO64)  - TWO(52)*TWO64;
    c2lo = (h2lo + TWO(52)*TWO96)  - TWO(52)*TWO96;
    c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;

    c0hi = (h0hi + TWO(52)*TWO32)  - TWO(52)*TWO32;
    c1hi = (h1hi + TWO(52)*TWO64)  - TWO(52)*TWO64;
    c2hi = (h2hi + TWO(52)*TWO96)  - TWO(52)*TWO96;
    c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;

    /*
     * base 2^48 -> base 2^32 with last reduction step
     */
    x1 =  (h1lo - c1lo) + c0lo;
    x2 =  (h2lo - c2lo) + c1lo;
    x3 =  (h3lo - c3lo) + c2lo;
    x0 =  (h0lo - c0lo) + c3lo * (5.0/TWO130);

    x1 += (h1hi - c1hi) + c0hi;
    x2 += (h2hi - c2hi) + c1hi;
    x3 += (h3hi - c3hi) + c2hi;
    x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);

    /*
     * store base 2^32, with bias
     */
    st->h[1].d = x1 + TWO(52)*TWO32;
    st->h[2].d = x2 + TWO(52)*TWO64;
    st->h[3].d = x3 + TWO(52)*TWO96;
    st->h[0].d = x0 + TWO(52)*TWO0;

    /*
     * restore original FPU control register
     */
#if defined(__x86_64__)
    __asm__ volatile ("ldmxcsr	%0"::"m"(mxcsr_orig));
#elif defined(__PPC__)
    __asm__ volatile ("mtfsf	255,%0"::"f"(fpscr_orig));
#elif defined(__s390x__)
    __asm__ volatile ("lfpc	%0"::"m"(fpc_orig));
#elif defined(__sparc__)
    __asm__ volatile ("ldx	%0,%%fsr"::"m"(fsr_orig));
#elif defined(__mips__)
    __asm__ volatile ("ctc1	%0,$31"::"r"(fcsr_orig));
#endif
}

/*
 * Finish the computation: fully reduce the accumulator, add |nonce|
 * modulo 2^128 and write the 16-byte result to |mac| via U32TO8.
 *
 * ctx   - state updated by poly1305_blocks();
 * mac   - receives the 16-byte tag;
 * nonce - four 32-bit words added to the reduced accumulator, nonce[0]
 *         being the least significant.
 *
 * Only integer arithmetic is used here; the state is not modified.
 */
void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4])
{
    poly1305_internal *st = (poly1305_internal *) ctx;
    u64 h0, h1, h2, h3, h4;
    u32 g0, g1, g2, g3, g4;
    u64 t;
    u32 mask;

    /*
     * thanks to bias masking exponent gives integer result
     */
    h0 = st->h[0].u & 0x000fffffffffffffULL;
    h1 = st->h[1].u & 0x000fffffffffffffULL;
    h2 = st->h[2].u & 0x000fffffffffffffULL;
    h3 = st->h[3].u & 0x000fffffffffffffULL;

    /*
     * can be partially reduced, so reduce...
     */
    h4 = h3>>32; h3 &= 0xffffffffU;
    g4 = h4&-4;
    h4 &= 3;
    g4 += g4>>2;

    h0 += g4;
    h1 += h0>>32; h0 &= 0xffffffffU;
    h2 += h1>>32; h1 &= 0xffffffffU;
    h3 += h2>>32; h2 &= 0xffffffffU;

    /* compute h + -p */
    g0 = (u32)(t = h0 + 5);
    g1 = (u32)(t = h1 + (t >> 32));
    g2 = (u32)(t = h2 + (t >> 32));
    g3 = (u32)(t = h3 + (t >> 32));
    g4 = h4 + (u32)(t >> 32);

    /* if there was carry, select g0-g3 */
    mask = 0 - (g4 >> 2);
    g0 &= mask;
    g1 &= mask;
    g2 &= mask;
    g3 &= mask;
    mask = ~mask;
    g0 |= (h0 & mask);
    g1 |= (h1 & mask);
    g2 |= (h2 & mask);
    g3 |= (h3 & mask);

    /* mac = (h + nonce) % (2^128) */
    g0 = (u32)(t = (u64)g0 + nonce[0]);
    g1 = (u32)(t = (u64)g1 + (t >> 32) + nonce[1]);
    g2 = (u32)(t = (u64)g2 + (t >> 32) + nonce[2]);
    g3 = (u32)(t = (u64)g3 + (t >> 32) + nonce[3]);

    U32TO8(mac + 0, g0);
    U32TO8(mac + 4, g1);
    U32TO8(mac + 8, g2);
    U32TO8(mac + 12, g3);
}