1e1051a39Sopenharmony_ci/* 2e1051a39Sopenharmony_ci * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved. 3e1051a39Sopenharmony_ci * 4e1051a39Sopenharmony_ci * Licensed under the Apache License 2.0 (the "License"). You may not use 5e1051a39Sopenharmony_ci * this file except in compliance with the License. You can obtain a copy 6e1051a39Sopenharmony_ci * in the file LICENSE in the source distribution or at 7e1051a39Sopenharmony_ci * https://www.openssl.org/source/license.html 8e1051a39Sopenharmony_ci */ 9e1051a39Sopenharmony_ci 10e1051a39Sopenharmony_ci#include "../bn_local.h" 11e1051a39Sopenharmony_ci#if !(defined(__GNUC__) && __GNUC__>=2) 12e1051a39Sopenharmony_ci# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ 13e1051a39Sopenharmony_ci#else 14e1051a39Sopenharmony_ci/*- 15e1051a39Sopenharmony_ci * x86_64 BIGNUM accelerator version 0.1, December 2002. 16e1051a39Sopenharmony_ci * 17e1051a39Sopenharmony_ci * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL 18e1051a39Sopenharmony_ci * project. 19e1051a39Sopenharmony_ci * 20e1051a39Sopenharmony_ci * Rights for redistribution and usage in source and binary forms are 21e1051a39Sopenharmony_ci * granted according to the License. Warranty of any kind is disclaimed. 22e1051a39Sopenharmony_ci * 23e1051a39Sopenharmony_ci * Q. Version 0.1? It doesn't sound like Andy, he used to assign real 24e1051a39Sopenharmony_ci * versions, like 1.0... 25e1051a39Sopenharmony_ci * A. Well, that's because this code is basically a quick-n-dirty 26e1051a39Sopenharmony_ci * proof-of-concept hack. As you can see it's implemented with 27e1051a39Sopenharmony_ci * inline assembler, which means that you're bound to GCC and that 28e1051a39Sopenharmony_ci * there might be enough room for further improvement. 29e1051a39Sopenharmony_ci * 30e1051a39Sopenharmony_ci * Q. Why inline assembler? 31e1051a39Sopenharmony_ci * A. x86_64 features own ABI which I'm not familiar with. This is 32e1051a39Sopenharmony_ci * why I decided to let the compiler take care of subroutine 33e1051a39Sopenharmony_ci * prologue/epilogue as well as register allocation. For reference. 34e1051a39Sopenharmony_ci * Win64 implements different ABI for AMD64, different from Linux. 35e1051a39Sopenharmony_ci * 36e1051a39Sopenharmony_ci * Q. How much faster does it get? 37e1051a39Sopenharmony_ci * A. 'apps/openssl speed rsa dsa' output with no-asm: 38e1051a39Sopenharmony_ci * 39e1051a39Sopenharmony_ci * sign verify sign/s verify/s 40e1051a39Sopenharmony_ci * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 41e1051a39Sopenharmony_ci * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 42e1051a39Sopenharmony_ci * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 43e1051a39Sopenharmony_ci * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 44e1051a39Sopenharmony_ci * sign verify sign/s verify/s 45e1051a39Sopenharmony_ci * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 46e1051a39Sopenharmony_ci * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 47e1051a39Sopenharmony_ci * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 48e1051a39Sopenharmony_ci * 49e1051a39Sopenharmony_ci * 'apps/openssl speed rsa dsa' output with this module: 50e1051a39Sopenharmony_ci * 51e1051a39Sopenharmony_ci * sign verify sign/s verify/s 52e1051a39Sopenharmony_ci * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 53e1051a39Sopenharmony_ci * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 54e1051a39Sopenharmony_ci * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 55e1051a39Sopenharmony_ci * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 56e1051a39Sopenharmony_ci * sign verify sign/s verify/s 57e1051a39Sopenharmony_ci * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 58e1051a39Sopenharmony_ci * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 59e1051a39Sopenharmony_ci * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 60e1051a39Sopenharmony_ci * 61e1051a39Sopenharmony_ci * For the reference. IA-32 assembler implementation performs 62e1051a39Sopenharmony_ci * very much like 64-bit code compiled with no-asm on the same 63e1051a39Sopenharmony_ci * machine. 64e1051a39Sopenharmony_ci */ 65e1051a39Sopenharmony_ci 66e1051a39Sopenharmony_ci# undef mul 67e1051a39Sopenharmony_ci# undef mul_add 68e1051a39Sopenharmony_ci 69e1051a39Sopenharmony_ci/*- 70e1051a39Sopenharmony_ci * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; 71e1051a39Sopenharmony_ci * "g"(0) let the compiler to decide where does it 72e1051a39Sopenharmony_ci * want to keep the value of zero; 73e1051a39Sopenharmony_ci */ 74e1051a39Sopenharmony_ci# define mul_add(r,a,word,carry) do { \ 75e1051a39Sopenharmony_ci register BN_ULONG high,low; \ 76e1051a39Sopenharmony_ci asm ("mulq %3" \ 77e1051a39Sopenharmony_ci : "=a"(low),"=d"(high) \ 78e1051a39Sopenharmony_ci : "a"(word),"m"(a) \ 79e1051a39Sopenharmony_ci : "cc"); \ 80e1051a39Sopenharmony_ci asm ("addq %2,%0; adcq %3,%1" \ 81e1051a39Sopenharmony_ci : "+r"(carry),"+d"(high)\ 82e1051a39Sopenharmony_ci : "a"(low),"g"(0) \ 83e1051a39Sopenharmony_ci : "cc"); \ 84e1051a39Sopenharmony_ci asm ("addq %2,%0; adcq %3,%1" \ 85e1051a39Sopenharmony_ci : "+m"(r),"+d"(high) \ 86e1051a39Sopenharmony_ci : "r"(carry),"g"(0) \ 87e1051a39Sopenharmony_ci : "cc"); \ 88e1051a39Sopenharmony_ci carry=high; \ 89e1051a39Sopenharmony_ci } while (0) 90e1051a39Sopenharmony_ci 91e1051a39Sopenharmony_ci# define mul(r,a,word,carry) do { \ 92e1051a39Sopenharmony_ci register BN_ULONG high,low; \ 93e1051a39Sopenharmony_ci asm ("mulq %3" \ 94e1051a39Sopenharmony_ci : "=a"(low),"=d"(high) \ 95e1051a39Sopenharmony_ci : "a"(word),"g"(a) \ 96e1051a39Sopenharmony_ci : "cc"); \ 97e1051a39Sopenharmony_ci asm ("addq %2,%0; adcq %3,%1" \ 98e1051a39Sopenharmony_ci : "+r"(carry),"+d"(high)\ 99e1051a39Sopenharmony_ci : "a"(low),"g"(0) \ 100e1051a39Sopenharmony_ci : "cc"); \ 101e1051a39Sopenharmony_ci (r)=carry, carry=high; \ 102e1051a39Sopenharmony_ci } while (0) 103e1051a39Sopenharmony_ci# undef sqr 104e1051a39Sopenharmony_ci# define sqr(r0,r1,a) \ 105e1051a39Sopenharmony_ci asm ("mulq %2" \ 106e1051a39Sopenharmony_ci : "=a"(r0),"=d"(r1) \ 107e1051a39Sopenharmony_ci : "a"(a) \ 108e1051a39Sopenharmony_ci : "cc"); 109e1051a39Sopenharmony_ci 110e1051a39Sopenharmony_ciBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 111e1051a39Sopenharmony_ci BN_ULONG w) 112e1051a39Sopenharmony_ci{ 113e1051a39Sopenharmony_ci BN_ULONG c1 = 0; 114e1051a39Sopenharmony_ci 115e1051a39Sopenharmony_ci if (num <= 0) 116e1051a39Sopenharmony_ci return c1; 117e1051a39Sopenharmony_ci 118e1051a39Sopenharmony_ci while (num & ~3) { 119e1051a39Sopenharmony_ci mul_add(rp[0], ap[0], w, c1); 120e1051a39Sopenharmony_ci mul_add(rp[1], ap[1], w, c1); 121e1051a39Sopenharmony_ci mul_add(rp[2], ap[2], w, c1); 122e1051a39Sopenharmony_ci mul_add(rp[3], ap[3], w, c1); 123e1051a39Sopenharmony_ci ap += 4; 124e1051a39Sopenharmony_ci rp += 4; 125e1051a39Sopenharmony_ci num -= 4; 126e1051a39Sopenharmony_ci } 127e1051a39Sopenharmony_ci if (num) { 128e1051a39Sopenharmony_ci mul_add(rp[0], ap[0], w, c1); 129e1051a39Sopenharmony_ci if (--num == 0) 130e1051a39Sopenharmony_ci return c1; 131e1051a39Sopenharmony_ci mul_add(rp[1], ap[1], w, c1); 132e1051a39Sopenharmony_ci if (--num == 0) 133e1051a39Sopenharmony_ci return c1; 134e1051a39Sopenharmony_ci mul_add(rp[2], ap[2], w, c1); 135e1051a39Sopenharmony_ci return c1; 136e1051a39Sopenharmony_ci } 137e1051a39Sopenharmony_ci 138e1051a39Sopenharmony_ci return c1; 139e1051a39Sopenharmony_ci} 140e1051a39Sopenharmony_ci 141e1051a39Sopenharmony_ciBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 142e1051a39Sopenharmony_ci{ 143e1051a39Sopenharmony_ci BN_ULONG c1 = 0; 144e1051a39Sopenharmony_ci 145e1051a39Sopenharmony_ci if (num <= 0) 146e1051a39Sopenharmony_ci return c1; 147e1051a39Sopenharmony_ci 148e1051a39Sopenharmony_ci while (num & ~3) { 149e1051a39Sopenharmony_ci mul(rp[0], ap[0], w, c1); 150e1051a39Sopenharmony_ci mul(rp[1], ap[1], w, c1); 151e1051a39Sopenharmony_ci mul(rp[2], ap[2], w, c1); 152e1051a39Sopenharmony_ci mul(rp[3], ap[3], w, c1); 153e1051a39Sopenharmony_ci ap += 4; 154e1051a39Sopenharmony_ci rp += 4; 155e1051a39Sopenharmony_ci num -= 4; 156e1051a39Sopenharmony_ci } 157e1051a39Sopenharmony_ci if (num) { 158e1051a39Sopenharmony_ci mul(rp[0], ap[0], w, c1); 159e1051a39Sopenharmony_ci if (--num == 0) 160e1051a39Sopenharmony_ci return c1; 161e1051a39Sopenharmony_ci mul(rp[1], ap[1], w, c1); 162e1051a39Sopenharmony_ci if (--num == 0) 163e1051a39Sopenharmony_ci return c1; 164e1051a39Sopenharmony_ci mul(rp[2], ap[2], w, c1); 165e1051a39Sopenharmony_ci } 166e1051a39Sopenharmony_ci return c1; 167e1051a39Sopenharmony_ci} 168e1051a39Sopenharmony_ci 169e1051a39Sopenharmony_civoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 170e1051a39Sopenharmony_ci{ 171e1051a39Sopenharmony_ci if (n <= 0) 172e1051a39Sopenharmony_ci return; 173e1051a39Sopenharmony_ci 174e1051a39Sopenharmony_ci while (n & ~3) { 175e1051a39Sopenharmony_ci sqr(r[0], r[1], a[0]); 176e1051a39Sopenharmony_ci sqr(r[2], r[3], a[1]); 177e1051a39Sopenharmony_ci sqr(r[4], r[5], a[2]); 178e1051a39Sopenharmony_ci sqr(r[6], r[7], a[3]); 179e1051a39Sopenharmony_ci a += 4; 180e1051a39Sopenharmony_ci r += 8; 181e1051a39Sopenharmony_ci n -= 4; 182e1051a39Sopenharmony_ci } 183e1051a39Sopenharmony_ci if (n) { 184e1051a39Sopenharmony_ci sqr(r[0], r[1], a[0]); 185e1051a39Sopenharmony_ci if (--n == 0) 186e1051a39Sopenharmony_ci return; 187e1051a39Sopenharmony_ci sqr(r[2], r[3], a[1]); 188e1051a39Sopenharmony_ci if (--n == 0) 189e1051a39Sopenharmony_ci return; 190e1051a39Sopenharmony_ci sqr(r[4], r[5], a[2]); 191e1051a39Sopenharmony_ci } 192e1051a39Sopenharmony_ci} 193e1051a39Sopenharmony_ci 194e1051a39Sopenharmony_ciBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 195e1051a39Sopenharmony_ci{ 196e1051a39Sopenharmony_ci BN_ULONG ret, waste; 197e1051a39Sopenharmony_ci 198e1051a39Sopenharmony_ci asm("divq %4":"=a"(ret), "=d"(waste) 199e1051a39Sopenharmony_ci : "a"(l), "d"(h), "r"(d) 200e1051a39Sopenharmony_ci : "cc"); 201e1051a39Sopenharmony_ci 202e1051a39Sopenharmony_ci return ret; 203e1051a39Sopenharmony_ci} 204e1051a39Sopenharmony_ci 205e1051a39Sopenharmony_ciBN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 206e1051a39Sopenharmony_ci int n) 207e1051a39Sopenharmony_ci{ 208e1051a39Sopenharmony_ci BN_ULONG ret; 209e1051a39Sopenharmony_ci size_t i = 0; 210e1051a39Sopenharmony_ci 211e1051a39Sopenharmony_ci if (n <= 0) 212e1051a39Sopenharmony_ci return 0; 213e1051a39Sopenharmony_ci 214e1051a39Sopenharmony_ci asm volatile (" subq %0,%0 \n" /* clear carry */ 215e1051a39Sopenharmony_ci " jmp 1f \n" 216e1051a39Sopenharmony_ci ".p2align 4 \n" 217e1051a39Sopenharmony_ci "1: movq (%4,%2,8),%0 \n" 218e1051a39Sopenharmony_ci " adcq (%5,%2,8),%0 \n" 219e1051a39Sopenharmony_ci " movq %0,(%3,%2,8) \n" 220e1051a39Sopenharmony_ci " lea 1(%2),%2 \n" 221e1051a39Sopenharmony_ci " dec %1 \n" 222e1051a39Sopenharmony_ci " jnz 1b \n" 223e1051a39Sopenharmony_ci " sbbq %0,%0 \n" 224e1051a39Sopenharmony_ci :"=&r" (ret), "+c"(n), "+r"(i) 225e1051a39Sopenharmony_ci :"r"(rp), "r"(ap), "r"(bp) 226e1051a39Sopenharmony_ci :"cc", "memory"); 227e1051a39Sopenharmony_ci 228e1051a39Sopenharmony_ci return ret & 1; 229e1051a39Sopenharmony_ci} 230e1051a39Sopenharmony_ci 231e1051a39Sopenharmony_ci# ifndef SIMICS 232e1051a39Sopenharmony_ciBN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 233e1051a39Sopenharmony_ci int n) 234e1051a39Sopenharmony_ci{ 235e1051a39Sopenharmony_ci BN_ULONG ret; 236e1051a39Sopenharmony_ci size_t i = 0; 237e1051a39Sopenharmony_ci 238e1051a39Sopenharmony_ci if (n <= 0) 239e1051a39Sopenharmony_ci return 0; 240e1051a39Sopenharmony_ci 241e1051a39Sopenharmony_ci asm volatile (" subq %0,%0 \n" /* clear borrow */ 242e1051a39Sopenharmony_ci " jmp 1f \n" 243e1051a39Sopenharmony_ci ".p2align 4 \n" 244e1051a39Sopenharmony_ci "1: movq (%4,%2,8),%0 \n" 245e1051a39Sopenharmony_ci " sbbq (%5,%2,8),%0 \n" 246e1051a39Sopenharmony_ci " movq %0,(%3,%2,8) \n" 247e1051a39Sopenharmony_ci " lea 1(%2),%2 \n" 248e1051a39Sopenharmony_ci " dec %1 \n" 249e1051a39Sopenharmony_ci " jnz 1b \n" 250e1051a39Sopenharmony_ci " sbbq %0,%0 \n" 251e1051a39Sopenharmony_ci :"=&r" (ret), "+c"(n), "+r"(i) 252e1051a39Sopenharmony_ci :"r"(rp), "r"(ap), "r"(bp) 253e1051a39Sopenharmony_ci :"cc", "memory"); 254e1051a39Sopenharmony_ci 255e1051a39Sopenharmony_ci return ret & 1; 256e1051a39Sopenharmony_ci} 257e1051a39Sopenharmony_ci# else 258e1051a39Sopenharmony_ci/* Simics 1.4<7 has buggy sbbq:-( */ 259e1051a39Sopenharmony_ci# define BN_MASK2 0xffffffffffffffffL 260e1051a39Sopenharmony_ciBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 261e1051a39Sopenharmony_ci{ 262e1051a39Sopenharmony_ci BN_ULONG t1, t2; 263e1051a39Sopenharmony_ci int c = 0; 264e1051a39Sopenharmony_ci 265e1051a39Sopenharmony_ci if (n <= 0) 266e1051a39Sopenharmony_ci return (BN_ULONG)0; 267e1051a39Sopenharmony_ci 268e1051a39Sopenharmony_ci for (;;) { 269e1051a39Sopenharmony_ci t1 = a[0]; 270e1051a39Sopenharmony_ci t2 = b[0]; 271e1051a39Sopenharmony_ci r[0] = (t1 - t2 - c) & BN_MASK2; 272e1051a39Sopenharmony_ci if (t1 != t2) 273e1051a39Sopenharmony_ci c = (t1 < t2); 274e1051a39Sopenharmony_ci if (--n <= 0) 275e1051a39Sopenharmony_ci break; 276e1051a39Sopenharmony_ci 277e1051a39Sopenharmony_ci t1 = a[1]; 278e1051a39Sopenharmony_ci t2 = b[1]; 279e1051a39Sopenharmony_ci r[1] = (t1 - t2 - c) & BN_MASK2; 280e1051a39Sopenharmony_ci if (t1 != t2) 281e1051a39Sopenharmony_ci c = (t1 < t2); 282e1051a39Sopenharmony_ci if (--n <= 0) 283e1051a39Sopenharmony_ci break; 284e1051a39Sopenharmony_ci 285e1051a39Sopenharmony_ci t1 = a[2]; 286e1051a39Sopenharmony_ci t2 = b[2]; 287e1051a39Sopenharmony_ci r[2] = (t1 - t2 - c) & BN_MASK2; 288e1051a39Sopenharmony_ci if (t1 != t2) 289e1051a39Sopenharmony_ci c = (t1 < t2); 290e1051a39Sopenharmony_ci if (--n <= 0) 291e1051a39Sopenharmony_ci break; 292e1051a39Sopenharmony_ci 293e1051a39Sopenharmony_ci t1 = a[3]; 294e1051a39Sopenharmony_ci t2 = b[3]; 295e1051a39Sopenharmony_ci r[3] = (t1 - t2 - c) & BN_MASK2; 296e1051a39Sopenharmony_ci if (t1 != t2) 297e1051a39Sopenharmony_ci c = (t1 < t2); 298e1051a39Sopenharmony_ci if (--n <= 0) 299e1051a39Sopenharmony_ci break; 300e1051a39Sopenharmony_ci 301e1051a39Sopenharmony_ci a += 4; 302e1051a39Sopenharmony_ci b += 4; 303e1051a39Sopenharmony_ci r += 4; 304e1051a39Sopenharmony_ci } 305e1051a39Sopenharmony_ci return c; 306e1051a39Sopenharmony_ci} 307e1051a39Sopenharmony_ci# endif 308e1051a39Sopenharmony_ci 309e1051a39Sopenharmony_ci/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 310e1051a39Sopenharmony_ci/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 311e1051a39Sopenharmony_ci/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 312e1051a39Sopenharmony_ci/* 313e1051a39Sopenharmony_ci * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 314e1051a39Sopenharmony_ci * c=(c2,c1,c0) 315e1051a39Sopenharmony_ci */ 316e1051a39Sopenharmony_ci 317e1051a39Sopenharmony_ci/* 318e1051a39Sopenharmony_ci * Keep in mind that carrying into high part of multiplication result 319e1051a39Sopenharmony_ci * can not overflow, because it cannot be all-ones. 320e1051a39Sopenharmony_ci */ 321e1051a39Sopenharmony_ci# if 0 322e1051a39Sopenharmony_ci/* original macros are kept for reference purposes */ 323e1051a39Sopenharmony_ci# define mul_add_c(a,b,c0,c1,c2) do { \ 324e1051a39Sopenharmony_ci BN_ULONG ta = (a), tb = (b); \ 325e1051a39Sopenharmony_ci BN_ULONG lo, hi; \ 326e1051a39Sopenharmony_ci BN_UMULT_LOHI(lo,hi,ta,tb); \ 327e1051a39Sopenharmony_ci c0 += lo; hi += (c0<lo)?1:0; \ 328e1051a39Sopenharmony_ci c1 += hi; c2 += (c1<hi)?1:0; \ 329e1051a39Sopenharmony_ci } while(0) 330e1051a39Sopenharmony_ci 331e1051a39Sopenharmony_ci# define mul_add_c2(a,b,c0,c1,c2) do { \ 332e1051a39Sopenharmony_ci BN_ULONG ta = (a), tb = (b); \ 333e1051a39Sopenharmony_ci BN_ULONG lo, hi, tt; \ 334e1051a39Sopenharmony_ci BN_UMULT_LOHI(lo,hi,ta,tb); \ 335e1051a39Sopenharmony_ci c0 += lo; tt = hi+((c0<lo)?1:0); \ 336e1051a39Sopenharmony_ci c1 += tt; c2 += (c1<tt)?1:0; \ 337e1051a39Sopenharmony_ci c0 += lo; hi += (c0<lo)?1:0; \ 338e1051a39Sopenharmony_ci c1 += hi; c2 += (c1<hi)?1:0; \ 339e1051a39Sopenharmony_ci } while(0) 340e1051a39Sopenharmony_ci 341e1051a39Sopenharmony_ci# define sqr_add_c(a,i,c0,c1,c2) do { \ 342e1051a39Sopenharmony_ci BN_ULONG ta = (a)[i]; \ 343e1051a39Sopenharmony_ci BN_ULONG lo, hi; \ 344e1051a39Sopenharmony_ci BN_UMULT_LOHI(lo,hi,ta,ta); \ 345e1051a39Sopenharmony_ci c0 += lo; hi += (c0<lo)?1:0; \ 346e1051a39Sopenharmony_ci c1 += hi; c2 += (c1<hi)?1:0; \ 347e1051a39Sopenharmony_ci } while(0) 348e1051a39Sopenharmony_ci# else 349e1051a39Sopenharmony_ci# define mul_add_c(a,b,c0,c1,c2) do { \ 350e1051a39Sopenharmony_ci BN_ULONG t1,t2; \ 351e1051a39Sopenharmony_ci asm ("mulq %3" \ 352e1051a39Sopenharmony_ci : "=a"(t1),"=d"(t2) \ 353e1051a39Sopenharmony_ci : "a"(a),"m"(b) \ 354e1051a39Sopenharmony_ci : "cc"); \ 355e1051a39Sopenharmony_ci asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 356e1051a39Sopenharmony_ci : "+r"(c0),"+r"(c1),"+r"(c2) \ 357e1051a39Sopenharmony_ci : "r"(t1),"r"(t2),"g"(0) \ 358e1051a39Sopenharmony_ci : "cc"); \ 359e1051a39Sopenharmony_ci } while (0) 360e1051a39Sopenharmony_ci 361e1051a39Sopenharmony_ci# define sqr_add_c(a,i,c0,c1,c2) do { \ 362e1051a39Sopenharmony_ci BN_ULONG t1,t2; \ 363e1051a39Sopenharmony_ci asm ("mulq %2" \ 364e1051a39Sopenharmony_ci : "=a"(t1),"=d"(t2) \ 365e1051a39Sopenharmony_ci : "a"(a[i]) \ 366e1051a39Sopenharmony_ci : "cc"); \ 367e1051a39Sopenharmony_ci asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 368e1051a39Sopenharmony_ci : "+r"(c0),"+r"(c1),"+r"(c2) \ 369e1051a39Sopenharmony_ci : "r"(t1),"r"(t2),"g"(0) \ 370e1051a39Sopenharmony_ci : "cc"); \ 371e1051a39Sopenharmony_ci } while (0) 372e1051a39Sopenharmony_ci 373e1051a39Sopenharmony_ci# define mul_add_c2(a,b,c0,c1,c2) do { \ 374e1051a39Sopenharmony_ci BN_ULONG t1,t2; \ 375e1051a39Sopenharmony_ci asm ("mulq %3" \ 376e1051a39Sopenharmony_ci : "=a"(t1),"=d"(t2) \ 377e1051a39Sopenharmony_ci : "a"(a),"m"(b) \ 378e1051a39Sopenharmony_ci : "cc"); \ 379e1051a39Sopenharmony_ci asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 380e1051a39Sopenharmony_ci : "+r"(c0),"+r"(c1),"+r"(c2) \ 381e1051a39Sopenharmony_ci : "r"(t1),"r"(t2),"g"(0) \ 382e1051a39Sopenharmony_ci : "cc"); \ 383e1051a39Sopenharmony_ci asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 384e1051a39Sopenharmony_ci : "+r"(c0),"+r"(c1),"+r"(c2) \ 385e1051a39Sopenharmony_ci : "r"(t1),"r"(t2),"g"(0) \ 386e1051a39Sopenharmony_ci : "cc"); \ 387e1051a39Sopenharmony_ci } while (0) 388e1051a39Sopenharmony_ci# endif 389e1051a39Sopenharmony_ci 390e1051a39Sopenharmony_ci# define sqr_add_c2(a,i,j,c0,c1,c2) \ 391e1051a39Sopenharmony_ci mul_add_c2((a)[i],(a)[j],c0,c1,c2) 392e1051a39Sopenharmony_ci 393e1051a39Sopenharmony_civoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 394e1051a39Sopenharmony_ci{ 395e1051a39Sopenharmony_ci BN_ULONG c1, c2, c3; 396e1051a39Sopenharmony_ci 397e1051a39Sopenharmony_ci c1 = 0; 398e1051a39Sopenharmony_ci c2 = 0; 399e1051a39Sopenharmony_ci c3 = 0; 400e1051a39Sopenharmony_ci mul_add_c(a[0], b[0], c1, c2, c3); 401e1051a39Sopenharmony_ci r[0] = c1; 402e1051a39Sopenharmony_ci c1 = 0; 403e1051a39Sopenharmony_ci mul_add_c(a[0], b[1], c2, c3, c1); 404e1051a39Sopenharmony_ci mul_add_c(a[1], b[0], c2, c3, c1); 405e1051a39Sopenharmony_ci r[1] = c2; 406e1051a39Sopenharmony_ci c2 = 0; 407e1051a39Sopenharmony_ci mul_add_c(a[2], b[0], c3, c1, c2); 408e1051a39Sopenharmony_ci mul_add_c(a[1], b[1], c3, c1, c2); 409e1051a39Sopenharmony_ci mul_add_c(a[0], b[2], c3, c1, c2); 410e1051a39Sopenharmony_ci r[2] = c3; 411e1051a39Sopenharmony_ci c3 = 0; 412e1051a39Sopenharmony_ci mul_add_c(a[0], b[3], c1, c2, c3); 413e1051a39Sopenharmony_ci mul_add_c(a[1], b[2], c1, c2, c3); 414e1051a39Sopenharmony_ci mul_add_c(a[2], b[1], c1, c2, c3); 415e1051a39Sopenharmony_ci mul_add_c(a[3], b[0], c1, c2, c3); 416e1051a39Sopenharmony_ci r[3] = c1; 417e1051a39Sopenharmony_ci c1 = 0; 418e1051a39Sopenharmony_ci mul_add_c(a[4], b[0], c2, c3, c1); 419e1051a39Sopenharmony_ci mul_add_c(a[3], b[1], c2, c3, c1); 420e1051a39Sopenharmony_ci mul_add_c(a[2], b[2], c2, c3, c1); 421e1051a39Sopenharmony_ci mul_add_c(a[1], b[3], c2, c3, c1); 422e1051a39Sopenharmony_ci mul_add_c(a[0], b[4], c2, c3, c1); 423e1051a39Sopenharmony_ci r[4] = c2; 424e1051a39Sopenharmony_ci c2 = 0; 425e1051a39Sopenharmony_ci mul_add_c(a[0], b[5], c3, c1, c2); 426e1051a39Sopenharmony_ci mul_add_c(a[1], b[4], c3, c1, c2); 427e1051a39Sopenharmony_ci mul_add_c(a[2], b[3], c3, c1, c2); 428e1051a39Sopenharmony_ci mul_add_c(a[3], b[2], c3, c1, c2); 429e1051a39Sopenharmony_ci mul_add_c(a[4], b[1], c3, c1, c2); 430e1051a39Sopenharmony_ci mul_add_c(a[5], b[0], c3, c1, c2); 431e1051a39Sopenharmony_ci r[5] = c3; 432e1051a39Sopenharmony_ci c3 = 0; 433e1051a39Sopenharmony_ci mul_add_c(a[6], b[0], c1, c2, c3); 434e1051a39Sopenharmony_ci mul_add_c(a[5], b[1], c1, c2, c3); 435e1051a39Sopenharmony_ci mul_add_c(a[4], b[2], c1, c2, c3); 436e1051a39Sopenharmony_ci mul_add_c(a[3], b[3], c1, c2, c3); 437e1051a39Sopenharmony_ci mul_add_c(a[2], b[4], c1, c2, c3); 438e1051a39Sopenharmony_ci mul_add_c(a[1], b[5], c1, c2, c3); 439e1051a39Sopenharmony_ci mul_add_c(a[0], b[6], c1, c2, c3); 440e1051a39Sopenharmony_ci r[6] = c1; 441e1051a39Sopenharmony_ci c1 = 0; 442e1051a39Sopenharmony_ci mul_add_c(a[0], b[7], c2, c3, c1); 443e1051a39Sopenharmony_ci mul_add_c(a[1], b[6], c2, c3, c1); 444e1051a39Sopenharmony_ci mul_add_c(a[2], b[5], c2, c3, c1); 445e1051a39Sopenharmony_ci mul_add_c(a[3], b[4], c2, c3, c1); 446e1051a39Sopenharmony_ci mul_add_c(a[4], b[3], c2, c3, c1); 447e1051a39Sopenharmony_ci mul_add_c(a[5], b[2], c2, c3, c1); 448e1051a39Sopenharmony_ci mul_add_c(a[6], b[1], c2, c3, c1); 449e1051a39Sopenharmony_ci mul_add_c(a[7], b[0], c2, c3, c1); 450e1051a39Sopenharmony_ci r[7] = c2; 451e1051a39Sopenharmony_ci c2 = 0; 452e1051a39Sopenharmony_ci mul_add_c(a[7], b[1], c3, c1, c2); 453e1051a39Sopenharmony_ci mul_add_c(a[6], b[2], c3, c1, c2); 454e1051a39Sopenharmony_ci mul_add_c(a[5], b[3], c3, c1, c2); 455e1051a39Sopenharmony_ci mul_add_c(a[4], b[4], c3, c1, c2); 456e1051a39Sopenharmony_ci mul_add_c(a[3], b[5], c3, c1, c2); 457e1051a39Sopenharmony_ci mul_add_c(a[2], b[6], c3, c1, c2); 458e1051a39Sopenharmony_ci mul_add_c(a[1], b[7], c3, c1, c2); 459e1051a39Sopenharmony_ci r[8] = c3; 460e1051a39Sopenharmony_ci c3 = 0; 461e1051a39Sopenharmony_ci mul_add_c(a[2], b[7], c1, c2, c3); 462e1051a39Sopenharmony_ci mul_add_c(a[3], b[6], c1, c2, c3); 463e1051a39Sopenharmony_ci mul_add_c(a[4], b[5], c1, c2, c3); 464e1051a39Sopenharmony_ci mul_add_c(a[5], b[4], c1, c2, c3); 465e1051a39Sopenharmony_ci mul_add_c(a[6], b[3], c1, c2, c3); 466e1051a39Sopenharmony_ci mul_add_c(a[7], b[2], c1, c2, c3); 467e1051a39Sopenharmony_ci r[9] = c1; 468e1051a39Sopenharmony_ci c1 = 0; 469e1051a39Sopenharmony_ci mul_add_c(a[7], b[3], c2, c3, c1); 470e1051a39Sopenharmony_ci mul_add_c(a[6], b[4], c2, c3, c1); 471e1051a39Sopenharmony_ci mul_add_c(a[5], b[5], c2, c3, c1); 472e1051a39Sopenharmony_ci mul_add_c(a[4], b[6], c2, c3, c1); 473e1051a39Sopenharmony_ci mul_add_c(a[3], b[7], c2, c3, c1); 474e1051a39Sopenharmony_ci r[10] = c2; 475e1051a39Sopenharmony_ci c2 = 0; 476e1051a39Sopenharmony_ci mul_add_c(a[4], b[7], c3, c1, c2); 477e1051a39Sopenharmony_ci mul_add_c(a[5], b[6], c3, c1, c2); 478e1051a39Sopenharmony_ci mul_add_c(a[6], b[5], c3, c1, c2); 479e1051a39Sopenharmony_ci mul_add_c(a[7], b[4], c3, c1, c2); 480e1051a39Sopenharmony_ci r[11] = c3; 481e1051a39Sopenharmony_ci c3 = 0; 482e1051a39Sopenharmony_ci mul_add_c(a[7], b[5], c1, c2, c3); 483e1051a39Sopenharmony_ci mul_add_c(a[6], b[6], c1, c2, c3); 484e1051a39Sopenharmony_ci mul_add_c(a[5], b[7], c1, c2, c3); 485e1051a39Sopenharmony_ci r[12] = c1; 486e1051a39Sopenharmony_ci c1 = 0; 487e1051a39Sopenharmony_ci mul_add_c(a[6], b[7], c2, c3, c1); 488e1051a39Sopenharmony_ci mul_add_c(a[7], b[6], c2, c3, c1); 489e1051a39Sopenharmony_ci r[13] = c2; 490e1051a39Sopenharmony_ci c2 = 0; 491e1051a39Sopenharmony_ci mul_add_c(a[7], b[7], c3, c1, c2); 492e1051a39Sopenharmony_ci r[14] = c3; 493e1051a39Sopenharmony_ci r[15] = c1; 494e1051a39Sopenharmony_ci} 495e1051a39Sopenharmony_ci 496e1051a39Sopenharmony_civoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 497e1051a39Sopenharmony_ci{ 498e1051a39Sopenharmony_ci BN_ULONG c1, c2, c3; 499e1051a39Sopenharmony_ci 500e1051a39Sopenharmony_ci c1 = 0; 501e1051a39Sopenharmony_ci c2 = 0; 502e1051a39Sopenharmony_ci c3 = 0; 503e1051a39Sopenharmony_ci mul_add_c(a[0], b[0], c1, c2, c3); 504e1051a39Sopenharmony_ci r[0] = c1; 505e1051a39Sopenharmony_ci c1 = 0; 506e1051a39Sopenharmony_ci mul_add_c(a[0], b[1], c2, c3, c1); 507e1051a39Sopenharmony_ci mul_add_c(a[1], b[0], c2, c3, c1); 508e1051a39Sopenharmony_ci r[1] = c2; 509e1051a39Sopenharmony_ci c2 = 0; 510e1051a39Sopenharmony_ci mul_add_c(a[2], b[0], c3, c1, c2); 511e1051a39Sopenharmony_ci mul_add_c(a[1], b[1], c3, c1, c2); 512e1051a39Sopenharmony_ci mul_add_c(a[0], b[2], c3, c1, c2); 513e1051a39Sopenharmony_ci r[2] = c3; 514e1051a39Sopenharmony_ci c3 = 0; 515e1051a39Sopenharmony_ci mul_add_c(a[0], b[3], c1, c2, c3); 516e1051a39Sopenharmony_ci mul_add_c(a[1], b[2], c1, c2, c3); 517e1051a39Sopenharmony_ci mul_add_c(a[2], b[1], c1, c2, c3); 518e1051a39Sopenharmony_ci mul_add_c(a[3], b[0], c1, c2, c3); 519e1051a39Sopenharmony_ci r[3] = c1; 520e1051a39Sopenharmony_ci c1 = 0; 521e1051a39Sopenharmony_ci mul_add_c(a[3], b[1], c2, c3, c1); 522e1051a39Sopenharmony_ci mul_add_c(a[2], b[2], c2, c3, c1); 523e1051a39Sopenharmony_ci mul_add_c(a[1], b[3], c2, c3, c1); 524e1051a39Sopenharmony_ci r[4] = c2; 525e1051a39Sopenharmony_ci c2 = 0; 526e1051a39Sopenharmony_ci mul_add_c(a[2], b[3], c3, c1, c2); 527e1051a39Sopenharmony_ci mul_add_c(a[3], b[2], c3, c1, c2); 528e1051a39Sopenharmony_ci r[5] = c3; 529e1051a39Sopenharmony_ci c3 = 0; 530e1051a39Sopenharmony_ci mul_add_c(a[3], b[3], c1, c2, c3); 531e1051a39Sopenharmony_ci r[6] = c1; 532e1051a39Sopenharmony_ci r[7] = c2; 533e1051a39Sopenharmony_ci} 534e1051a39Sopenharmony_ci 535e1051a39Sopenharmony_civoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 536e1051a39Sopenharmony_ci{ 537e1051a39Sopenharmony_ci BN_ULONG c1, c2, c3; 538e1051a39Sopenharmony_ci 539e1051a39Sopenharmony_ci c1 = 0; 540e1051a39Sopenharmony_ci c2 = 0; 541e1051a39Sopenharmony_ci c3 = 0; 542e1051a39Sopenharmony_ci sqr_add_c(a, 0, c1, c2, c3); 543e1051a39Sopenharmony_ci r[0] = c1; 544e1051a39Sopenharmony_ci c1 = 0; 545e1051a39Sopenharmony_ci sqr_add_c2(a, 1, 0, c2, c3, c1); 546e1051a39Sopenharmony_ci r[1] = c2; 547e1051a39Sopenharmony_ci c2 = 0; 548e1051a39Sopenharmony_ci sqr_add_c(a, 1, c3, c1, c2); 549e1051a39Sopenharmony_ci sqr_add_c2(a, 2, 0, c3, c1, c2); 550e1051a39Sopenharmony_ci r[2] = c3; 551e1051a39Sopenharmony_ci c3 = 0; 552e1051a39Sopenharmony_ci sqr_add_c2(a, 3, 0, c1, c2, c3); 553e1051a39Sopenharmony_ci sqr_add_c2(a, 2, 1, c1, c2, c3); 554e1051a39Sopenharmony_ci r[3] = c1; 555e1051a39Sopenharmony_ci c1 = 0; 556e1051a39Sopenharmony_ci sqr_add_c(a, 2, c2, c3, c1); 557e1051a39Sopenharmony_ci sqr_add_c2(a, 3, 1, c2, c3, c1); 558e1051a39Sopenharmony_ci sqr_add_c2(a, 4, 0, c2, c3, c1); 559e1051a39Sopenharmony_ci r[4] = c2; 560e1051a39Sopenharmony_ci c2 = 0; 561e1051a39Sopenharmony_ci sqr_add_c2(a, 5, 0, c3, c1, c2); 562e1051a39Sopenharmony_ci sqr_add_c2(a, 4, 1, c3, c1, c2); 563e1051a39Sopenharmony_ci sqr_add_c2(a, 3, 2, c3, c1, c2); 564e1051a39Sopenharmony_ci r[5] = c3; 565e1051a39Sopenharmony_ci c3 = 0; 566e1051a39Sopenharmony_ci sqr_add_c(a, 3, c1, c2, c3); 567e1051a39Sopenharmony_ci sqr_add_c2(a, 4, 2, c1, c2, c3); 568e1051a39Sopenharmony_ci sqr_add_c2(a, 5, 1, c1, c2, c3); 569e1051a39Sopenharmony_ci sqr_add_c2(a, 6, 0, c1, c2, c3); 570e1051a39Sopenharmony_ci r[6] = c1; 571e1051a39Sopenharmony_ci c1 = 0; 572e1051a39Sopenharmony_ci sqr_add_c2(a, 7, 0, c2, c3, c1); 573e1051a39Sopenharmony_ci sqr_add_c2(a, 6, 1, c2, c3, c1); 574e1051a39Sopenharmony_ci sqr_add_c2(a, 5, 2, c2, c3, c1); 575e1051a39Sopenharmony_ci sqr_add_c2(a, 4, 3, c2, c3, c1); 576e1051a39Sopenharmony_ci r[7] = c2; 577e1051a39Sopenharmony_ci c2 = 0; 578e1051a39Sopenharmony_ci sqr_add_c(a, 4, c3, c1, c2); 579e1051a39Sopenharmony_ci sqr_add_c2(a, 5, 3, c3, c1, c2); 580e1051a39Sopenharmony_ci sqr_add_c2(a, 6, 2, c3, c1, c2); 581e1051a39Sopenharmony_ci sqr_add_c2(a, 7, 1, c3, c1, c2); 582e1051a39Sopenharmony_ci r[8] = c3; 583e1051a39Sopenharmony_ci c3 = 0; 584e1051a39Sopenharmony_ci sqr_add_c2(a, 7, 2, c1, c2, c3); 585e1051a39Sopenharmony_ci sqr_add_c2(a, 6, 3, c1, c2, c3); 586e1051a39Sopenharmony_ci sqr_add_c2(a, 5, 4, c1, c2, c3); 587e1051a39Sopenharmony_ci r[9] = c1; 588e1051a39Sopenharmony_ci c1 = 0; 589e1051a39Sopenharmony_ci sqr_add_c(a, 5, c2, c3, c1); 590e1051a39Sopenharmony_ci sqr_add_c2(a, 6, 4, c2, c3, c1); 591e1051a39Sopenharmony_ci sqr_add_c2(a, 7, 3, c2, c3, c1); 592e1051a39Sopenharmony_ci r[10] = c2; 593e1051a39Sopenharmony_ci c2 = 0; 594e1051a39Sopenharmony_ci sqr_add_c2(a, 7, 4, c3, c1, c2); 595e1051a39Sopenharmony_ci sqr_add_c2(a, 6, 5, c3, c1, c2); 596e1051a39Sopenharmony_ci r[11] = c3; 597e1051a39Sopenharmony_ci c3 = 0; 598e1051a39Sopenharmony_ci sqr_add_c(a, 6, c1, c2, c3); 599e1051a39Sopenharmony_ci sqr_add_c2(a, 7, 5, c1, c2, c3); 600e1051a39Sopenharmony_ci r[12] = c1; 601e1051a39Sopenharmony_ci c1 = 0; 602e1051a39Sopenharmony_ci sqr_add_c2(a, 7, 6, c2, c3, c1); 603e1051a39Sopenharmony_ci r[13] = c2; 604e1051a39Sopenharmony_ci c2 = 0; 605e1051a39Sopenharmony_ci sqr_add_c(a, 7, c3, c1, c2); 606e1051a39Sopenharmony_ci r[14] = c3; 607e1051a39Sopenharmony_ci r[15] = c1; 608e1051a39Sopenharmony_ci} 609e1051a39Sopenharmony_ci 610e1051a39Sopenharmony_civoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 611e1051a39Sopenharmony_ci{ 612e1051a39Sopenharmony_ci BN_ULONG c1, c2, c3; 613e1051a39Sopenharmony_ci 614e1051a39Sopenharmony_ci c1 = 0; 615e1051a39Sopenharmony_ci c2 = 0; 616e1051a39Sopenharmony_ci c3 = 0; 617e1051a39Sopenharmony_ci sqr_add_c(a, 0, c1, c2, c3); 618e1051a39Sopenharmony_ci r[0] = c1; 619e1051a39Sopenharmony_ci c1 = 0; 620e1051a39Sopenharmony_ci sqr_add_c2(a, 1, 0, c2, c3, c1); 621e1051a39Sopenharmony_ci r[1] = c2; 622e1051a39Sopenharmony_ci c2 = 0; 623e1051a39Sopenharmony_ci sqr_add_c(a, 1, c3, c1, c2); 624e1051a39Sopenharmony_ci sqr_add_c2(a, 2, 0, c3, c1, c2); 625e1051a39Sopenharmony_ci r[2] = c3; 626e1051a39Sopenharmony_ci c3 = 0; 627e1051a39Sopenharmony_ci sqr_add_c2(a, 3, 0, c1, c2, c3); 628e1051a39Sopenharmony_ci sqr_add_c2(a, 2, 1, c1, c2, c3); 629e1051a39Sopenharmony_ci r[3] = c1; 630e1051a39Sopenharmony_ci c1 = 0; 631e1051a39Sopenharmony_ci sqr_add_c(a, 2, c2, c3, c1); 632e1051a39Sopenharmony_ci sqr_add_c2(a, 3, 1, c2, c3, c1); 633e1051a39Sopenharmony_ci r[4] = c2; 634e1051a39Sopenharmony_ci c2 = 0; 635e1051a39Sopenharmony_ci sqr_add_c2(a, 3, 2, c3, c1, c2); 636e1051a39Sopenharmony_ci r[5] = c3; 637e1051a39Sopenharmony_ci c3 = 0; 638e1051a39Sopenharmony_ci sqr_add_c(a, 3, c1, c2, c3); 639e1051a39Sopenharmony_ci r[6] = c1; 640e1051a39Sopenharmony_ci r[7] = c2; 641e1051a39Sopenharmony_ci} 642e1051a39Sopenharmony_ci#endif 643