1570af302Sopenharmony_ci#include <stdint.h> 2570af302Sopenharmony_ci#include <math.h> 3570af302Sopenharmony_ci#include <float.h> 4570af302Sopenharmony_ci#include "libm.h" 5570af302Sopenharmony_ci 6570af302Sopenharmony_ci#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 7570af302Sopenharmony_cilong double sqrtl(long double x) 8570af302Sopenharmony_ci{ 9570af302Sopenharmony_ci return sqrt(x); 10570af302Sopenharmony_ci} 11570af302Sopenharmony_ci#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384 12570af302Sopenharmony_ci#include "sqrt_data.h" 13570af302Sopenharmony_ci 14570af302Sopenharmony_ci#define FENV_SUPPORT 1 15570af302Sopenharmony_ci 16570af302Sopenharmony_citypedef struct { 17570af302Sopenharmony_ci uint64_t hi; 18570af302Sopenharmony_ci uint64_t lo; 19570af302Sopenharmony_ci} u128; 20570af302Sopenharmony_ci 21570af302Sopenharmony_ci/* top: 16 bit sign+exponent, x: significand. */ 22570af302Sopenharmony_cistatic inline long double mkldbl(uint64_t top, u128 x) 23570af302Sopenharmony_ci{ 24570af302Sopenharmony_ci union ldshape u; 25570af302Sopenharmony_ci#if LDBL_MANT_DIG == 113 26570af302Sopenharmony_ci u.i2.hi = x.hi; 27570af302Sopenharmony_ci u.i2.lo = x.lo; 28570af302Sopenharmony_ci u.i2.hi &= 0x0000ffffffffffff; 29570af302Sopenharmony_ci u.i2.hi |= top << 48; 30570af302Sopenharmony_ci#elif LDBL_MANT_DIG == 64 31570af302Sopenharmony_ci u.i.se = top; 32570af302Sopenharmony_ci u.i.m = x.lo; 33570af302Sopenharmony_ci /* force the top bit on non-zero (and non-subnormal) results. */ 34570af302Sopenharmony_ci if (top & 0x7fff) 35570af302Sopenharmony_ci u.i.m |= 0x8000000000000000; 36570af302Sopenharmony_ci#endif 37570af302Sopenharmony_ci return u.f; 38570af302Sopenharmony_ci} 39570af302Sopenharmony_ci 40570af302Sopenharmony_ci/* return: top 16 bit is sign+exp and following bits are the significand. */ 41570af302Sopenharmony_cistatic inline u128 asu128(long double x) 42570af302Sopenharmony_ci{ 43570af302Sopenharmony_ci union ldshape u = {.f=x}; 44570af302Sopenharmony_ci u128 r; 45570af302Sopenharmony_ci#if LDBL_MANT_DIG == 113 46570af302Sopenharmony_ci r.hi = u.i2.hi; 47570af302Sopenharmony_ci r.lo = u.i2.lo; 48570af302Sopenharmony_ci#elif LDBL_MANT_DIG == 64 49570af302Sopenharmony_ci r.lo = u.i.m<<49; 50570af302Sopenharmony_ci /* ignore the top bit: pseudo numbers are not handled. */ 51570af302Sopenharmony_ci r.hi = u.i.m>>15; 52570af302Sopenharmony_ci r.hi &= 0x0000ffffffffffff; 53570af302Sopenharmony_ci r.hi |= (uint64_t)u.i.se << 48; 54570af302Sopenharmony_ci#endif 55570af302Sopenharmony_ci return r; 56570af302Sopenharmony_ci} 57570af302Sopenharmony_ci 58570af302Sopenharmony_ci/* returns a*b*2^-32 - e, with error 0 <= e < 1. */ 59570af302Sopenharmony_cistatic inline uint32_t mul32(uint32_t a, uint32_t b) 60570af302Sopenharmony_ci{ 61570af302Sopenharmony_ci return (uint64_t)a*b >> 32; 62570af302Sopenharmony_ci} 63570af302Sopenharmony_ci 64570af302Sopenharmony_ci/* returns a*b*2^-64 - e, with error 0 <= e < 3. */ 65570af302Sopenharmony_cistatic inline uint64_t mul64(uint64_t a, uint64_t b) 66570af302Sopenharmony_ci{ 67570af302Sopenharmony_ci uint64_t ahi = a>>32; 68570af302Sopenharmony_ci uint64_t alo = a&0xffffffff; 69570af302Sopenharmony_ci uint64_t bhi = b>>32; 70570af302Sopenharmony_ci uint64_t blo = b&0xffffffff; 71570af302Sopenharmony_ci return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); 72570af302Sopenharmony_ci} 73570af302Sopenharmony_ci 74570af302Sopenharmony_cistatic inline u128 add64(u128 a, uint64_t b) 75570af302Sopenharmony_ci{ 76570af302Sopenharmony_ci u128 r; 77570af302Sopenharmony_ci r.lo = a.lo + b; 78570af302Sopenharmony_ci r.hi = a.hi; 79570af302Sopenharmony_ci if (r.lo < a.lo) 80570af302Sopenharmony_ci r.hi++; 81570af302Sopenharmony_ci return r; 82570af302Sopenharmony_ci} 83570af302Sopenharmony_ci 84570af302Sopenharmony_cistatic inline u128 add128(u128 a, u128 b) 85570af302Sopenharmony_ci{ 86570af302Sopenharmony_ci u128 r; 87570af302Sopenharmony_ci r.lo = a.lo + b.lo; 88570af302Sopenharmony_ci r.hi = a.hi + b.hi; 89570af302Sopenharmony_ci if (r.lo < a.lo) 90570af302Sopenharmony_ci r.hi++; 91570af302Sopenharmony_ci return r; 92570af302Sopenharmony_ci} 93570af302Sopenharmony_ci 94570af302Sopenharmony_cistatic inline u128 sub64(u128 a, uint64_t b) 95570af302Sopenharmony_ci{ 96570af302Sopenharmony_ci u128 r; 97570af302Sopenharmony_ci r.lo = a.lo - b; 98570af302Sopenharmony_ci r.hi = a.hi; 99570af302Sopenharmony_ci if (a.lo < b) 100570af302Sopenharmony_ci r.hi--; 101570af302Sopenharmony_ci return r; 102570af302Sopenharmony_ci} 103570af302Sopenharmony_ci 104570af302Sopenharmony_cistatic inline u128 sub128(u128 a, u128 b) 105570af302Sopenharmony_ci{ 106570af302Sopenharmony_ci u128 r; 107570af302Sopenharmony_ci r.lo = a.lo - b.lo; 108570af302Sopenharmony_ci r.hi = a.hi - b.hi; 109570af302Sopenharmony_ci if (a.lo < b.lo) 110570af302Sopenharmony_ci r.hi--; 111570af302Sopenharmony_ci return r; 112570af302Sopenharmony_ci} 113570af302Sopenharmony_ci 114570af302Sopenharmony_ci/* a<<n, 0 <= n <= 127 */ 115570af302Sopenharmony_cistatic inline u128 lsh(u128 a, int n) 116570af302Sopenharmony_ci{ 117570af302Sopenharmony_ci if (n == 0) 118570af302Sopenharmony_ci return a; 119570af302Sopenharmony_ci if (n >= 64) { 120570af302Sopenharmony_ci a.hi = a.lo<<(n-64); 121570af302Sopenharmony_ci a.lo = 0; 122570af302Sopenharmony_ci } else { 123570af302Sopenharmony_ci a.hi = (a.hi<<n) | (a.lo>>(64-n)); 124570af302Sopenharmony_ci a.lo = a.lo<<n; 125570af302Sopenharmony_ci } 126570af302Sopenharmony_ci return a; 127570af302Sopenharmony_ci} 128570af302Sopenharmony_ci 129570af302Sopenharmony_ci/* a>>n, 0 <= n <= 127 */ 130570af302Sopenharmony_cistatic inline u128 rsh(u128 a, int n) 131570af302Sopenharmony_ci{ 132570af302Sopenharmony_ci if (n == 0) 133570af302Sopenharmony_ci return a; 134570af302Sopenharmony_ci if (n >= 64) { 135570af302Sopenharmony_ci a.lo = a.hi>>(n-64); 136570af302Sopenharmony_ci a.hi = 0; 137570af302Sopenharmony_ci } else { 138570af302Sopenharmony_ci a.lo = (a.lo>>n) | (a.hi<<(64-n)); 139570af302Sopenharmony_ci a.hi = a.hi>>n; 140570af302Sopenharmony_ci } 141570af302Sopenharmony_ci return a; 142570af302Sopenharmony_ci} 143570af302Sopenharmony_ci 144570af302Sopenharmony_ci/* returns a*b exactly. */ 145570af302Sopenharmony_cistatic inline u128 mul64_128(uint64_t a, uint64_t b) 146570af302Sopenharmony_ci{ 147570af302Sopenharmony_ci u128 r; 148570af302Sopenharmony_ci uint64_t ahi = a>>32; 149570af302Sopenharmony_ci uint64_t alo = a&0xffffffff; 150570af302Sopenharmony_ci uint64_t bhi = b>>32; 151570af302Sopenharmony_ci uint64_t blo = b&0xffffffff; 152570af302Sopenharmony_ci uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32); 153570af302Sopenharmony_ci uint64_t lo2 = (alo*blo)&0xffffffff; 154570af302Sopenharmony_ci r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32); 155570af302Sopenharmony_ci r.lo = (lo1<<32) + lo2; 156570af302Sopenharmony_ci return r; 157570af302Sopenharmony_ci} 158570af302Sopenharmony_ci 159570af302Sopenharmony_ci/* returns a*b*2^-128 - e, with error 0 <= e < 7. */ 160570af302Sopenharmony_cistatic inline u128 mul128(u128 a, u128 b) 161570af302Sopenharmony_ci{ 162570af302Sopenharmony_ci u128 hi = mul64_128(a.hi, b.hi); 163570af302Sopenharmony_ci uint64_t m1 = mul64(a.hi, b.lo); 164570af302Sopenharmony_ci uint64_t m2 = mul64(a.lo, b.hi); 165570af302Sopenharmony_ci return add64(add64(hi, m1), m2); 166570af302Sopenharmony_ci} 167570af302Sopenharmony_ci 168570af302Sopenharmony_ci/* returns a*b % 2^128. */ 169570af302Sopenharmony_cistatic inline u128 mul128_tail(u128 a, u128 b) 170570af302Sopenharmony_ci{ 171570af302Sopenharmony_ci u128 lo = mul64_128(a.lo, b.lo); 172570af302Sopenharmony_ci lo.hi += a.hi*b.lo + a.lo*b.hi; 173570af302Sopenharmony_ci return lo; 174570af302Sopenharmony_ci} 175570af302Sopenharmony_ci 176570af302Sopenharmony_ci 177570af302Sopenharmony_ci/* see sqrt.c for detailed comments. */ 178570af302Sopenharmony_ci 179570af302Sopenharmony_cilong double sqrtl(long double x) 180570af302Sopenharmony_ci{ 181570af302Sopenharmony_ci u128 ix, ml; 182570af302Sopenharmony_ci uint64_t top; 183570af302Sopenharmony_ci 184570af302Sopenharmony_ci ix = asu128(x); 185570af302Sopenharmony_ci top = ix.hi >> 48; 186570af302Sopenharmony_ci if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) { 187570af302Sopenharmony_ci /* x < 0x1p-16382 or inf or nan. */ 188570af302Sopenharmony_ci if (2*ix.hi == 0 && ix.lo == 0) 189570af302Sopenharmony_ci return x; 190570af302Sopenharmony_ci if (ix.hi == 0x7fff000000000000 && ix.lo == 0) 191570af302Sopenharmony_ci return x; 192570af302Sopenharmony_ci if (top >= 0x7fff) 193570af302Sopenharmony_ci return __math_invalidl(x); 194570af302Sopenharmony_ci /* x is subnormal, normalize it. */ 195570af302Sopenharmony_ci ix = asu128(x * 0x1p112); 196570af302Sopenharmony_ci top = ix.hi >> 48; 197570af302Sopenharmony_ci top -= 112; 198570af302Sopenharmony_ci } 199570af302Sopenharmony_ci 200570af302Sopenharmony_ci /* x = 4^e m; with int e and m in [1, 4) */ 201570af302Sopenharmony_ci int even = top & 1; 202570af302Sopenharmony_ci ml = lsh(ix, 15); 203570af302Sopenharmony_ci ml.hi |= 0x8000000000000000; 204570af302Sopenharmony_ci if (even) ml = rsh(ml, 1); 205570af302Sopenharmony_ci top = (top + 0x3fff) >> 1; 206570af302Sopenharmony_ci 207570af302Sopenharmony_ci /* r ~ 1/sqrt(m) */ 208570af302Sopenharmony_ci const uint64_t three = 0xc0000000; 209570af302Sopenharmony_ci uint64_t r, s, d, u, i; 210570af302Sopenharmony_ci i = (ix.hi >> 42) % 128; 211570af302Sopenharmony_ci r = (uint32_t)__rsqrt_tab[i] << 16; 212570af302Sopenharmony_ci /* |r sqrt(m) - 1| < 0x1p-8 */ 213570af302Sopenharmony_ci s = mul32(ml.hi>>32, r); 214570af302Sopenharmony_ci d = mul32(s, r); 215570af302Sopenharmony_ci u = three - d; 216570af302Sopenharmony_ci r = mul32(u, r) << 1; 217570af302Sopenharmony_ci /* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */ 218570af302Sopenharmony_ci r = r<<32; 219570af302Sopenharmony_ci s = mul64(ml.hi, r); 220570af302Sopenharmony_ci d = mul64(s, r); 221570af302Sopenharmony_ci u = (three<<32) - d; 222570af302Sopenharmony_ci r = mul64(u, r) << 1; 223570af302Sopenharmony_ci /* |r sqrt(m) - 1| < 0x1.a5p-31 */ 224570af302Sopenharmony_ci s = mul64(u, s) << 1; 225570af302Sopenharmony_ci d = mul64(s, r); 226570af302Sopenharmony_ci u = (three<<32) - d; 227570af302Sopenharmony_ci r = mul64(u, r) << 1; 228570af302Sopenharmony_ci /* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */ 229570af302Sopenharmony_ci 230570af302Sopenharmony_ci const u128 threel = {.hi=three<<32, .lo=0}; 231570af302Sopenharmony_ci u128 rl, sl, dl, ul; 232570af302Sopenharmony_ci rl.hi = r; 233570af302Sopenharmony_ci rl.lo = 0; 234570af302Sopenharmony_ci sl = mul128(ml, rl); 235570af302Sopenharmony_ci dl = mul128(sl, rl); 236570af302Sopenharmony_ci ul = sub128(threel, dl); 237570af302Sopenharmony_ci sl = mul128(ul, sl); /* repr: 3.125 */ 238570af302Sopenharmony_ci /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */ 239570af302Sopenharmony_ci sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1)); 240570af302Sopenharmony_ci /* s < sqrt(m) < s + 1 ULP + tiny */ 241570af302Sopenharmony_ci 242570af302Sopenharmony_ci long double y; 243570af302Sopenharmony_ci u128 d2, d1, d0; 244570af302Sopenharmony_ci d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl)); 245570af302Sopenharmony_ci d1 = sub128(sl, d0); 246570af302Sopenharmony_ci d2 = add128(add64(sl, 1), d1); 247570af302Sopenharmony_ci sl = add64(sl, d1.hi >> 63); 248570af302Sopenharmony_ci y = mkldbl(top, sl); 249570af302Sopenharmony_ci if (FENV_SUPPORT) { 250570af302Sopenharmony_ci /* handle rounding modes and inexact exception. */ 251570af302Sopenharmony_ci top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1; 252570af302Sopenharmony_ci top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48; 253570af302Sopenharmony_ci y += mkldbl(top, (u128){0}); 254570af302Sopenharmony_ci } 255570af302Sopenharmony_ci return y; 256570af302Sopenharmony_ci} 257570af302Sopenharmony_ci#else 258570af302Sopenharmony_ci#error unsupported long double format 259570af302Sopenharmony_ci#endif 260