1570af302Sopenharmony_ci#include <stdint.h> 2570af302Sopenharmony_ci#include <math.h> 3570af302Sopenharmony_ci#include "libm.h" 4570af302Sopenharmony_ci#include "sqrt_data.h" 5570af302Sopenharmony_ci 6570af302Sopenharmony_ci#define FENV_SUPPORT 1 7570af302Sopenharmony_ci 8570af302Sopenharmony_ci/* returns a*b*2^-32 - e, with error 0 <= e < 1. */ 9570af302Sopenharmony_cistatic inline uint32_t mul32(uint32_t a, uint32_t b) 10570af302Sopenharmony_ci{ 11570af302Sopenharmony_ci return (uint64_t)a*b >> 32; 12570af302Sopenharmony_ci} 13570af302Sopenharmony_ci 14570af302Sopenharmony_ci/* returns a*b*2^-64 - e, with error 0 <= e < 3. */ 15570af302Sopenharmony_cistatic inline uint64_t mul64(uint64_t a, uint64_t b) 16570af302Sopenharmony_ci{ 17570af302Sopenharmony_ci uint64_t ahi = a>>32; 18570af302Sopenharmony_ci uint64_t alo = a&0xffffffff; 19570af302Sopenharmony_ci uint64_t bhi = b>>32; 20570af302Sopenharmony_ci uint64_t blo = b&0xffffffff; 21570af302Sopenharmony_ci return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); 22570af302Sopenharmony_ci} 23570af302Sopenharmony_ci 24570af302Sopenharmony_cidouble sqrt(double x) 25570af302Sopenharmony_ci{ 26570af302Sopenharmony_ci uint64_t ix, top, m; 27570af302Sopenharmony_ci 28570af302Sopenharmony_ci /* special case handling. */ 29570af302Sopenharmony_ci ix = asuint64(x); 30570af302Sopenharmony_ci top = ix >> 52; 31570af302Sopenharmony_ci if (predict_false(top - 0x001 >= 0x7ff - 0x001)) { 32570af302Sopenharmony_ci /* x < 0x1p-1022 or inf or nan. */ 33570af302Sopenharmony_ci if (ix * 2 == 0) 34570af302Sopenharmony_ci return x; 35570af302Sopenharmony_ci if (ix == 0x7ff0000000000000) 36570af302Sopenharmony_ci return x; 37570af302Sopenharmony_ci if (ix > 0x7ff0000000000000) 38570af302Sopenharmony_ci return __math_invalid(x); 39570af302Sopenharmony_ci /* x is subnormal, normalize it. */ 40570af302Sopenharmony_ci ix = asuint64(x * 0x1p52); 41570af302Sopenharmony_ci top = ix >> 52; 42570af302Sopenharmony_ci top -= 52; 43570af302Sopenharmony_ci } 44570af302Sopenharmony_ci 45570af302Sopenharmony_ci /* argument reduction: 46570af302Sopenharmony_ci x = 4^e m; with integer e, and m in [1, 4) 47570af302Sopenharmony_ci m: fixed point representation [2.62] 48570af302Sopenharmony_ci 2^e is the exponent part of the result. */ 49570af302Sopenharmony_ci int even = top & 1; 50570af302Sopenharmony_ci m = (ix << 11) | 0x8000000000000000; 51570af302Sopenharmony_ci if (even) m >>= 1; 52570af302Sopenharmony_ci top = (top + 0x3ff) >> 1; 53570af302Sopenharmony_ci 54570af302Sopenharmony_ci /* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4) 55570af302Sopenharmony_ci 56570af302Sopenharmony_ci initial estimate: 57570af302Sopenharmony_ci 7bit table lookup (1bit exponent and 6bit significand). 58570af302Sopenharmony_ci 59570af302Sopenharmony_ci iterative approximation: 60570af302Sopenharmony_ci using 2 goldschmidt iterations with 32bit int arithmetics 61570af302Sopenharmony_ci and a final iteration with 64bit int arithmetics. 62570af302Sopenharmony_ci 63570af302Sopenharmony_ci details: 64570af302Sopenharmony_ci 65570af302Sopenharmony_ci the relative error (e = r0 sqrt(m)-1) of a linear estimate 66570af302Sopenharmony_ci (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best, 67570af302Sopenharmony_ci a table lookup is faster and needs one less iteration 68570af302Sopenharmony_ci 6 bit lookup table (128b) gives |e| < 0x1.f9p-8 69570af302Sopenharmony_ci 7 bit lookup table (256b) gives |e| < 0x1.fdp-9 70570af302Sopenharmony_ci for single and double prec 6bit is enough but for quad 71570af302Sopenharmony_ci prec 7bit is needed (or modified iterations). to avoid 72570af302Sopenharmony_ci one more iteration >=13bit table would be needed (16k). 73570af302Sopenharmony_ci 74570af302Sopenharmony_ci a newton-raphson iteration for r is 75570af302Sopenharmony_ci w = r*r 76570af302Sopenharmony_ci u = 3 - m*w 77570af302Sopenharmony_ci r = r*u/2 78570af302Sopenharmony_ci can use a goldschmidt iteration for s at the end or 79570af302Sopenharmony_ci s = m*r 80570af302Sopenharmony_ci 81570af302Sopenharmony_ci first goldschmidt iteration is 82570af302Sopenharmony_ci s = m*r 83570af302Sopenharmony_ci u = 3 - s*r 84570af302Sopenharmony_ci r = r*u/2 85570af302Sopenharmony_ci s = s*u/2 86570af302Sopenharmony_ci next goldschmidt iteration is 87570af302Sopenharmony_ci u = 3 - s*r 88570af302Sopenharmony_ci r = r*u/2 89570af302Sopenharmony_ci s = s*u/2 90570af302Sopenharmony_ci and at the end r is not computed only s. 91570af302Sopenharmony_ci 92570af302Sopenharmony_ci they use the same amount of operations and converge at the 93570af302Sopenharmony_ci same quadratic rate, i.e. if 94570af302Sopenharmony_ci r1 sqrt(m) - 1 = e, then 95570af302Sopenharmony_ci r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3 96570af302Sopenharmony_ci the advantage of goldschmidt is that the mul for s and r 97570af302Sopenharmony_ci are independent (computed in parallel), however it is not 98570af302Sopenharmony_ci "self synchronizing": it only uses the input m in the 99570af302Sopenharmony_ci first iteration so rounding errors accumulate. at the end 100570af302Sopenharmony_ci or when switching to larger precision arithmetics rounding 101570af302Sopenharmony_ci errors dominate so the first iteration should be used. 102570af302Sopenharmony_ci 103570af302Sopenharmony_ci the fixed point representations are 104570af302Sopenharmony_ci m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30 105570af302Sopenharmony_ci and after switching to 64 bit 106570af302Sopenharmony_ci m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62 */ 107570af302Sopenharmony_ci 108570af302Sopenharmony_ci static const uint64_t three = 0xc0000000; 109570af302Sopenharmony_ci uint64_t r, s, d, u, i; 110570af302Sopenharmony_ci 111570af302Sopenharmony_ci i = (ix >> 46) % 128; 112570af302Sopenharmony_ci r = (uint32_t)__rsqrt_tab[i] << 16; 113570af302Sopenharmony_ci /* |r sqrt(m) - 1| < 0x1.fdp-9 */ 114570af302Sopenharmony_ci s = mul32(m>>32, r); 115570af302Sopenharmony_ci /* |s/sqrt(m) - 1| < 0x1.fdp-9 */ 116570af302Sopenharmony_ci d = mul32(s, r); 117570af302Sopenharmony_ci u = three - d; 118570af302Sopenharmony_ci r = mul32(r, u) << 1; 119570af302Sopenharmony_ci /* |r sqrt(m) - 1| < 0x1.7bp-16 */ 120570af302Sopenharmony_ci s = mul32(s, u) << 1; 121570af302Sopenharmony_ci /* |s/sqrt(m) - 1| < 0x1.7bp-16 */ 122570af302Sopenharmony_ci d = mul32(s, r); 123570af302Sopenharmony_ci u = three - d; 124570af302Sopenharmony_ci r = mul32(r, u) << 1; 125570af302Sopenharmony_ci /* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */ 126570af302Sopenharmony_ci r = r << 32; 127570af302Sopenharmony_ci s = mul64(m, r); 128570af302Sopenharmony_ci d = mul64(s, r); 129570af302Sopenharmony_ci u = (three<<32) - d; 130570af302Sopenharmony_ci s = mul64(s, u); /* repr: 3.61 */ 131570af302Sopenharmony_ci /* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */ 132570af302Sopenharmony_ci s = (s - 2) >> 9; /* repr: 12.52 */ 133570af302Sopenharmony_ci /* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */ 134570af302Sopenharmony_ci 135570af302Sopenharmony_ci /* s < sqrt(m) < s + 0x1.09p-52, 136570af302Sopenharmony_ci compute nearest rounded result: 137570af302Sopenharmony_ci the nearest result to 52 bits is either s or s+0x1p-52, 138570af302Sopenharmony_ci we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m. */ 139570af302Sopenharmony_ci uint64_t d0, d1, d2; 140570af302Sopenharmony_ci double y, t; 141570af302Sopenharmony_ci d0 = (m << 42) - s*s; 142570af302Sopenharmony_ci d1 = s - d0; 143570af302Sopenharmony_ci d2 = d1 + s + 1; 144570af302Sopenharmony_ci s += d1 >> 63; 145570af302Sopenharmony_ci s &= 0x000fffffffffffff; 146570af302Sopenharmony_ci s |= top << 52; 147570af302Sopenharmony_ci y = asdouble(s); 148570af302Sopenharmony_ci if (FENV_SUPPORT) { 149570af302Sopenharmony_ci /* handle rounding modes and inexact exception: 150570af302Sopenharmony_ci only (s+1)^2 == 2^42 m case is exact otherwise 151570af302Sopenharmony_ci add a tiny value to cause the fenv effects. */ 152570af302Sopenharmony_ci uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000; 153570af302Sopenharmony_ci tiny |= (d1^d2) & 0x8000000000000000; 154570af302Sopenharmony_ci t = asdouble(tiny); 155570af302Sopenharmony_ci y = eval_as_double(y + t); 156570af302Sopenharmony_ci } 157570af302Sopenharmony_ci return y; 158570af302Sopenharmony_ci} 159