1570af302Sopenharmony_ci#include <stdint.h>
2570af302Sopenharmony_ci#include <math.h>
3570af302Sopenharmony_ci#include <float.h>
4570af302Sopenharmony_ci#include "libm.h"
5570af302Sopenharmony_ci
6570af302Sopenharmony_ci#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
7570af302Sopenharmony_cilong double sqrtl(long double x)
8570af302Sopenharmony_ci{
9570af302Sopenharmony_ci	return sqrt(x);
10570af302Sopenharmony_ci}
11570af302Sopenharmony_ci#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384
12570af302Sopenharmony_ci#include "sqrt_data.h"
13570af302Sopenharmony_ci
14570af302Sopenharmony_ci#define FENV_SUPPORT 1
15570af302Sopenharmony_ci
16570af302Sopenharmony_citypedef struct {
17570af302Sopenharmony_ci	uint64_t hi;
18570af302Sopenharmony_ci	uint64_t lo;
19570af302Sopenharmony_ci} u128;
20570af302Sopenharmony_ci
21570af302Sopenharmony_ci/* top: 16 bit sign+exponent, x: significand.  */
22570af302Sopenharmony_cistatic inline long double mkldbl(uint64_t top, u128 x)
23570af302Sopenharmony_ci{
24570af302Sopenharmony_ci	union ldshape u;
25570af302Sopenharmony_ci#if LDBL_MANT_DIG == 113
26570af302Sopenharmony_ci	u.i2.hi = x.hi;
27570af302Sopenharmony_ci	u.i2.lo = x.lo;
28570af302Sopenharmony_ci	u.i2.hi &= 0x0000ffffffffffff;
29570af302Sopenharmony_ci	u.i2.hi |= top << 48;
30570af302Sopenharmony_ci#elif LDBL_MANT_DIG == 64
31570af302Sopenharmony_ci	u.i.se = top;
32570af302Sopenharmony_ci	u.i.m = x.lo;
33570af302Sopenharmony_ci	/* force the top bit on non-zero (and non-subnormal) results.  */
34570af302Sopenharmony_ci	if (top & 0x7fff)
35570af302Sopenharmony_ci		u.i.m |= 0x8000000000000000;
36570af302Sopenharmony_ci#endif
37570af302Sopenharmony_ci	return u.f;
38570af302Sopenharmony_ci}
39570af302Sopenharmony_ci
40570af302Sopenharmony_ci/* return: top 16 bit is sign+exp and following bits are the significand.  */
41570af302Sopenharmony_cistatic inline u128 asu128(long double x)
42570af302Sopenharmony_ci{
43570af302Sopenharmony_ci	union ldshape u = {.f=x};
44570af302Sopenharmony_ci	u128 r;
45570af302Sopenharmony_ci#if LDBL_MANT_DIG == 113
46570af302Sopenharmony_ci	r.hi = u.i2.hi;
47570af302Sopenharmony_ci	r.lo = u.i2.lo;
48570af302Sopenharmony_ci#elif LDBL_MANT_DIG == 64
49570af302Sopenharmony_ci	r.lo = u.i.m<<49;
50570af302Sopenharmony_ci	/* ignore the top bit: pseudo numbers are not handled. */
51570af302Sopenharmony_ci	r.hi = u.i.m>>15;
52570af302Sopenharmony_ci	r.hi &= 0x0000ffffffffffff;
53570af302Sopenharmony_ci	r.hi |= (uint64_t)u.i.se << 48;
54570af302Sopenharmony_ci#endif
55570af302Sopenharmony_ci	return r;
56570af302Sopenharmony_ci}
57570af302Sopenharmony_ci
58570af302Sopenharmony_ci/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
59570af302Sopenharmony_cistatic inline uint32_t mul32(uint32_t a, uint32_t b)
60570af302Sopenharmony_ci{
61570af302Sopenharmony_ci	return (uint64_t)a*b >> 32;
62570af302Sopenharmony_ci}
63570af302Sopenharmony_ci
64570af302Sopenharmony_ci/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
65570af302Sopenharmony_cistatic inline uint64_t mul64(uint64_t a, uint64_t b)
66570af302Sopenharmony_ci{
67570af302Sopenharmony_ci	uint64_t ahi = a>>32;
68570af302Sopenharmony_ci	uint64_t alo = a&0xffffffff;
69570af302Sopenharmony_ci	uint64_t bhi = b>>32;
70570af302Sopenharmony_ci	uint64_t blo = b&0xffffffff;
71570af302Sopenharmony_ci	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
72570af302Sopenharmony_ci}
73570af302Sopenharmony_ci
74570af302Sopenharmony_cistatic inline u128 add64(u128 a, uint64_t b)
75570af302Sopenharmony_ci{
76570af302Sopenharmony_ci	u128 r;
77570af302Sopenharmony_ci	r.lo = a.lo + b;
78570af302Sopenharmony_ci	r.hi = a.hi;
79570af302Sopenharmony_ci	if (r.lo < a.lo)
80570af302Sopenharmony_ci		r.hi++;
81570af302Sopenharmony_ci	return r;
82570af302Sopenharmony_ci}
83570af302Sopenharmony_ci
84570af302Sopenharmony_cistatic inline u128 add128(u128 a, u128 b)
85570af302Sopenharmony_ci{
86570af302Sopenharmony_ci	u128 r;
87570af302Sopenharmony_ci	r.lo = a.lo + b.lo;
88570af302Sopenharmony_ci	r.hi = a.hi + b.hi;
89570af302Sopenharmony_ci	if (r.lo < a.lo)
90570af302Sopenharmony_ci		r.hi++;
91570af302Sopenharmony_ci	return r;
92570af302Sopenharmony_ci}
93570af302Sopenharmony_ci
94570af302Sopenharmony_cistatic inline u128 sub64(u128 a, uint64_t b)
95570af302Sopenharmony_ci{
96570af302Sopenharmony_ci	u128 r;
97570af302Sopenharmony_ci	r.lo = a.lo - b;
98570af302Sopenharmony_ci	r.hi = a.hi;
99570af302Sopenharmony_ci	if (a.lo < b)
100570af302Sopenharmony_ci		r.hi--;
101570af302Sopenharmony_ci	return r;
102570af302Sopenharmony_ci}
103570af302Sopenharmony_ci
104570af302Sopenharmony_cistatic inline u128 sub128(u128 a, u128 b)
105570af302Sopenharmony_ci{
106570af302Sopenharmony_ci	u128 r;
107570af302Sopenharmony_ci	r.lo = a.lo - b.lo;
108570af302Sopenharmony_ci	r.hi = a.hi - b.hi;
109570af302Sopenharmony_ci	if (a.lo < b.lo)
110570af302Sopenharmony_ci		r.hi--;
111570af302Sopenharmony_ci	return r;
112570af302Sopenharmony_ci}
113570af302Sopenharmony_ci
114570af302Sopenharmony_ci/* a<<n, 0 <= n <= 127 */
115570af302Sopenharmony_cistatic inline u128 lsh(u128 a, int n)
116570af302Sopenharmony_ci{
117570af302Sopenharmony_ci	if (n == 0)
118570af302Sopenharmony_ci		return a;
119570af302Sopenharmony_ci	if (n >= 64) {
120570af302Sopenharmony_ci		a.hi = a.lo<<(n-64);
121570af302Sopenharmony_ci		a.lo = 0;
122570af302Sopenharmony_ci	} else {
123570af302Sopenharmony_ci		a.hi = (a.hi<<n) | (a.lo>>(64-n));
124570af302Sopenharmony_ci		a.lo = a.lo<<n;
125570af302Sopenharmony_ci	}
126570af302Sopenharmony_ci	return a;
127570af302Sopenharmony_ci}
128570af302Sopenharmony_ci
129570af302Sopenharmony_ci/* a>>n, 0 <= n <= 127 */
130570af302Sopenharmony_cistatic inline u128 rsh(u128 a, int n)
131570af302Sopenharmony_ci{
132570af302Sopenharmony_ci	if (n == 0)
133570af302Sopenharmony_ci		return a;
134570af302Sopenharmony_ci	if (n >= 64) {
135570af302Sopenharmony_ci		a.lo = a.hi>>(n-64);
136570af302Sopenharmony_ci		a.hi = 0;
137570af302Sopenharmony_ci	} else {
138570af302Sopenharmony_ci		a.lo = (a.lo>>n) | (a.hi<<(64-n));
139570af302Sopenharmony_ci		a.hi = a.hi>>n;
140570af302Sopenharmony_ci	}
141570af302Sopenharmony_ci	return a;
142570af302Sopenharmony_ci}
143570af302Sopenharmony_ci
144570af302Sopenharmony_ci/* returns a*b exactly.  */
145570af302Sopenharmony_cistatic inline u128 mul64_128(uint64_t a, uint64_t b)
146570af302Sopenharmony_ci{
147570af302Sopenharmony_ci	u128 r;
148570af302Sopenharmony_ci	uint64_t ahi = a>>32;
149570af302Sopenharmony_ci	uint64_t alo = a&0xffffffff;
150570af302Sopenharmony_ci	uint64_t bhi = b>>32;
151570af302Sopenharmony_ci	uint64_t blo = b&0xffffffff;
152570af302Sopenharmony_ci	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32);
153570af302Sopenharmony_ci	uint64_t lo2 = (alo*blo)&0xffffffff;
154570af302Sopenharmony_ci	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32);
155570af302Sopenharmony_ci	r.lo = (lo1<<32) + lo2;
156570af302Sopenharmony_ci	return r;
157570af302Sopenharmony_ci}
158570af302Sopenharmony_ci
159570af302Sopenharmony_ci/* returns a*b*2^-128 - e, with error 0 <= e < 7.  */
160570af302Sopenharmony_cistatic inline u128 mul128(u128 a, u128 b)
161570af302Sopenharmony_ci{
162570af302Sopenharmony_ci	u128 hi = mul64_128(a.hi, b.hi);
163570af302Sopenharmony_ci	uint64_t m1 = mul64(a.hi, b.lo);
164570af302Sopenharmony_ci	uint64_t m2 = mul64(a.lo, b.hi);
165570af302Sopenharmony_ci	return add64(add64(hi, m1), m2);
166570af302Sopenharmony_ci}
167570af302Sopenharmony_ci
168570af302Sopenharmony_ci/* returns a*b % 2^128.  */
169570af302Sopenharmony_cistatic inline u128 mul128_tail(u128 a, u128 b)
170570af302Sopenharmony_ci{
171570af302Sopenharmony_ci	u128 lo = mul64_128(a.lo, b.lo);
172570af302Sopenharmony_ci	lo.hi += a.hi*b.lo + a.lo*b.hi;
173570af302Sopenharmony_ci	return lo;
174570af302Sopenharmony_ci}
175570af302Sopenharmony_ci
176570af302Sopenharmony_ci
177570af302Sopenharmony_ci/* see sqrt.c for detailed comments.  */
178570af302Sopenharmony_ci
179570af302Sopenharmony_cilong double sqrtl(long double x)
180570af302Sopenharmony_ci{
181570af302Sopenharmony_ci	u128 ix, ml;
182570af302Sopenharmony_ci	uint64_t top;
183570af302Sopenharmony_ci
184570af302Sopenharmony_ci	ix = asu128(x);
185570af302Sopenharmony_ci	top = ix.hi >> 48;
186570af302Sopenharmony_ci	if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) {
187570af302Sopenharmony_ci		/* x < 0x1p-16382 or inf or nan.  */
188570af302Sopenharmony_ci		if (2*ix.hi == 0 && ix.lo == 0)
189570af302Sopenharmony_ci			return x;
190570af302Sopenharmony_ci		if (ix.hi == 0x7fff000000000000 && ix.lo == 0)
191570af302Sopenharmony_ci			return x;
192570af302Sopenharmony_ci		if (top >= 0x7fff)
193570af302Sopenharmony_ci			return __math_invalidl(x);
194570af302Sopenharmony_ci		/* x is subnormal, normalize it.  */
195570af302Sopenharmony_ci		ix = asu128(x * 0x1p112);
196570af302Sopenharmony_ci		top = ix.hi >> 48;
197570af302Sopenharmony_ci		top -= 112;
198570af302Sopenharmony_ci	}
199570af302Sopenharmony_ci
200570af302Sopenharmony_ci	/* x = 4^e m; with int e and m in [1, 4) */
201570af302Sopenharmony_ci	int even = top & 1;
202570af302Sopenharmony_ci	ml = lsh(ix, 15);
203570af302Sopenharmony_ci	ml.hi |= 0x8000000000000000;
204570af302Sopenharmony_ci	if (even) ml = rsh(ml, 1);
205570af302Sopenharmony_ci	top = (top + 0x3fff) >> 1;
206570af302Sopenharmony_ci
207570af302Sopenharmony_ci	/* r ~ 1/sqrt(m) */
208570af302Sopenharmony_ci	const uint64_t three = 0xc0000000;
209570af302Sopenharmony_ci	uint64_t r, s, d, u, i;
210570af302Sopenharmony_ci	i = (ix.hi >> 42) % 128;
211570af302Sopenharmony_ci	r = (uint32_t)__rsqrt_tab[i] << 16;
212570af302Sopenharmony_ci	/* |r sqrt(m) - 1| < 0x1p-8 */
213570af302Sopenharmony_ci	s = mul32(ml.hi>>32, r);
214570af302Sopenharmony_ci	d = mul32(s, r);
215570af302Sopenharmony_ci	u = three - d;
216570af302Sopenharmony_ci	r = mul32(u, r) << 1;
217570af302Sopenharmony_ci	/* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */
218570af302Sopenharmony_ci	r = r<<32;
219570af302Sopenharmony_ci	s = mul64(ml.hi, r);
220570af302Sopenharmony_ci	d = mul64(s, r);
221570af302Sopenharmony_ci	u = (three<<32) - d;
222570af302Sopenharmony_ci	r = mul64(u, r) << 1;
223570af302Sopenharmony_ci	/* |r sqrt(m) - 1| < 0x1.a5p-31 */
224570af302Sopenharmony_ci	s = mul64(u, s) << 1;
225570af302Sopenharmony_ci	d = mul64(s, r);
226570af302Sopenharmony_ci	u = (three<<32) - d;
227570af302Sopenharmony_ci	r = mul64(u, r) << 1;
228570af302Sopenharmony_ci	/* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */
229570af302Sopenharmony_ci
230570af302Sopenharmony_ci	const u128 threel = {.hi=three<<32, .lo=0};
231570af302Sopenharmony_ci	u128 rl, sl, dl, ul;
232570af302Sopenharmony_ci	rl.hi = r;
233570af302Sopenharmony_ci	rl.lo = 0;
234570af302Sopenharmony_ci	sl = mul128(ml, rl);
235570af302Sopenharmony_ci	dl = mul128(sl, rl);
236570af302Sopenharmony_ci	ul = sub128(threel, dl);
237570af302Sopenharmony_ci	sl = mul128(ul, sl); /* repr: 3.125 */
238570af302Sopenharmony_ci	/* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
239570af302Sopenharmony_ci	sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1));
240570af302Sopenharmony_ci	/* s < sqrt(m) < s + 1 ULP + tiny */
241570af302Sopenharmony_ci
242570af302Sopenharmony_ci	long double y;
243570af302Sopenharmony_ci	u128 d2, d1, d0;
244570af302Sopenharmony_ci	d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl));
245570af302Sopenharmony_ci	d1 = sub128(sl, d0);
246570af302Sopenharmony_ci	d2 = add128(add64(sl, 1), d1);
247570af302Sopenharmony_ci	sl = add64(sl, d1.hi >> 63);
248570af302Sopenharmony_ci	y = mkldbl(top, sl);
249570af302Sopenharmony_ci	if (FENV_SUPPORT) {
250570af302Sopenharmony_ci		/* handle rounding modes and inexact exception.  */
251570af302Sopenharmony_ci		top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1;
252570af302Sopenharmony_ci		top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48;
253570af302Sopenharmony_ci		y += mkldbl(top, (u128){0});
254570af302Sopenharmony_ci	}
255570af302Sopenharmony_ci	return y;
256570af302Sopenharmony_ci}
257570af302Sopenharmony_ci#else
258570af302Sopenharmony_ci#error unsupported long double format
259570af302Sopenharmony_ci#endif
260