162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* IEEE754 floating point arithmetic 362306a36Sopenharmony_ci * double precision square root 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci/* 662306a36Sopenharmony_ci * MIPS floating point support 762306a36Sopenharmony_ci * Copyright (C) 1994-2000 Algorithmics Ltd. 862306a36Sopenharmony_ci */ 962306a36Sopenharmony_ci 1062306a36Sopenharmony_ci#include "ieee754dp.h" 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_cistatic const unsigned int table[] = { 1362306a36Sopenharmony_ci 0, 1204, 3062, 5746, 9193, 13348, 18162, 23592, 1462306a36Sopenharmony_ci 29598, 36145, 43202, 50740, 58733, 67158, 75992, 1562306a36Sopenharmony_ci 85215, 83599, 71378, 60428, 50647, 41945, 34246, 1662306a36Sopenharmony_ci 27478, 21581, 16499, 12183, 8588, 5674, 3403, 1762306a36Sopenharmony_ci 1742, 661, 130 1862306a36Sopenharmony_ci}; 1962306a36Sopenharmony_ci 2062306a36Sopenharmony_ciunion ieee754dp ieee754dp_sqrt(union ieee754dp x) 2162306a36Sopenharmony_ci{ 2262306a36Sopenharmony_ci struct _ieee754_csr oldcsr; 2362306a36Sopenharmony_ci union ieee754dp y, z, t; 2462306a36Sopenharmony_ci unsigned int scalx, yh; 2562306a36Sopenharmony_ci COMPXDP; 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_ci EXPLODEXDP; 2862306a36Sopenharmony_ci ieee754_clearcx(); 2962306a36Sopenharmony_ci FLUSHXDP; 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci /* x == INF or NAN? */ 3262306a36Sopenharmony_ci switch (xc) { 3362306a36Sopenharmony_ci case IEEE754_CLASS_SNAN: 3462306a36Sopenharmony_ci return ieee754dp_nanxcpt(x); 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_ci case IEEE754_CLASS_QNAN: 3762306a36Sopenharmony_ci /* sqrt(Nan) = Nan */ 3862306a36Sopenharmony_ci return x; 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_ci case IEEE754_CLASS_ZERO: 4162306a36Sopenharmony_ci /* sqrt(0) = 0 */ 4262306a36Sopenharmony_ci return x; 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ci case IEEE754_CLASS_INF: 4562306a36Sopenharmony_ci if (xs) { 4662306a36Sopenharmony_ci /* sqrt(-Inf) = Nan */ 4762306a36Sopenharmony_ci ieee754_setcx(IEEE754_INVALID_OPERATION); 4862306a36Sopenharmony_ci return ieee754dp_indef(); 4962306a36Sopenharmony_ci } 5062306a36Sopenharmony_ci /* sqrt(+Inf) = Inf */ 5162306a36Sopenharmony_ci return x; 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci case IEEE754_CLASS_DNORM: 5462306a36Sopenharmony_ci DPDNORMX; 5562306a36Sopenharmony_ci fallthrough; 5662306a36Sopenharmony_ci case IEEE754_CLASS_NORM: 5762306a36Sopenharmony_ci if (xs) { 5862306a36Sopenharmony_ci /* sqrt(-x) = Nan */ 5962306a36Sopenharmony_ci ieee754_setcx(IEEE754_INVALID_OPERATION); 6062306a36Sopenharmony_ci return ieee754dp_indef(); 6162306a36Sopenharmony_ci } 6262306a36Sopenharmony_ci break; 6362306a36Sopenharmony_ci } 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci /* save old csr; switch off INX enable & flag; set RN rounding */ 6662306a36Sopenharmony_ci oldcsr = ieee754_csr; 6762306a36Sopenharmony_ci ieee754_csr.mx &= ~IEEE754_INEXACT; 6862306a36Sopenharmony_ci ieee754_csr.sx &= ~IEEE754_INEXACT; 6962306a36Sopenharmony_ci ieee754_csr.rm = FPU_CSR_RN; 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_ci /* adjust exponent to prevent overflow */ 7262306a36Sopenharmony_ci scalx = 0; 7362306a36Sopenharmony_ci if (xe > 512) { /* x > 2**-512? */ 7462306a36Sopenharmony_ci xe -= 512; /* x = x / 2**512 */ 7562306a36Sopenharmony_ci scalx += 256; 7662306a36Sopenharmony_ci } else if (xe < -512) { /* x < 2**-512? */ 7762306a36Sopenharmony_ci xe += 512; /* x = x * 2**512 */ 7862306a36Sopenharmony_ci scalx -= 256; 7962306a36Sopenharmony_ci } 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT); 8262306a36Sopenharmony_ci y = x; 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_ci /* magic initial approximation to almost 8 sig. bits */ 8562306a36Sopenharmony_ci yh = y.bits >> 32; 8662306a36Sopenharmony_ci yh = (yh >> 1) + 0x1ff80000; 8762306a36Sopenharmony_ci yh = yh - table[(yh >> 15) & 31]; 8862306a36Sopenharmony_ci y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff); 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci /* Heron's rule once with correction to improve to ~18 sig. bits */ 9162306a36Sopenharmony_ci /* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */ 9262306a36Sopenharmony_ci t = ieee754dp_div(x, y); 9362306a36Sopenharmony_ci y = ieee754dp_add(y, t); 9462306a36Sopenharmony_ci y.bits -= 0x0010000600000000LL; 9562306a36Sopenharmony_ci y.bits &= 0xffffffff00000000LL; 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_ci /* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */ 9862306a36Sopenharmony_ci /* t=y*y; z=t; pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */ 9962306a36Sopenharmony_ci t = ieee754dp_mul(y, y); 10062306a36Sopenharmony_ci z = t; 10162306a36Sopenharmony_ci t.bexp += 0x001; 10262306a36Sopenharmony_ci t = ieee754dp_add(t, z); 10362306a36Sopenharmony_ci z = ieee754dp_mul(ieee754dp_sub(x, z), y); 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_ci /* t=z/(t+x) ; pt[n0]+=0x00100000; y+=t; */ 10662306a36Sopenharmony_ci t = ieee754dp_div(z, ieee754dp_add(t, x)); 10762306a36Sopenharmony_ci t.bexp += 0x001; 10862306a36Sopenharmony_ci y = ieee754dp_add(y, t); 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci /* twiddle last bit to force y correctly rounded */ 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_ci /* set RZ, clear INEX flag */ 11362306a36Sopenharmony_ci ieee754_csr.rm = FPU_CSR_RZ; 11462306a36Sopenharmony_ci ieee754_csr.sx &= ~IEEE754_INEXACT; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci /* t=x/y; ...chopped quotient, possibly inexact */ 11762306a36Sopenharmony_ci t = ieee754dp_div(x, y); 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) { 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci if (!(ieee754_csr.sx & IEEE754_INEXACT)) 12262306a36Sopenharmony_ci /* t = t-ulp */ 12362306a36Sopenharmony_ci t.bits -= 1; 12462306a36Sopenharmony_ci 12562306a36Sopenharmony_ci /* add inexact to result status */ 12662306a36Sopenharmony_ci oldcsr.cx |= IEEE754_INEXACT; 12762306a36Sopenharmony_ci oldcsr.sx |= IEEE754_INEXACT; 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_ci switch (oldcsr.rm) { 13062306a36Sopenharmony_ci case FPU_CSR_RU: 13162306a36Sopenharmony_ci y.bits += 1; 13262306a36Sopenharmony_ci fallthrough; 13362306a36Sopenharmony_ci case FPU_CSR_RN: 13462306a36Sopenharmony_ci t.bits += 1; 13562306a36Sopenharmony_ci break; 13662306a36Sopenharmony_ci } 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci /* y=y+t; ...chopped sum */ 13962306a36Sopenharmony_ci y = ieee754dp_add(y, t); 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci /* adjust scalx for correctly rounded sqrt(x) */ 14262306a36Sopenharmony_ci scalx -= 1; 14362306a36Sopenharmony_ci } 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci /* py[n0]=py[n0]+scalx; ...scale back y */ 14662306a36Sopenharmony_ci y.bexp += scalx; 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_ci /* restore rounding mode, possibly set inexact */ 14962306a36Sopenharmony_ci ieee754_csr = oldcsr; 15062306a36Sopenharmony_ci 15162306a36Sopenharmony_ci return y; 15262306a36Sopenharmony_ci} 153