.section #gk110_builtin_code
// NOTE(review): every `sched` line below carries seven values and is followed
// by exactly seven instructions (labels and comments not counted). The groups
// run continuously through the whole section, crossing routine boundaries, and
// the very last group is padded with 0x00. The values are presumably
// per-instruction scheduling hints -- confirm against the assembler docs.
// Adding, removing or reordering a single instruction anywhere shifts every
// later group, so the sched lines must be revisited whenever code changes.

// DIV U32
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    22 / 14 * 8 bytes
//
gk110_div_u32:
   sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28
   // Initial estimate: z ($r2) = 1 << (bfind(b) ^ 0x1f), shift clamped
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b for the rest of the refinement
   cvt u32 $r1 neg u32 $r1
   // 5 refinement rounds, each: $r3 = -b * z; z = mulhi(z, $r3) + z
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Keep the dividend in $r3, then q ($r0) = mulhi(a, z)
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = b again (negating -b), modulus ($r1) = -b * q + a
   cvt u32 $r2 neg u32 $r1
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // Up to two corrections: while modulus >= b, modulus -= b and q += 1.
   // The second round is predicated on the first one having fired.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20
   $p0 add b32 $r0 $r0 0x1
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   ret

// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
gk110_div_s32:
   // $p2: dividend is negative (sign applied to the modulus at the end)
   // $p3: (divisor is negative) xor $p2 (sign applied to the result)
   set $p2 0x1 lt s32 $r0 0x0
   set $p3 0x1 lt s32 $r1 0x0 xor $p2
   sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28
   cvt s32 $r0 abs s32 $r0
   cvt s32 $r1 abs s32 $r1
   // From here on: same sequence as gk110_div_u32 on the absolute values
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   cvt u32 $r1 neg u32 $r1
   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   cvt u32 $r2 neg u32 $r1
   add $r1 (mul u32 $r1 u32 $r0) $r3
   sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Restore the signs recorded in $p3 (result) and $p2 (modulus)
   $p3 cvt s32 $r0 neg s32 $r0
   sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28
   $p2 cvt s32 $r1 neg s32 $r1
   ret

// RCP F64
//
// INPUT:   $r0d
// OUTPUT:  $r0d
// CLOBBER: $r2 - $r9, $p0
//
// The core of RCP and RSQ implementation is Newton-Raphson step, which is
// used to find successively better approximation from an imprecise initial
// value (single precision rcp in RCP and rsqrt64h in RSQ).
//
gk110_rcp_f64:
   // Step 1: classify input according to exponent and value, and calculate
   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
   // bit 52 (bit 20 of the upper half) and is 11 bits in length
   ext u32 $r2 $r1 0xb14
   add b32 $r3 $r2 0xffffffff
   joinat #rcp_rejoin
   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
   // mean that it's > 0x7fd in those cases when doing unsigned comparison
   set b32 $p0 0x1 gt u32 $r3 0x7fd
   // $r3: 0 for norms, 0x36 for denorms, -1 for others
   mov b32 $r3 0x0
   sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
   join (not $p0) nop
   // Process all special values: NaN, inf, denorm, 0
   mov b32 $r3 0xffffffff
   // A number is NaN if its abs value is greater than or unordered with inf
   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
   (not $p0) bra #rcp_inf_or_denorm_or_zero
   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
   // behavior is both seen on the CPU and the blob
   join or b32 $r1 $r1 0x80000
rcp_inf_or_denorm_or_zero:
   and b32 $r4 $r1 0x7ff00000
   // Other values with nonzero in exponent field should be inf
   set b32 $p0 0x1 eq s32 $r4 0x0
   sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
   $p0 bra #rcp_denorm_or_zero
   // +/-Inf -> +/-0
   xor b32 $r1 $r1 0x7ff00000
   join mov b32 $r0 0x0
rcp_denorm_or_zero:
   set $p0 0x1 gtu f64 abs $r0d 0x0
   $p0 bra #rcp_denorm
   // +/-0 -> +/-Inf
   join or b32 $r1 $r1 0x7ff00000
rcp_denorm:
   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
   mul rn f64 $r0d $r0d 0x4350000000000000
   sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
   join mov b32 $r3 0x36
rcp_rejoin:
   // All numbers with -1 in $r3 have their result ready in $r0d, return them
   // others need further calculation
   set b32 $p0 0x1 lt s32 $r3 0x0
   $p0 bra #rcp_end
   // Step 2: Before the real calculation goes on, renormalize the values to
   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
   // result in $r6d. The exponent will be recovered later.
   // ($r2 is re-extracted here because denorms were rescaled in rcp_denorm.)
   ext u32 $r2 $r1 0xb14
   and b32 $r7 $r1 0x800fffff
   add b32 $r7 $r7 0x3ff00000
   mov b32 $r6 $r0
   sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
   // Step 3: Convert new value to float (no overflow will occur due to step
   // 2), calculate rcp and do newton-raphson step once
   cvt rz f32 $r5 f64 $r6d
   rcp f32 $r4 $r5
   // 0xbf800000: -1.0f, so the first fma yields rcp * x - 1 in $r5
   mov b32 $r0 0xbf800000
   fma rn f32 $r5 $r4 $r5 $r0
   fma rn f32 $r0 neg $r4 $r5 $r4
   // Step 4: convert result $r0 back to double, do newton-raphson steps
   cvt f64 $r0d f32 $r0
   // $r6d = -x (renormalized input, negated), $r8d = 1.0
   cvt f64 $r6d f64 neg $r6d
   sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
   cvt f64 $r8d f32 0x3f800000
   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
   // The formula used here (and above) is:
   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
   // The following code uses 2 FMAs for each step, and it basically
   // looks like:
   //     tmp = -src * RCP_{n} + 1
   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
   fma rn f64 $r4d $r6d $r0d $r8d
   fma rn f64 $r0d $r0d $r4d $r0d
   // Step 5: Exponent recovery and final processing
   // The exponent is recovered by adding what we added to the exponent.
   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
   //     rcp(x) = c * rcp(cx)
   // The delta in exponent comes from two sources:
   //   1) The renormalization in step 2. The delta is:
   //      0x3ff - $r2
   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
   //      in $r3
   // These 2 sources are calculated in the first two lines below, and then
   // added to the exponent extracted from the result above.
   // Note that after processing, the new exponent may be >= 0x7ff (inf)
   // or <= 0 (denorm). Those cases will be handled respectively below
   subr b32 $r2 $r2 0x3ff
   add b32 $r4 $r2 $r3
   ext u32 $r3 $r1 0xb14
   // New exponent in $r3
   add b32 $r3 $r3 $r4
   add b32 $r2 $r3 0xffffffff
   sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
   // (same logic as in step 1)
   set b32 $p0 0x1 lt u32 $r2 0x7fe
   (not $p0) bra #rcp_result_inf_or_denorm
   // Norms: convert exponents back and return
   // (the delta in $r4 is shifted up to the exponent field, bit 20 of $r1)
   shl b32 $r4 $r4 clamp 0x14
   add b32 $r1 $r4 $r1
   bra #rcp_end
rcp_result_inf_or_denorm:
   // New exponent >= 0x7ff means that result is inf
   set b32 $p0 0x1 ge s32 $r3 0x7ff
   (not $p0) bra #rcp_result_denorm
   sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
   // Infinity
   // (keep only the sign bit, then set the exponent field to 0x7ff)
   and b32 $r1 $r1 0x80000000
   mov b32 $r0 0x0
   add b32 $r1 $r1 0x7ff00000
   bra #rcp_end
rcp_result_denorm:
   // Denorm result comes from huge input. The greatest possible fp64, i.e.
   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
   // normal value. Other rcp result should be greater than that. If we
   // set the exponent field to 1, we can recover the result by multiplying
   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
   // the logic here.
   set b32 $p0 0x1 ne u32 $r3 0x0
   and b32 $r1 $r1 0x800fffff
   // 0x3e800000: 1/4
   $p0 cvt f64 $r6d f32 0x3e800000
   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
   // 0x3f000000: 1/2
   (not $p0) cvt f64 $r6d f32 0x3f000000
   // Exponent field := 1 (it was cleared by the and above)
   add b32 $r1 $r1 0x00100000
   mul rn f64 $r0d $r0d $r6d
rcp_end:
   ret

// RSQ F64
//
// INPUT:   $r0d
// OUTPUT:  $r0d
// CLOBBER: $r2 - $r9, $p0 - $p1
//
gk110_rsq_f64:
   // Before getting initial result rsqrt64h, two special cases should be
   // handled first.
   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
   //    as NaN in rsqrt64h
   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
   $p0 or b32 $r1 $r1 0x00080000
   // $r2 = upper half without the sign bit (low half is or'ed in below)
   and b32 $r2 $r1 0x7fffffff
   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
   // 2. denorms and small normal values: using their original value will
   //    lose precision either at rsqrt64h or the first step in newton-raphson
   //    steps below. Take 2 as a threshold in exponent field, and multiply
   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
   //    to recover in the end)
   ext u32 $r3 $r1 0xb14
   set b32 $p1 0x1 le u32 $r3 0x2
   or b32 $r2 $r0 $r2
   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
   rsqrt64h f32 $r5 $r1
   // rsqrt64h will give correct result for 0/inf/nan, the following logic
   // checks whether the input is one of those (exponent is 0x7ff or all 0
   // except for the sign bit)
   set b32 $r6 ne u32 $r3 0x7ff
   and b32 $r2 $r2 $r6
   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
   set b32 $p0 0x1 ne u32 $r2 0x0
   $p0 bra #rsq_norm
   // For 0/inf/nan, make sure the sign bit agrees with input and return
   and b32 $r1 $r1 0x80000000
   mov b32 $r0 0x0
   or b32 $r1 $r1 $r5
   ret
rsq_norm:
   // For others, do 4 Newton-Raphson steps with the formula:
   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
   // In the code below, each step is written as:
   //     tmp1 = 0.5 * x * RSQ_{n}
   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
   // $r4d is the running estimate: upper half $r5 comes from rsqrt64h,
   // lower half $r4 is zeroed here.
   mov b32 $r4 0x0
   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
   // 0x3f000000: 1/2
   cvt f64 $r8d f32 0x3f000000
   // $r2d = 0.5 * x, reused by every step
   mul rn f64 $r2d $r0d $r8d
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   fma rn f64 $r4d $r4d $r6d $r4d
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
   fma rn f64 $r4d $r4d $r6d $r4d
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   fma rn f64 $r4d $r4d $r6d $r4d
   mul rn f64 $r0d $r2d $r4d
   fma rn f64 $r6d neg $r4d $r0d $r8d
   fma rn f64 $r4d $r4d $r6d $r4d
   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
   // Multiply 2^27 to result for small inputs to recover
   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
   // Move the result from $r4d to the output register pair $r0d
   mov b32 $r1 $r5
   mov b32 $r0 $r4
   ret

// Entry-point table: one 64-bit entry per builtin, in the order
// div_u32, div_s32, rcp_f64, rsq_f64.
.section #gk110_builtin_offsets
.b64 #gk110_div_u32
.b64 #gk110_div_s32
.b64 #gk110_rcp_f64
.b64 #gk110_rsq_f64