1//! Defines rounding schemes for floating-point numbers. 2 3#![doc(hidden)] 4 5use crate::extended_float::ExtendedFloat; 6use crate::mask::{lower_n_halfway, lower_n_mask}; 7use crate::num::Float; 8 9// ROUNDING 10// -------- 11 12/// Round an extended-precision float to the nearest machine float. 13/// 14/// Shifts the significant digits into place, adjusts the exponent, 15/// so it can be easily converted to a native float. 16#[cfg_attr(not(feature = "compact"), inline)] 17pub fn round<F, Cb>(fp: &mut ExtendedFloat, cb: Cb) 18where 19 F: Float, 20 Cb: Fn(&mut ExtendedFloat, i32), 21{ 22 let fp_inf = ExtendedFloat { 23 mant: 0, 24 exp: F::INFINITE_POWER, 25 }; 26 27 // Calculate our shift in significant digits. 28 let mantissa_shift = 64 - F::MANTISSA_SIZE - 1; 29 30 // Check for a denormal float, if after the shift the exponent is negative. 31 if -fp.exp >= mantissa_shift { 32 // Have a denormal float that isn't a literal 0. 33 // The extra 1 is to adjust for the denormal float, which is 34 // `1 - F::EXPONENT_BIAS`. This works as before, because our 35 // old logic rounded to `F::DENORMAL_EXPONENT` (now 1), and then 36 // checked if `exp == F::DENORMAL_EXPONENT` and no hidden mask 37 // bit was set. Here, we handle that here, rather than later. 38 // 39 // This might round-down to 0, but shift will be at **max** 65, 40 // for halfway cases rounding towards 0. 41 let shift = -fp.exp + 1; 42 debug_assert!(shift <= 65); 43 cb(fp, shift.min(64)); 44 // Check for round-up: if rounding-nearest carried us to the hidden bit. 45 fp.exp = (fp.mant >= F::HIDDEN_BIT_MASK) as i32; 46 return; 47 } 48 49 // The float is normal, round to the hidden bit. 50 cb(fp, mantissa_shift); 51 52 // Check if we carried, and if so, shift the bit to the hidden bit. 53 let carry_mask = F::CARRY_MASK; 54 if fp.mant & carry_mask == carry_mask { 55 fp.mant >>= 1; 56 fp.exp += 1; 57 } 58 59 // Handle if we carried and check for overflow again. 60 if fp.exp >= F::INFINITE_POWER { 61 // Exponent is above largest normal value, must be infinite. 62 *fp = fp_inf; 63 return; 64 } 65 66 // Remove the hidden bit. 67 fp.mant &= F::MANTISSA_MASK; 68} 69 70/// Shift right N-bytes and round towards a direction. 71/// 72/// Callback should take the following parameters: 73/// 1. is_odd 74/// 1. is_halfway 75/// 1. is_above 76#[cfg_attr(not(feature = "compact"), inline)] 77pub fn round_nearest_tie_even<Cb>(fp: &mut ExtendedFloat, shift: i32, cb: Cb) 78where 79 // is_odd, is_halfway, is_above 80 Cb: Fn(bool, bool, bool) -> bool, 81{ 82 // Ensure we've already handled denormal values that underflow. 83 debug_assert!(shift <= 64); 84 85 // Extract the truncated bits using mask. 86 // Calculate if the value of the truncated bits are either above 87 // the mid-way point, or equal to it. 88 // 89 // For example, for 4 truncated bytes, the mask would be 0b1111 90 // and the midway point would be 0b1000. 91 let mask = lower_n_mask(shift as u64); 92 let halfway = lower_n_halfway(shift as u64); 93 let truncated_bits = fp.mant & mask; 94 let is_above = truncated_bits > halfway; 95 let is_halfway = truncated_bits == halfway; 96 97 // Bit shift so the leading bit is in the hidden bit. 98 // This optimixes pretty well: 99 // ```text 100 // mov ecx, esi 101 // shr rdi, cl 102 // xor eax, eax 103 // cmp esi, 64 104 // cmovne rax, rdi 105 // ret 106 // ``` 107 fp.mant = match shift == 64 { 108 true => 0, 109 false => fp.mant >> shift, 110 }; 111 fp.exp += shift; 112 113 // Extract the last bit after shifting (and determine if it is odd). 114 let is_odd = fp.mant & 1 == 1; 115 116 // Calculate if we need to roundup. 117 // We need to roundup if we are above halfway, or if we are odd 118 // and at half-way (need to tie-to-even). Avoid the branch here. 119 fp.mant += cb(is_odd, is_halfway, is_above) as u64; 120} 121 122/// Round our significant digits into place, truncating them. 123#[cfg_attr(not(feature = "compact"), inline)] 124pub fn round_down(fp: &mut ExtendedFloat, shift: i32) { 125 // Might have a shift greater than 64 if we have an error. 126 fp.mant = match shift == 64 { 127 true => 0, 128 false => fp.mant >> shift, 129 }; 130 fp.exp += shift; 131} 132