1cbd624adSopenharmony_ci//! A simple big-integer type for slow path algorithms. 2cbd624adSopenharmony_ci//! 3cbd624adSopenharmony_ci//! This includes minimal stackvector for use in big-integer arithmetic. 4cbd624adSopenharmony_ci 5cbd624adSopenharmony_ci#![doc(hidden)] 6cbd624adSopenharmony_ci 7cbd624adSopenharmony_ci#[cfg(feature = "alloc")] 8cbd624adSopenharmony_ciuse crate::heapvec::HeapVec; 9cbd624adSopenharmony_ciuse crate::num::Float; 10cbd624adSopenharmony_ci#[cfg(not(feature = "alloc"))] 11cbd624adSopenharmony_ciuse crate::stackvec::StackVec; 12cbd624adSopenharmony_ci#[cfg(not(feature = "compact"))] 13cbd624adSopenharmony_ciuse crate::table::{LARGE_POW5, LARGE_POW5_STEP}; 14cbd624adSopenharmony_ciuse core::{cmp, ops, ptr}; 15cbd624adSopenharmony_ci 16cbd624adSopenharmony_ci/// Number of bits in a Bigint. 17cbd624adSopenharmony_ci/// 18cbd624adSopenharmony_ci/// This needs to be at least the number of bits required to store 19cbd624adSopenharmony_ci/// a Bigint, which is `log2(radix**digits)`. 20cbd624adSopenharmony_ci/// ≅ 3600 for base-10, rounded-up. 21cbd624adSopenharmony_cipub const BIGINT_BITS: usize = 4000; 22cbd624adSopenharmony_ci 23cbd624adSopenharmony_ci/// The number of limbs for the bigint. 24cbd624adSopenharmony_cipub const BIGINT_LIMBS: usize = BIGINT_BITS / LIMB_BITS; 25cbd624adSopenharmony_ci 26cbd624adSopenharmony_ci#[cfg(feature = "alloc")] 27cbd624adSopenharmony_cipub type VecType = HeapVec; 28cbd624adSopenharmony_ci 29cbd624adSopenharmony_ci#[cfg(not(feature = "alloc"))] 30cbd624adSopenharmony_cipub type VecType = StackVec; 31cbd624adSopenharmony_ci 32cbd624adSopenharmony_ci/// Storage for a big integer type. 33cbd624adSopenharmony_ci/// 34cbd624adSopenharmony_ci/// This is used for algorithms when we have a finite number of digits. 35cbd624adSopenharmony_ci/// Specifically, it stores all the significant digits scaled to the 36cbd624adSopenharmony_ci/// proper exponent, as an integral type, and then directly compares 37cbd624adSopenharmony_ci/// these digits. 38cbd624adSopenharmony_ci/// 39cbd624adSopenharmony_ci/// This requires us to store the number of significant bits, plus the 40cbd624adSopenharmony_ci/// number of exponent bits (required) since we scale everything 41cbd624adSopenharmony_ci/// to the same exponent. 42cbd624adSopenharmony_ci#[derive(Clone, PartialEq, Eq)] 43cbd624adSopenharmony_cipub struct Bigint { 44cbd624adSopenharmony_ci /// Significant digits for the float, stored in a big integer in LE order. 45cbd624adSopenharmony_ci /// 46cbd624adSopenharmony_ci /// This is pretty much the same number of digits for any radix, since the 47cbd624adSopenharmony_ci /// significant digits balances out the zeros from the exponent: 48cbd624adSopenharmony_ci /// 1. Decimal is 1091 digits, 767 mantissa digits + 324 exponent zeros. 49cbd624adSopenharmony_ci /// 2. Base 6 is 1097 digits, or 680 mantissa digits + 417 exponent zeros. 50cbd624adSopenharmony_ci /// 3. Base 36 is 1086 digits, or 877 mantissa digits + 209 exponent zeros. 51cbd624adSopenharmony_ci /// 52cbd624adSopenharmony_ci /// However, the number of bytes required is larger for large radixes: 53cbd624adSopenharmony_ci /// for decimal, we need `log2(10**1091) ≅ 3600`, while for base 36 54cbd624adSopenharmony_ci /// we need `log2(36**1086) ≅ 5600`. Since we use uninitialized data, 55cbd624adSopenharmony_ci /// we avoid a major performance hit from the large buffer size. 56cbd624adSopenharmony_ci pub data: VecType, 57cbd624adSopenharmony_ci} 58cbd624adSopenharmony_ci 59cbd624adSopenharmony_ci#[allow(clippy::new_without_default)] 60cbd624adSopenharmony_ciimpl Bigint { 61cbd624adSopenharmony_ci /// Construct a bigint representing 0. 62cbd624adSopenharmony_ci #[inline(always)] 63cbd624adSopenharmony_ci pub fn new() -> Self { 64cbd624adSopenharmony_ci Self { 65cbd624adSopenharmony_ci data: VecType::new(), 66cbd624adSopenharmony_ci } 67cbd624adSopenharmony_ci } 68cbd624adSopenharmony_ci 69cbd624adSopenharmony_ci /// Construct a bigint from an integer. 70cbd624adSopenharmony_ci #[inline(always)] 71cbd624adSopenharmony_ci pub fn from_u64(value: u64) -> Self { 72cbd624adSopenharmony_ci Self { 73cbd624adSopenharmony_ci data: VecType::from_u64(value), 74cbd624adSopenharmony_ci } 75cbd624adSopenharmony_ci } 76cbd624adSopenharmony_ci 77cbd624adSopenharmony_ci #[inline(always)] 78cbd624adSopenharmony_ci pub fn hi64(&self) -> (u64, bool) { 79cbd624adSopenharmony_ci self.data.hi64() 80cbd624adSopenharmony_ci } 81cbd624adSopenharmony_ci 82cbd624adSopenharmony_ci /// Multiply and assign as if by exponentiation by a power. 83cbd624adSopenharmony_ci #[inline] 84cbd624adSopenharmony_ci pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> { 85cbd624adSopenharmony_ci debug_assert!(base == 2 || base == 5 || base == 10); 86cbd624adSopenharmony_ci if base % 5 == 0 { 87cbd624adSopenharmony_ci pow(&mut self.data, exp)?; 88cbd624adSopenharmony_ci } 89cbd624adSopenharmony_ci if base % 2 == 0 { 90cbd624adSopenharmony_ci shl(&mut self.data, exp as usize)?; 91cbd624adSopenharmony_ci } 92cbd624adSopenharmony_ci Some(()) 93cbd624adSopenharmony_ci } 94cbd624adSopenharmony_ci 95cbd624adSopenharmony_ci /// Calculate the bit-length of the big-integer. 96cbd624adSopenharmony_ci #[inline] 97cbd624adSopenharmony_ci pub fn bit_length(&self) -> u32 { 98cbd624adSopenharmony_ci bit_length(&self.data) 99cbd624adSopenharmony_ci } 100cbd624adSopenharmony_ci} 101cbd624adSopenharmony_ci 102cbd624adSopenharmony_ciimpl ops::MulAssign<&Bigint> for Bigint { 103cbd624adSopenharmony_ci fn mul_assign(&mut self, rhs: &Bigint) { 104cbd624adSopenharmony_ci self.data *= &rhs.data; 105cbd624adSopenharmony_ci } 106cbd624adSopenharmony_ci} 107cbd624adSopenharmony_ci 108cbd624adSopenharmony_ci/// REVERSE VIEW 109cbd624adSopenharmony_ci 110cbd624adSopenharmony_ci/// Reverse, immutable view of a sequence. 111cbd624adSopenharmony_cipub struct ReverseView<'a, T: 'a> { 112cbd624adSopenharmony_ci inner: &'a [T], 113cbd624adSopenharmony_ci} 114cbd624adSopenharmony_ci 115cbd624adSopenharmony_ciimpl<'a, T> ops::Index<usize> for ReverseView<'a, T> { 116cbd624adSopenharmony_ci type Output = T; 117cbd624adSopenharmony_ci 118cbd624adSopenharmony_ci #[inline] 119cbd624adSopenharmony_ci fn index(&self, index: usize) -> &T { 120cbd624adSopenharmony_ci let len = self.inner.len(); 121cbd624adSopenharmony_ci &(*self.inner)[len - index - 1] 122cbd624adSopenharmony_ci } 123cbd624adSopenharmony_ci} 124cbd624adSopenharmony_ci 125cbd624adSopenharmony_ci/// Create a reverse view of the vector for indexing. 126cbd624adSopenharmony_ci#[inline] 127cbd624adSopenharmony_cipub fn rview(x: &[Limb]) -> ReverseView<Limb> { 128cbd624adSopenharmony_ci ReverseView { 129cbd624adSopenharmony_ci inner: x, 130cbd624adSopenharmony_ci } 131cbd624adSopenharmony_ci} 132cbd624adSopenharmony_ci 133cbd624adSopenharmony_ci// COMPARE 134cbd624adSopenharmony_ci// ------- 135cbd624adSopenharmony_ci 136cbd624adSopenharmony_ci/// Compare `x` to `y`, in little-endian order. 137cbd624adSopenharmony_ci#[inline] 138cbd624adSopenharmony_cipub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering { 139cbd624adSopenharmony_ci match x.len().cmp(&y.len()) { 140cbd624adSopenharmony_ci cmp::Ordering::Equal => { 141cbd624adSopenharmony_ci let iter = x.iter().rev().zip(y.iter().rev()); 142cbd624adSopenharmony_ci for (&xi, yi) in iter { 143cbd624adSopenharmony_ci match xi.cmp(yi) { 144cbd624adSopenharmony_ci cmp::Ordering::Equal => (), 145cbd624adSopenharmony_ci ord => return ord, 146cbd624adSopenharmony_ci } 147cbd624adSopenharmony_ci } 148cbd624adSopenharmony_ci // Equal case. 149cbd624adSopenharmony_ci cmp::Ordering::Equal 150cbd624adSopenharmony_ci }, 151cbd624adSopenharmony_ci ord => ord, 152cbd624adSopenharmony_ci } 153cbd624adSopenharmony_ci} 154cbd624adSopenharmony_ci 155cbd624adSopenharmony_ci// NORMALIZE 156cbd624adSopenharmony_ci// --------- 157cbd624adSopenharmony_ci 158cbd624adSopenharmony_ci/// Normalize the integer, so any leading zero values are removed. 159cbd624adSopenharmony_ci#[inline] 160cbd624adSopenharmony_cipub fn normalize(x: &mut VecType) { 161cbd624adSopenharmony_ci // We don't care if this wraps: the index is bounds-checked. 162cbd624adSopenharmony_ci while let Some(&value) = x.get(x.len().wrapping_sub(1)) { 163cbd624adSopenharmony_ci if value == 0 { 164cbd624adSopenharmony_ci unsafe { x.set_len(x.len() - 1) }; 165cbd624adSopenharmony_ci } else { 166cbd624adSopenharmony_ci break; 167cbd624adSopenharmony_ci } 168cbd624adSopenharmony_ci } 169cbd624adSopenharmony_ci} 170cbd624adSopenharmony_ci 171cbd624adSopenharmony_ci/// Get if the big integer is normalized. 172cbd624adSopenharmony_ci#[inline] 173cbd624adSopenharmony_ci#[allow(clippy::match_like_matches_macro)] 174cbd624adSopenharmony_cipub fn is_normalized(x: &[Limb]) -> bool { 175cbd624adSopenharmony_ci // We don't care if this wraps: the index is bounds-checked. 176cbd624adSopenharmony_ci match x.get(x.len().wrapping_sub(1)) { 177cbd624adSopenharmony_ci Some(&0) => false, 178cbd624adSopenharmony_ci _ => true, 179cbd624adSopenharmony_ci } 180cbd624adSopenharmony_ci} 181cbd624adSopenharmony_ci 182cbd624adSopenharmony_ci// FROM 183cbd624adSopenharmony_ci// ---- 184cbd624adSopenharmony_ci 185cbd624adSopenharmony_ci/// Create StackVec from u64 value. 186cbd624adSopenharmony_ci#[inline(always)] 187cbd624adSopenharmony_ci#[allow(clippy::branches_sharing_code)] 188cbd624adSopenharmony_cipub fn from_u64(x: u64) -> VecType { 189cbd624adSopenharmony_ci let mut vec = VecType::new(); 190cbd624adSopenharmony_ci debug_assert!(vec.capacity() >= 2); 191cbd624adSopenharmony_ci if LIMB_BITS == 32 { 192cbd624adSopenharmony_ci vec.try_push(x as Limb).unwrap(); 193cbd624adSopenharmony_ci vec.try_push((x >> 32) as Limb).unwrap(); 194cbd624adSopenharmony_ci } else { 195cbd624adSopenharmony_ci vec.try_push(x as Limb).unwrap(); 196cbd624adSopenharmony_ci } 197cbd624adSopenharmony_ci vec.normalize(); 198cbd624adSopenharmony_ci vec 199cbd624adSopenharmony_ci} 200cbd624adSopenharmony_ci 201cbd624adSopenharmony_ci// HI 202cbd624adSopenharmony_ci// -- 203cbd624adSopenharmony_ci 204cbd624adSopenharmony_ci/// Check if any of the remaining bits are non-zero. 205cbd624adSopenharmony_ci/// 206cbd624adSopenharmony_ci/// # Safety 207cbd624adSopenharmony_ci/// 208cbd624adSopenharmony_ci/// Safe as long as `rindex <= x.len()`. 209cbd624adSopenharmony_ci#[inline] 210cbd624adSopenharmony_cipub fn nonzero(x: &[Limb], rindex: usize) -> bool { 211cbd624adSopenharmony_ci debug_assert!(rindex <= x.len()); 212cbd624adSopenharmony_ci 213cbd624adSopenharmony_ci let len = x.len(); 214cbd624adSopenharmony_ci let slc = &x[..len - rindex]; 215cbd624adSopenharmony_ci slc.iter().rev().any(|&x| x != 0) 216cbd624adSopenharmony_ci} 217cbd624adSopenharmony_ci 218cbd624adSopenharmony_ci// These return the high X bits and if the bits were truncated. 219cbd624adSopenharmony_ci 220cbd624adSopenharmony_ci/// Shift 32-bit integer to high 64-bits. 221cbd624adSopenharmony_ci#[inline] 222cbd624adSopenharmony_cipub fn u32_to_hi64_1(r0: u32) -> (u64, bool) { 223cbd624adSopenharmony_ci u64_to_hi64_1(r0 as u64) 224cbd624adSopenharmony_ci} 225cbd624adSopenharmony_ci 226cbd624adSopenharmony_ci/// Shift 2 32-bit integers to high 64-bits. 227cbd624adSopenharmony_ci#[inline] 228cbd624adSopenharmony_cipub fn u32_to_hi64_2(r0: u32, r1: u32) -> (u64, bool) { 229cbd624adSopenharmony_ci let r0 = (r0 as u64) << 32; 230cbd624adSopenharmony_ci let r1 = r1 as u64; 231cbd624adSopenharmony_ci u64_to_hi64_1(r0 | r1) 232cbd624adSopenharmony_ci} 233cbd624adSopenharmony_ci 234cbd624adSopenharmony_ci/// Shift 3 32-bit integers to high 64-bits. 235cbd624adSopenharmony_ci#[inline] 236cbd624adSopenharmony_cipub fn u32_to_hi64_3(r0: u32, r1: u32, r2: u32) -> (u64, bool) { 237cbd624adSopenharmony_ci let r0 = r0 as u64; 238cbd624adSopenharmony_ci let r1 = (r1 as u64) << 32; 239cbd624adSopenharmony_ci let r2 = r2 as u64; 240cbd624adSopenharmony_ci u64_to_hi64_2(r0, r1 | r2) 241cbd624adSopenharmony_ci} 242cbd624adSopenharmony_ci 243cbd624adSopenharmony_ci/// Shift 64-bit integer to high 64-bits. 244cbd624adSopenharmony_ci#[inline] 245cbd624adSopenharmony_cipub fn u64_to_hi64_1(r0: u64) -> (u64, bool) { 246cbd624adSopenharmony_ci let ls = r0.leading_zeros(); 247cbd624adSopenharmony_ci (r0 << ls, false) 248cbd624adSopenharmony_ci} 249cbd624adSopenharmony_ci 250cbd624adSopenharmony_ci/// Shift 2 64-bit integers to high 64-bits. 251cbd624adSopenharmony_ci#[inline] 252cbd624adSopenharmony_cipub fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) { 253cbd624adSopenharmony_ci let ls = r0.leading_zeros(); 254cbd624adSopenharmony_ci let rs = 64 - ls; 255cbd624adSopenharmony_ci let v = match ls { 256cbd624adSopenharmony_ci 0 => r0, 257cbd624adSopenharmony_ci _ => (r0 << ls) | (r1 >> rs), 258cbd624adSopenharmony_ci }; 259cbd624adSopenharmony_ci let n = r1 << ls != 0; 260cbd624adSopenharmony_ci (v, n) 261cbd624adSopenharmony_ci} 262cbd624adSopenharmony_ci 263cbd624adSopenharmony_ci/// Extract the hi bits from the buffer. 264cbd624adSopenharmony_cimacro_rules! hi { 265cbd624adSopenharmony_ci // # Safety 266cbd624adSopenharmony_ci // 267cbd624adSopenharmony_ci // Safe as long as the `stackvec.len() >= 1`. 268cbd624adSopenharmony_ci (@1 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ 269cbd624adSopenharmony_ci $fn($rview[0] as $t) 270cbd624adSopenharmony_ci }}; 271cbd624adSopenharmony_ci 272cbd624adSopenharmony_ci // # Safety 273cbd624adSopenharmony_ci // 274cbd624adSopenharmony_ci // Safe as long as the `stackvec.len() >= 2`. 275cbd624adSopenharmony_ci (@2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ 276cbd624adSopenharmony_ci let r0 = $rview[0] as $t; 277cbd624adSopenharmony_ci let r1 = $rview[1] as $t; 278cbd624adSopenharmony_ci $fn(r0, r1) 279cbd624adSopenharmony_ci }}; 280cbd624adSopenharmony_ci 281cbd624adSopenharmony_ci // # Safety 282cbd624adSopenharmony_ci // 283cbd624adSopenharmony_ci // Safe as long as the `stackvec.len() >= 2`. 284cbd624adSopenharmony_ci (@nonzero2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ 285cbd624adSopenharmony_ci let (v, n) = hi!(@2 $self, $rview, $t, $fn); 286cbd624adSopenharmony_ci (v, n || nonzero($self, 2 )) 287cbd624adSopenharmony_ci }}; 288cbd624adSopenharmony_ci 289cbd624adSopenharmony_ci // # Safety 290cbd624adSopenharmony_ci // 291cbd624adSopenharmony_ci // Safe as long as the `stackvec.len() >= 3`. 292cbd624adSopenharmony_ci (@3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ 293cbd624adSopenharmony_ci let r0 = $rview[0] as $t; 294cbd624adSopenharmony_ci let r1 = $rview[1] as $t; 295cbd624adSopenharmony_ci let r2 = $rview[2] as $t; 296cbd624adSopenharmony_ci $fn(r0, r1, r2) 297cbd624adSopenharmony_ci }}; 298cbd624adSopenharmony_ci 299cbd624adSopenharmony_ci // # Safety 300cbd624adSopenharmony_ci // 301cbd624adSopenharmony_ci // Safe as long as the `stackvec.len() >= 3`. 302cbd624adSopenharmony_ci (@nonzero3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{ 303cbd624adSopenharmony_ci let (v, n) = hi!(@3 $self, $rview, $t, $fn); 304cbd624adSopenharmony_ci (v, n || nonzero($self, 3)) 305cbd624adSopenharmony_ci }}; 306cbd624adSopenharmony_ci} 307cbd624adSopenharmony_ci 308cbd624adSopenharmony_ci/// Get the high 64 bits from the vector. 309cbd624adSopenharmony_ci#[inline(always)] 310cbd624adSopenharmony_cipub fn hi64(x: &[Limb]) -> (u64, bool) { 311cbd624adSopenharmony_ci let rslc = rview(x); 312cbd624adSopenharmony_ci // SAFETY: the buffer must be at least length bytes long. 313cbd624adSopenharmony_ci match x.len() { 314cbd624adSopenharmony_ci 0 => (0, false), 315cbd624adSopenharmony_ci 1 if LIMB_BITS == 32 => hi!(@1 x, rslc, u32, u32_to_hi64_1), 316cbd624adSopenharmony_ci 1 => hi!(@1 x, rslc, u64, u64_to_hi64_1), 317cbd624adSopenharmony_ci 2 if LIMB_BITS == 32 => hi!(@2 x, rslc, u32, u32_to_hi64_2), 318cbd624adSopenharmony_ci 2 => hi!(@2 x, rslc, u64, u64_to_hi64_2), 319cbd624adSopenharmony_ci _ if LIMB_BITS == 32 => hi!(@nonzero3 x, rslc, u32, u32_to_hi64_3), 320cbd624adSopenharmony_ci _ => hi!(@nonzero2 x, rslc, u64, u64_to_hi64_2), 321cbd624adSopenharmony_ci } 322cbd624adSopenharmony_ci} 323cbd624adSopenharmony_ci 324cbd624adSopenharmony_ci// POWERS 325cbd624adSopenharmony_ci// ------ 326cbd624adSopenharmony_ci 327cbd624adSopenharmony_ci/// MulAssign by a power of 5. 328cbd624adSopenharmony_ci/// 329cbd624adSopenharmony_ci/// Theoretically... 330cbd624adSopenharmony_ci/// 331cbd624adSopenharmony_ci/// Use an exponentiation by squaring method, since it reduces the time 332cbd624adSopenharmony_ci/// complexity of the multiplication to ~`O(log(n))` for the squaring, 333cbd624adSopenharmony_ci/// and `O(n*m)` for the result. Since `m` is typically a lower-order 334cbd624adSopenharmony_ci/// factor, this significantly reduces the number of multiplications 335cbd624adSopenharmony_ci/// we need to do. Iteratively multiplying by small powers follows 336cbd624adSopenharmony_ci/// the nth triangular number series, which scales as `O(p^2)`, but 337cbd624adSopenharmony_ci/// where `p` is `n+m`. In short, it scales very poorly. 338cbd624adSopenharmony_ci/// 339cbd624adSopenharmony_ci/// Practically.... 340cbd624adSopenharmony_ci/// 341cbd624adSopenharmony_ci/// Exponentiation by Squaring: 342cbd624adSopenharmony_ci/// running 2 tests 343cbd624adSopenharmony_ci/// test bigcomp_f32_lexical ... bench: 1,018 ns/iter (+/- 78) 344cbd624adSopenharmony_ci/// test bigcomp_f64_lexical ... bench: 3,639 ns/iter (+/- 1,007) 345cbd624adSopenharmony_ci/// 346cbd624adSopenharmony_ci/// Exponentiation by Iterative Small Powers: 347cbd624adSopenharmony_ci/// running 2 tests 348cbd624adSopenharmony_ci/// test bigcomp_f32_lexical ... bench: 518 ns/iter (+/- 31) 349cbd624adSopenharmony_ci/// test bigcomp_f64_lexical ... bench: 583 ns/iter (+/- 47) 350cbd624adSopenharmony_ci/// 351cbd624adSopenharmony_ci/// Exponentiation by Iterative Large Powers (of 2): 352cbd624adSopenharmony_ci/// running 2 tests 353cbd624adSopenharmony_ci/// test bigcomp_f32_lexical ... bench: 671 ns/iter (+/- 31) 354cbd624adSopenharmony_ci/// test bigcomp_f64_lexical ... bench: 1,394 ns/iter (+/- 47) 355cbd624adSopenharmony_ci/// 356cbd624adSopenharmony_ci/// The following benchmarks were run on `1 * 5^300`, using native `pow`, 357cbd624adSopenharmony_ci/// a version with only small powers, and one with pre-computed powers 358cbd624adSopenharmony_ci/// of `5^(3 * max_exp)`, rather than `5^(5 * max_exp)`. 359cbd624adSopenharmony_ci/// 360cbd624adSopenharmony_ci/// However, using large powers is crucial for good performance for higher 361cbd624adSopenharmony_ci/// powers. 362cbd624adSopenharmony_ci/// pow/default time: [426.20 ns 427.96 ns 429.89 ns] 363cbd624adSopenharmony_ci/// pow/small time: [2.9270 us 2.9411 us 2.9565 us] 364cbd624adSopenharmony_ci/// pow/large:3 time: [838.51 ns 842.21 ns 846.27 ns] 365cbd624adSopenharmony_ci/// 366cbd624adSopenharmony_ci/// Even using worst-case scenarios, exponentiation by squaring is 367cbd624adSopenharmony_ci/// significantly slower for our workloads. Just multiply by small powers, 368cbd624adSopenharmony_ci/// in simple cases, and use precalculated large powers in other cases. 369cbd624adSopenharmony_ci/// 370cbd624adSopenharmony_ci/// Furthermore, using sufficiently big large powers is also crucial for 371cbd624adSopenharmony_ci/// performance. This is a tradeoff of binary size and performance, and 372cbd624adSopenharmony_ci/// using a single value at ~`5^(5 * max_exp)` seems optimal. 373cbd624adSopenharmony_cipub fn pow(x: &mut VecType, mut exp: u32) -> Option<()> { 374cbd624adSopenharmony_ci // Minimize the number of iterations for large exponents: just 375cbd624adSopenharmony_ci // do a few steps with a large powers. 376cbd624adSopenharmony_ci #[cfg(not(feature = "compact"))] 377cbd624adSopenharmony_ci { 378cbd624adSopenharmony_ci while exp >= LARGE_POW5_STEP { 379cbd624adSopenharmony_ci large_mul(x, &LARGE_POW5)?; 380cbd624adSopenharmony_ci exp -= LARGE_POW5_STEP; 381cbd624adSopenharmony_ci } 382cbd624adSopenharmony_ci } 383cbd624adSopenharmony_ci 384cbd624adSopenharmony_ci // Now use our pre-computed small powers iteratively. 385cbd624adSopenharmony_ci // This is calculated as `⌊log(2^BITS - 1, 5)⌋`. 386cbd624adSopenharmony_ci let small_step = if LIMB_BITS == 32 { 387cbd624adSopenharmony_ci 13 388cbd624adSopenharmony_ci } else { 389cbd624adSopenharmony_ci 27 390cbd624adSopenharmony_ci }; 391cbd624adSopenharmony_ci let max_native = (5 as Limb).pow(small_step); 392cbd624adSopenharmony_ci while exp >= small_step { 393cbd624adSopenharmony_ci small_mul(x, max_native)?; 394cbd624adSopenharmony_ci exp -= small_step; 395cbd624adSopenharmony_ci } 396cbd624adSopenharmony_ci if exp != 0 { 397cbd624adSopenharmony_ci // SAFETY: safe, since `exp < small_step`. 398cbd624adSopenharmony_ci let small_power = unsafe { f64::int_pow_fast_path(exp as usize, 5) }; 399cbd624adSopenharmony_ci small_mul(x, small_power as Limb)?; 400cbd624adSopenharmony_ci } 401cbd624adSopenharmony_ci Some(()) 402cbd624adSopenharmony_ci} 403cbd624adSopenharmony_ci 404cbd624adSopenharmony_ci// SCALAR 405cbd624adSopenharmony_ci// ------ 406cbd624adSopenharmony_ci 407cbd624adSopenharmony_ci/// Add two small integers and return the resulting value and if overflow happens. 408cbd624adSopenharmony_ci#[inline(always)] 409cbd624adSopenharmony_cipub fn scalar_add(x: Limb, y: Limb) -> (Limb, bool) { 410cbd624adSopenharmony_ci x.overflowing_add(y) 411cbd624adSopenharmony_ci} 412cbd624adSopenharmony_ci 413cbd624adSopenharmony_ci/// Multiply two small integers (with carry) (and return the overflow contribution). 414cbd624adSopenharmony_ci/// 415cbd624adSopenharmony_ci/// Returns the (low, high) components. 416cbd624adSopenharmony_ci#[inline(always)] 417cbd624adSopenharmony_cipub fn scalar_mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) { 418cbd624adSopenharmony_ci // Cannot overflow, as long as wide is 2x as wide. This is because 419cbd624adSopenharmony_ci // the following is always true: 420cbd624adSopenharmony_ci // `Wide::MAX - (Narrow::MAX * Narrow::MAX) >= Narrow::MAX` 421cbd624adSopenharmony_ci let z: Wide = (x as Wide) * (y as Wide) + (carry as Wide); 422cbd624adSopenharmony_ci (z as Limb, (z >> LIMB_BITS) as Limb) 423cbd624adSopenharmony_ci} 424cbd624adSopenharmony_ci 425cbd624adSopenharmony_ci// SMALL 426cbd624adSopenharmony_ci// ----- 427cbd624adSopenharmony_ci 428cbd624adSopenharmony_ci/// Add small integer to bigint starting from offset. 429cbd624adSopenharmony_ci#[inline] 430cbd624adSopenharmony_cipub fn small_add_from(x: &mut VecType, y: Limb, start: usize) -> Option<()> { 431cbd624adSopenharmony_ci let mut index = start; 432cbd624adSopenharmony_ci let mut carry = y; 433cbd624adSopenharmony_ci while carry != 0 && index < x.len() { 434cbd624adSopenharmony_ci let result = scalar_add(x[index], carry); 435cbd624adSopenharmony_ci x[index] = result.0; 436cbd624adSopenharmony_ci carry = result.1 as Limb; 437cbd624adSopenharmony_ci index += 1; 438cbd624adSopenharmony_ci } 439cbd624adSopenharmony_ci // If we carried past all the elements, add to the end of the buffer. 440cbd624adSopenharmony_ci if carry != 0 { 441cbd624adSopenharmony_ci x.try_push(carry)?; 442cbd624adSopenharmony_ci } 443cbd624adSopenharmony_ci Some(()) 444cbd624adSopenharmony_ci} 445cbd624adSopenharmony_ci 446cbd624adSopenharmony_ci/// Add small integer to bigint. 447cbd624adSopenharmony_ci#[inline(always)] 448cbd624adSopenharmony_cipub fn small_add(x: &mut VecType, y: Limb) -> Option<()> { 449cbd624adSopenharmony_ci small_add_from(x, y, 0) 450cbd624adSopenharmony_ci} 451cbd624adSopenharmony_ci 452cbd624adSopenharmony_ci/// Multiply bigint by small integer. 453cbd624adSopenharmony_ci#[inline] 454cbd624adSopenharmony_cipub fn small_mul(x: &mut VecType, y: Limb) -> Option<()> { 455cbd624adSopenharmony_ci let mut carry = 0; 456cbd624adSopenharmony_ci for xi in x.iter_mut() { 457cbd624adSopenharmony_ci let result = scalar_mul(*xi, y, carry); 458cbd624adSopenharmony_ci *xi = result.0; 459cbd624adSopenharmony_ci carry = result.1; 460cbd624adSopenharmony_ci } 461cbd624adSopenharmony_ci // If we carried past all the elements, add to the end of the buffer. 462cbd624adSopenharmony_ci if carry != 0 { 463cbd624adSopenharmony_ci x.try_push(carry)?; 464cbd624adSopenharmony_ci } 465cbd624adSopenharmony_ci Some(()) 466cbd624adSopenharmony_ci} 467cbd624adSopenharmony_ci 468cbd624adSopenharmony_ci// LARGE 469cbd624adSopenharmony_ci// ----- 470cbd624adSopenharmony_ci 471cbd624adSopenharmony_ci/// Add bigint to bigint starting from offset. 472cbd624adSopenharmony_cipub fn large_add_from(x: &mut VecType, y: &[Limb], start: usize) -> Option<()> { 473cbd624adSopenharmony_ci // The effective x buffer is from `xstart..x.len()`, so we need to treat 474cbd624adSopenharmony_ci // that as the current range. If the effective y buffer is longer, need 475cbd624adSopenharmony_ci // to resize to that, + the start index. 476cbd624adSopenharmony_ci if y.len() > x.len().saturating_sub(start) { 477cbd624adSopenharmony_ci // Ensure we panic if we can't extend the buffer. 478cbd624adSopenharmony_ci // This avoids any unsafe behavior afterwards. 479cbd624adSopenharmony_ci x.try_resize(y.len() + start, 0)?; 480cbd624adSopenharmony_ci } 481cbd624adSopenharmony_ci 482cbd624adSopenharmony_ci // Iteratively add elements from y to x. 483cbd624adSopenharmony_ci let mut carry = false; 484cbd624adSopenharmony_ci for (index, &yi) in y.iter().enumerate() { 485cbd624adSopenharmony_ci // We panicked in `try_resize` if this wasn't true. 486cbd624adSopenharmony_ci let xi = x.get_mut(start + index).unwrap(); 487cbd624adSopenharmony_ci 488cbd624adSopenharmony_ci // Only one op of the two ops can overflow, since we added at max 489cbd624adSopenharmony_ci // Limb::max_value() + Limb::max_value(). Add the previous carry, 490cbd624adSopenharmony_ci // and store the current carry for the next. 491cbd624adSopenharmony_ci let result = scalar_add(*xi, yi); 492cbd624adSopenharmony_ci *xi = result.0; 493cbd624adSopenharmony_ci let mut tmp = result.1; 494cbd624adSopenharmony_ci if carry { 495cbd624adSopenharmony_ci let result = scalar_add(*xi, 1); 496cbd624adSopenharmony_ci *xi = result.0; 497cbd624adSopenharmony_ci tmp |= result.1; 498cbd624adSopenharmony_ci } 499cbd624adSopenharmony_ci carry = tmp; 500cbd624adSopenharmony_ci } 501cbd624adSopenharmony_ci 502cbd624adSopenharmony_ci // Handle overflow. 503cbd624adSopenharmony_ci if carry { 504cbd624adSopenharmony_ci small_add_from(x, 1, y.len() + start)?; 505cbd624adSopenharmony_ci } 506cbd624adSopenharmony_ci Some(()) 507cbd624adSopenharmony_ci} 508cbd624adSopenharmony_ci 509cbd624adSopenharmony_ci/// Add bigint to bigint. 510cbd624adSopenharmony_ci#[inline(always)] 511cbd624adSopenharmony_cipub fn large_add(x: &mut VecType, y: &[Limb]) -> Option<()> { 512cbd624adSopenharmony_ci large_add_from(x, y, 0) 513cbd624adSopenharmony_ci} 514cbd624adSopenharmony_ci 515cbd624adSopenharmony_ci/// Grade-school multiplication algorithm. 516cbd624adSopenharmony_ci/// 517cbd624adSopenharmony_ci/// Slow, naive algorithm, using limb-bit bases and just shifting left for 518cbd624adSopenharmony_ci/// each iteration. This could be optimized with numerous other algorithms, 519cbd624adSopenharmony_ci/// but it's extremely simple, and works in O(n*m) time, which is fine 520cbd624adSopenharmony_ci/// by me. Each iteration, of which there are `m` iterations, requires 521cbd624adSopenharmony_ci/// `n` multiplications, and `n` additions, or grade-school multiplication. 522cbd624adSopenharmony_ci/// 523cbd624adSopenharmony_ci/// Don't use Karatsuba multiplication, since out implementation seems to 524cbd624adSopenharmony_ci/// be slower asymptotically, which is likely just due to the small sizes 525cbd624adSopenharmony_ci/// we deal with here. For example, running on the following data: 526cbd624adSopenharmony_ci/// 527cbd624adSopenharmony_ci/// ```text 528cbd624adSopenharmony_ci/// const SMALL_X: &[u32] = &[ 529cbd624adSopenharmony_ci/// 766857581, 3588187092, 1583923090, 2204542082, 1564708913, 2695310100, 3676050286, 530cbd624adSopenharmony_ci/// 1022770393, 468044626, 446028186 531cbd624adSopenharmony_ci/// ]; 532cbd624adSopenharmony_ci/// const SMALL_Y: &[u32] = &[ 533cbd624adSopenharmony_ci/// 3945492125, 3250752032, 1282554898, 1708742809, 1131807209, 3171663979, 1353276095, 534cbd624adSopenharmony_ci/// 1678845844, 2373924447, 3640713171 535cbd624adSopenharmony_ci/// ]; 536cbd624adSopenharmony_ci/// const LARGE_X: &[u32] = &[ 537cbd624adSopenharmony_ci/// 3647536243, 2836434412, 2154401029, 1297917894, 137240595, 790694805, 2260404854, 538cbd624adSopenharmony_ci/// 3872698172, 690585094, 99641546, 3510774932, 1672049983, 2313458559, 2017623719, 539cbd624adSopenharmony_ci/// 638180197, 1140936565, 1787190494, 1797420655, 14113450, 2350476485, 3052941684, 540cbd624adSopenharmony_ci/// 1993594787, 2901001571, 4156930025, 1248016552, 848099908, 2660577483, 4030871206, 541cbd624adSopenharmony_ci/// 692169593, 2835966319, 1781364505, 4266390061, 1813581655, 4210899844, 2137005290, 542cbd624adSopenharmony_ci/// 2346701569, 3715571980, 3386325356, 1251725092, 2267270902, 474686922, 2712200426, 543cbd624adSopenharmony_ci/// 197581715, 3087636290, 1379224439, 1258285015, 3230794403, 2759309199, 1494932094, 544cbd624adSopenharmony_ci/// 326310242 545cbd624adSopenharmony_ci/// ]; 546cbd624adSopenharmony_ci/// const LARGE_Y: &[u32] = &[ 547cbd624adSopenharmony_ci/// 1574249566, 868970575, 76716509, 3198027972, 1541766986, 1095120699, 3891610505, 548cbd624adSopenharmony_ci/// 2322545818, 1677345138, 865101357, 2650232883, 2831881215, 3985005565, 2294283760, 549cbd624adSopenharmony_ci/// 3468161605, 393539559, 3665153349, 1494067812, 106699483, 2596454134, 797235106, 550cbd624adSopenharmony_ci/// 705031740, 1209732933, 2732145769, 4122429072, 141002534, 790195010, 4014829800, 551cbd624adSopenharmony_ci/// 1303930792, 3649568494, 308065964, 1233648836, 2807326116, 79326486, 1262500691, 552cbd624adSopenharmony_ci/// 621809229, 2258109428, 3819258501, 171115668, 1139491184, 2979680603, 1333372297, 553cbd624adSopenharmony_ci/// 1657496603, 2790845317, 4090236532, 4220374789, 601876604, 1828177209, 2372228171, 554cbd624adSopenharmony_ci/// 2247372529 555cbd624adSopenharmony_ci/// ]; 556cbd624adSopenharmony_ci/// ``` 557cbd624adSopenharmony_ci/// 558cbd624adSopenharmony_ci/// We get the following results: 559cbd624adSopenharmony_ci 560cbd624adSopenharmony_ci/// ```text 561cbd624adSopenharmony_ci/// mul/small:long time: [220.23 ns 221.47 ns 222.81 ns] 562cbd624adSopenharmony_ci/// Found 4 outliers among 100 measurements (4.00%) 563cbd624adSopenharmony_ci/// 2 (2.00%) high mild 564cbd624adSopenharmony_ci/// 2 (2.00%) high severe 565cbd624adSopenharmony_ci/// mul/small:karatsuba time: [233.88 ns 234.63 ns 235.44 ns] 566cbd624adSopenharmony_ci/// Found 11 outliers among 100 measurements (11.00%) 567cbd624adSopenharmony_ci/// 8 (8.00%) high mild 568cbd624adSopenharmony_ci/// 3 (3.00%) high severe 569cbd624adSopenharmony_ci/// mul/large:long time: [1.9365 us 1.9455 us 1.9558 us] 570cbd624adSopenharmony_ci/// Found 12 outliers among 100 measurements (12.00%) 571cbd624adSopenharmony_ci/// 7 (7.00%) high mild 572cbd624adSopenharmony_ci/// 5 (5.00%) high severe 573cbd624adSopenharmony_ci/// mul/large:karatsuba time: [4.4250 us 4.4515 us 4.4812 us] 574cbd624adSopenharmony_ci/// ``` 575cbd624adSopenharmony_ci/// 576cbd624adSopenharmony_ci/// In short, Karatsuba multiplication is never worthwhile for out use-case. 577cbd624adSopenharmony_cipub fn long_mul(x: &[Limb], y: &[Limb]) -> Option<VecType> { 578cbd624adSopenharmony_ci // Using the immutable value, multiply by all the scalars in y, using 579cbd624adSopenharmony_ci // the algorithm defined above. Use a single buffer to avoid 580cbd624adSopenharmony_ci // frequent reallocations. Handle the first case to avoid a redundant 581cbd624adSopenharmony_ci // addition, since we know y.len() >= 1. 582cbd624adSopenharmony_ci let mut z = VecType::try_from(x)?; 583cbd624adSopenharmony_ci if !y.is_empty() { 584cbd624adSopenharmony_ci let y0 = y[0]; 585cbd624adSopenharmony_ci small_mul(&mut z, y0)?; 586cbd624adSopenharmony_ci 587cbd624adSopenharmony_ci for (index, &yi) in y.iter().enumerate().skip(1) { 588cbd624adSopenharmony_ci if yi != 0 { 589cbd624adSopenharmony_ci let mut zi = VecType::try_from(x)?; 590cbd624adSopenharmony_ci small_mul(&mut zi, yi)?; 591cbd624adSopenharmony_ci large_add_from(&mut z, &zi, index)?; 592cbd624adSopenharmony_ci } 593cbd624adSopenharmony_ci } 594cbd624adSopenharmony_ci } 595cbd624adSopenharmony_ci 596cbd624adSopenharmony_ci z.normalize(); 597cbd624adSopenharmony_ci Some(z) 598cbd624adSopenharmony_ci} 599cbd624adSopenharmony_ci 600cbd624adSopenharmony_ci/// Multiply bigint by bigint using grade-school multiplication algorithm. 601cbd624adSopenharmony_ci#[inline(always)] 602cbd624adSopenharmony_cipub fn large_mul(x: &mut VecType, y: &[Limb]) -> Option<()> { 603cbd624adSopenharmony_ci // Karatsuba multiplication never makes sense, so just use grade school 604cbd624adSopenharmony_ci // multiplication. 605cbd624adSopenharmony_ci if y.len() == 1 { 606cbd624adSopenharmony_ci // SAFETY: safe since `y.len() == 1`. 607cbd624adSopenharmony_ci small_mul(x, y[0])?; 608cbd624adSopenharmony_ci } else { 609cbd624adSopenharmony_ci *x = long_mul(y, x)?; 610cbd624adSopenharmony_ci } 611cbd624adSopenharmony_ci Some(()) 612cbd624adSopenharmony_ci} 613cbd624adSopenharmony_ci 614cbd624adSopenharmony_ci// SHIFT 615cbd624adSopenharmony_ci// ----- 616cbd624adSopenharmony_ci 617cbd624adSopenharmony_ci/// Shift-left `n` bits inside a buffer. 618cbd624adSopenharmony_ci#[inline] 619cbd624adSopenharmony_cipub fn shl_bits(x: &mut VecType, n: usize) -> Option<()> { 620cbd624adSopenharmony_ci debug_assert!(n != 0); 621cbd624adSopenharmony_ci 622cbd624adSopenharmony_ci // Internally, for each item, we shift left by n, and add the previous 623cbd624adSopenharmony_ci // right shifted limb-bits. 624cbd624adSopenharmony_ci // For example, we transform (for u8) shifted left 2, to: 625cbd624adSopenharmony_ci // b10100100 b01000010 626cbd624adSopenharmony_ci // b10 b10010001 b00001000 627cbd624adSopenharmony_ci debug_assert!(n < LIMB_BITS); 628cbd624adSopenharmony_ci let rshift = LIMB_BITS - n; 629cbd624adSopenharmony_ci let lshift = n; 630cbd624adSopenharmony_ci let mut prev: Limb = 0; 631cbd624adSopenharmony_ci for xi in x.iter_mut() { 632cbd624adSopenharmony_ci let tmp = *xi; 633cbd624adSopenharmony_ci *xi <<= lshift; 634cbd624adSopenharmony_ci *xi |= prev >> rshift; 635cbd624adSopenharmony_ci prev = tmp; 636cbd624adSopenharmony_ci } 637cbd624adSopenharmony_ci 638cbd624adSopenharmony_ci // Always push the carry, even if it creates a non-normal result. 639cbd624adSopenharmony_ci let carry = prev >> rshift; 640cbd624adSopenharmony_ci if carry != 0 { 641cbd624adSopenharmony_ci x.try_push(carry)?; 642cbd624adSopenharmony_ci } 643cbd624adSopenharmony_ci 644cbd624adSopenharmony_ci Some(()) 645cbd624adSopenharmony_ci} 646cbd624adSopenharmony_ci 647cbd624adSopenharmony_ci/// Shift-left `n` limbs inside a buffer. 648cbd624adSopenharmony_ci#[inline] 649cbd624adSopenharmony_cipub fn shl_limbs(x: &mut VecType, n: usize) -> Option<()> { 650cbd624adSopenharmony_ci debug_assert!(n != 0); 651cbd624adSopenharmony_ci if n + x.len() > x.capacity() { 652cbd624adSopenharmony_ci None 653cbd624adSopenharmony_ci } else if !x.is_empty() { 654cbd624adSopenharmony_ci let len = n + x.len(); 655cbd624adSopenharmony_ci // SAFE: since x is not empty, and `x.len() + n <= x.capacity()`. 656cbd624adSopenharmony_ci unsafe { 657cbd624adSopenharmony_ci // Move the elements. 658cbd624adSopenharmony_ci let src = x.as_ptr(); 659cbd624adSopenharmony_ci let dst = x.as_mut_ptr().add(n); 660cbd624adSopenharmony_ci ptr::copy(src, dst, x.len()); 661cbd624adSopenharmony_ci // Write our 0s. 662cbd624adSopenharmony_ci ptr::write_bytes(x.as_mut_ptr(), 0, n); 663cbd624adSopenharmony_ci x.set_len(len); 664cbd624adSopenharmony_ci } 665cbd624adSopenharmony_ci Some(()) 666cbd624adSopenharmony_ci } else { 667cbd624adSopenharmony_ci Some(()) 668cbd624adSopenharmony_ci } 669cbd624adSopenharmony_ci} 670cbd624adSopenharmony_ci 671cbd624adSopenharmony_ci/// Shift-left buffer by n bits. 672cbd624adSopenharmony_ci#[inline] 673cbd624adSopenharmony_cipub fn shl(x: &mut VecType, n: usize) -> Option<()> { 674cbd624adSopenharmony_ci let rem = n % LIMB_BITS; 675cbd624adSopenharmony_ci let div = n / LIMB_BITS; 676cbd624adSopenharmony_ci if rem != 0 { 677cbd624adSopenharmony_ci shl_bits(x, rem)?; 678cbd624adSopenharmony_ci } 679cbd624adSopenharmony_ci if div != 0 { 680cbd624adSopenharmony_ci shl_limbs(x, div)?; 681cbd624adSopenharmony_ci } 682cbd624adSopenharmony_ci Some(()) 683cbd624adSopenharmony_ci} 684cbd624adSopenharmony_ci 685cbd624adSopenharmony_ci/// Get number of leading zero bits in the storage. 686cbd624adSopenharmony_ci#[inline] 687cbd624adSopenharmony_cipub fn leading_zeros(x: &[Limb]) -> u32 { 688cbd624adSopenharmony_ci let length = x.len(); 689cbd624adSopenharmony_ci // wrapping_sub is fine, since it'll just return None. 690cbd624adSopenharmony_ci if let Some(&value) = x.get(length.wrapping_sub(1)) { 691cbd624adSopenharmony_ci value.leading_zeros() 692cbd624adSopenharmony_ci } else { 693cbd624adSopenharmony_ci 0 694cbd624adSopenharmony_ci } 695cbd624adSopenharmony_ci} 696cbd624adSopenharmony_ci 697cbd624adSopenharmony_ci/// Calculate the bit-length of the big-integer. 698cbd624adSopenharmony_ci#[inline] 699cbd624adSopenharmony_cipub fn bit_length(x: &[Limb]) -> u32 { 700cbd624adSopenharmony_ci let nlz = leading_zeros(x); 701cbd624adSopenharmony_ci LIMB_BITS as u32 * x.len() as u32 - nlz 702cbd624adSopenharmony_ci} 703cbd624adSopenharmony_ci 704cbd624adSopenharmony_ci// LIMB 705cbd624adSopenharmony_ci// ---- 706cbd624adSopenharmony_ci 707cbd624adSopenharmony_ci// Type for a single limb of the big integer. 708cbd624adSopenharmony_ci// 709cbd624adSopenharmony_ci// A limb is analogous to a digit in base10, except, it stores 32-bit 710cbd624adSopenharmony_ci// or 64-bit numbers instead. We want types where 64-bit multiplication 711cbd624adSopenharmony_ci// is well-supported by the architecture, rather than emulated in 3 712cbd624adSopenharmony_ci// instructions. The quickest way to check this support is using a 713cbd624adSopenharmony_ci// cross-compiler for numerous architectures, along with the following 714cbd624adSopenharmony_ci// source file and command: 715cbd624adSopenharmony_ci// 716cbd624adSopenharmony_ci// Compile with `gcc main.c -c -S -O3 -masm=intel` 717cbd624adSopenharmony_ci// 718cbd624adSopenharmony_ci// And the source code is: 719cbd624adSopenharmony_ci// ```text 720cbd624adSopenharmony_ci// #include <stdint.h> 721cbd624adSopenharmony_ci// 722cbd624adSopenharmony_ci// struct i128 { 723cbd624adSopenharmony_ci// uint64_t hi; 724cbd624adSopenharmony_ci// uint64_t lo; 725cbd624adSopenharmony_ci// }; 726cbd624adSopenharmony_ci// 727cbd624adSopenharmony_ci// // Type your code here, or load an example. 728cbd624adSopenharmony_ci// struct i128 square(uint64_t x, uint64_t y) { 729cbd624adSopenharmony_ci// __int128 prod = (__int128)x * (__int128)y; 730cbd624adSopenharmony_ci// struct i128 z; 731cbd624adSopenharmony_ci// z.hi = (uint64_t)(prod >> 64); 732cbd624adSopenharmony_ci// z.lo = (uint64_t)prod; 733cbd624adSopenharmony_ci// return z; 734cbd624adSopenharmony_ci// } 735cbd624adSopenharmony_ci// ``` 736cbd624adSopenharmony_ci// 737cbd624adSopenharmony_ci// If the result contains `call __multi3`, then the multiplication 738cbd624adSopenharmony_ci// is emulated by the compiler. Otherwise, it's natively supported. 739cbd624adSopenharmony_ci// 740cbd624adSopenharmony_ci// This should be all-known 64-bit platforms supported by Rust. 741cbd624adSopenharmony_ci// https://forge.rust-lang.org/platform-support.html 742cbd624adSopenharmony_ci// 743cbd624adSopenharmony_ci// # Supported 744cbd624adSopenharmony_ci// 745cbd624adSopenharmony_ci// Platforms where native 128-bit multiplication is explicitly supported: 746cbd624adSopenharmony_ci// - x86_64 (Supported via `MUL`). 747cbd624adSopenharmony_ci// - mips64 (Supported via `DMULTU`, which `HI` and `LO` can be read-from). 748cbd624adSopenharmony_ci// - s390x (Supported via `MLGR`). 749cbd624adSopenharmony_ci// 750cbd624adSopenharmony_ci// # Efficient 751cbd624adSopenharmony_ci// 752cbd624adSopenharmony_ci// Platforms where native 64-bit multiplication is supported and 753cbd624adSopenharmony_ci// you can extract hi-lo for 64-bit multiplications. 754cbd624adSopenharmony_ci// - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits). 755cbd624adSopenharmony_ci// - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low bits). 756cbd624adSopenharmony_ci// - riscv64 (Requires `MUL` and `MULH` to capture high and low bits). 757cbd624adSopenharmony_ci// 758cbd624adSopenharmony_ci// # Unsupported 759cbd624adSopenharmony_ci// 760cbd624adSopenharmony_ci// Platforms where native 128-bit multiplication is not supported, 761cbd624adSopenharmony_ci// requiring software emulation. 762cbd624adSopenharmony_ci// sparc64 (`UMUL` only supports double-word arguments). 763cbd624adSopenharmony_ci// sparcv9 (Same as sparc64). 764cbd624adSopenharmony_ci// 765cbd624adSopenharmony_ci// These tests are run via `xcross`, my own library for C cross-compiling, 766cbd624adSopenharmony_ci// which supports numerous targets (far in excess of Rust's tier 1 support, 767cbd624adSopenharmony_ci// or rust-embedded/cross's list). xcross may be found here: 768cbd624adSopenharmony_ci// https://github.com/Alexhuszagh/xcross 769cbd624adSopenharmony_ci// 770cbd624adSopenharmony_ci// To compile for the given target, run: 771cbd624adSopenharmony_ci// `xcross gcc main.c -c -S -O3 --target $target` 772cbd624adSopenharmony_ci// 773cbd624adSopenharmony_ci// All 32-bit architectures inherently do not have support. That means 774cbd624adSopenharmony_ci// we can essentially look for 64-bit architectures that are not SPARC. 775cbd624adSopenharmony_ci 776cbd624adSopenharmony_ci#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))] 777cbd624adSopenharmony_cipub type Limb = u64; 778cbd624adSopenharmony_ci#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))] 779cbd624adSopenharmony_cipub type Wide = u128; 780cbd624adSopenharmony_ci#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))] 781cbd624adSopenharmony_cipub const LIMB_BITS: usize = 64; 782cbd624adSopenharmony_ci 783cbd624adSopenharmony_ci#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))] 784cbd624adSopenharmony_cipub type Limb = u32; 785cbd624adSopenharmony_ci#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))] 786cbd624adSopenharmony_cipub type Wide = u64; 787cbd624adSopenharmony_ci#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))] 788cbd624adSopenharmony_cipub const LIMB_BITS: usize = 32; 789