18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* mpihelp-div.c - MPI helper functions 38c2ecf20Sopenharmony_ci * Copyright (C) 1994, 1996 Free Software Foundation, Inc. 48c2ecf20Sopenharmony_ci * Copyright (C) 1998, 1999 Free Software Foundation, Inc. 58c2ecf20Sopenharmony_ci * 68c2ecf20Sopenharmony_ci * This file is part of GnuPG. 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * Note: This code is heavily based on the GNU MP Library. 98c2ecf20Sopenharmony_ci * Actually it's the same code with only minor changes in the 108c2ecf20Sopenharmony_ci * way the data is stored; this is to support the abstraction 118c2ecf20Sopenharmony_ci * of an optional secure memory allocation which may be used 128c2ecf20Sopenharmony_ci * to avoid revealing of sensitive data due to paging etc. 138c2ecf20Sopenharmony_ci * The GNU MP Library itself is published under the LGPL; 148c2ecf20Sopenharmony_ci * however I decided to publish this code under the plain GPL. 158c2ecf20Sopenharmony_ci */ 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ci#include "mpi-internal.h" 188c2ecf20Sopenharmony_ci#include "longlong.h" 198c2ecf20Sopenharmony_ci 208c2ecf20Sopenharmony_ci#ifndef UMUL_TIME 218c2ecf20Sopenharmony_ci#define UMUL_TIME 1 228c2ecf20Sopenharmony_ci#endif 238c2ecf20Sopenharmony_ci#ifndef UDIV_TIME 248c2ecf20Sopenharmony_ci#define UDIV_TIME UMUL_TIME 258c2ecf20Sopenharmony_ci#endif 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_ci 288c2ecf20Sopenharmony_cimpi_limb_t 298c2ecf20Sopenharmony_cimpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, 308c2ecf20Sopenharmony_ci mpi_limb_t divisor_limb) 318c2ecf20Sopenharmony_ci{ 328c2ecf20Sopenharmony_ci mpi_size_t i; 338c2ecf20Sopenharmony_ci mpi_limb_t n1, n0, r; 348c2ecf20Sopenharmony_ci mpi_limb_t dummy __maybe_unused; 358c2ecf20Sopenharmony_ci 368c2ecf20Sopenharmony_ci /* Botch: Should this be handled at all? Rely on callers? */ 378c2ecf20Sopenharmony_ci if (!dividend_size) 388c2ecf20Sopenharmony_ci return 0; 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_ci /* If multiplication is much faster than division, and the 418c2ecf20Sopenharmony_ci * dividend is large, pre-invert the divisor, and use 428c2ecf20Sopenharmony_ci * only multiplications in the inner loop. 438c2ecf20Sopenharmony_ci * 448c2ecf20Sopenharmony_ci * This test should be read: 458c2ecf20Sopenharmony_ci * Does it ever help to use udiv_qrnnd_preinv? 468c2ecf20Sopenharmony_ci * && Does what we save compensate for the inversion overhead? 478c2ecf20Sopenharmony_ci */ 488c2ecf20Sopenharmony_ci if (UDIV_TIME > (2 * UMUL_TIME + 6) 498c2ecf20Sopenharmony_ci && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) { 508c2ecf20Sopenharmony_ci int normalization_steps; 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_ci normalization_steps = count_leading_zeros(divisor_limb); 538c2ecf20Sopenharmony_ci if (normalization_steps) { 548c2ecf20Sopenharmony_ci mpi_limb_t divisor_limb_inverted; 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_ci divisor_limb <<= normalization_steps; 578c2ecf20Sopenharmony_ci 588c2ecf20Sopenharmony_ci /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The 598c2ecf20Sopenharmony_ci * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the 608c2ecf20Sopenharmony_ci * most significant bit (with weight 2**N) implicit. 618c2ecf20Sopenharmony_ci * 628c2ecf20Sopenharmony_ci * Special case for DIVISOR_LIMB == 100...000. 638c2ecf20Sopenharmony_ci */ 648c2ecf20Sopenharmony_ci if (!(divisor_limb << 1)) 658c2ecf20Sopenharmony_ci divisor_limb_inverted = ~(mpi_limb_t)0; 668c2ecf20Sopenharmony_ci else 678c2ecf20Sopenharmony_ci udiv_qrnnd(divisor_limb_inverted, dummy, 688c2ecf20Sopenharmony_ci -divisor_limb, 0, divisor_limb); 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_ci n1 = dividend_ptr[dividend_size - 1]; 718c2ecf20Sopenharmony_ci r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); 728c2ecf20Sopenharmony_ci 738c2ecf20Sopenharmony_ci /* Possible optimization: 748c2ecf20Sopenharmony_ci * if (r == 0 758c2ecf20Sopenharmony_ci * && divisor_limb > ((n1 << normalization_steps) 768c2ecf20Sopenharmony_ci * | (dividend_ptr[dividend_size - 2] >> ...))) 778c2ecf20Sopenharmony_ci * ...one division less... 788c2ecf20Sopenharmony_ci */ 798c2ecf20Sopenharmony_ci for (i = dividend_size - 2; i >= 0; i--) { 808c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 818c2ecf20Sopenharmony_ci UDIV_QRNND_PREINV(dummy, r, r, 828c2ecf20Sopenharmony_ci ((n1 << normalization_steps) 838c2ecf20Sopenharmony_ci | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), 848c2ecf20Sopenharmony_ci divisor_limb, divisor_limb_inverted); 858c2ecf20Sopenharmony_ci n1 = n0; 868c2ecf20Sopenharmony_ci } 878c2ecf20Sopenharmony_ci UDIV_QRNND_PREINV(dummy, r, r, 888c2ecf20Sopenharmony_ci n1 << normalization_steps, 898c2ecf20Sopenharmony_ci divisor_limb, divisor_limb_inverted); 908c2ecf20Sopenharmony_ci return r >> normalization_steps; 918c2ecf20Sopenharmony_ci } else { 928c2ecf20Sopenharmony_ci mpi_limb_t divisor_limb_inverted; 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_ci /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The 958c2ecf20Sopenharmony_ci * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the 968c2ecf20Sopenharmony_ci * most significant bit (with weight 2**N) implicit. 978c2ecf20Sopenharmony_ci * 988c2ecf20Sopenharmony_ci * Special case for DIVISOR_LIMB == 100...000. 998c2ecf20Sopenharmony_ci */ 1008c2ecf20Sopenharmony_ci if (!(divisor_limb << 1)) 1018c2ecf20Sopenharmony_ci divisor_limb_inverted = ~(mpi_limb_t)0; 1028c2ecf20Sopenharmony_ci else 1038c2ecf20Sopenharmony_ci udiv_qrnnd(divisor_limb_inverted, dummy, 1048c2ecf20Sopenharmony_ci -divisor_limb, 0, divisor_limb); 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci i = dividend_size - 1; 1078c2ecf20Sopenharmony_ci r = dividend_ptr[i]; 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci if (r >= divisor_limb) 1108c2ecf20Sopenharmony_ci r = 0; 1118c2ecf20Sopenharmony_ci else 1128c2ecf20Sopenharmony_ci i--; 1138c2ecf20Sopenharmony_ci 1148c2ecf20Sopenharmony_ci for ( ; i >= 0; i--) { 1158c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 1168c2ecf20Sopenharmony_ci UDIV_QRNND_PREINV(dummy, r, r, 1178c2ecf20Sopenharmony_ci n0, divisor_limb, divisor_limb_inverted); 1188c2ecf20Sopenharmony_ci } 1198c2ecf20Sopenharmony_ci return r; 1208c2ecf20Sopenharmony_ci } 1218c2ecf20Sopenharmony_ci } else { 1228c2ecf20Sopenharmony_ci if (UDIV_NEEDS_NORMALIZATION) { 1238c2ecf20Sopenharmony_ci int normalization_steps; 1248c2ecf20Sopenharmony_ci 1258c2ecf20Sopenharmony_ci normalization_steps = count_leading_zeros(divisor_limb); 1268c2ecf20Sopenharmony_ci if (normalization_steps) { 1278c2ecf20Sopenharmony_ci divisor_limb <<= normalization_steps; 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_ci n1 = dividend_ptr[dividend_size - 1]; 1308c2ecf20Sopenharmony_ci r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci /* Possible optimization: 1338c2ecf20Sopenharmony_ci * if (r == 0 1348c2ecf20Sopenharmony_ci * && divisor_limb > ((n1 << normalization_steps) 1358c2ecf20Sopenharmony_ci * | (dividend_ptr[dividend_size - 2] >> ...))) 1368c2ecf20Sopenharmony_ci * ...one division less... 1378c2ecf20Sopenharmony_ci */ 1388c2ecf20Sopenharmony_ci for (i = dividend_size - 2; i >= 0; i--) { 1398c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 1408c2ecf20Sopenharmony_ci udiv_qrnnd(dummy, r, r, 1418c2ecf20Sopenharmony_ci ((n1 << normalization_steps) 1428c2ecf20Sopenharmony_ci | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), 1438c2ecf20Sopenharmony_ci divisor_limb); 1448c2ecf20Sopenharmony_ci n1 = n0; 1458c2ecf20Sopenharmony_ci } 1468c2ecf20Sopenharmony_ci udiv_qrnnd(dummy, r, r, 1478c2ecf20Sopenharmony_ci n1 << normalization_steps, 1488c2ecf20Sopenharmony_ci divisor_limb); 1498c2ecf20Sopenharmony_ci return r >> normalization_steps; 1508c2ecf20Sopenharmony_ci } 1518c2ecf20Sopenharmony_ci } 1528c2ecf20Sopenharmony_ci /* No normalization needed, either because udiv_qrnnd doesn't require 1538c2ecf20Sopenharmony_ci * it, or because DIVISOR_LIMB is already normalized. 1548c2ecf20Sopenharmony_ci */ 1558c2ecf20Sopenharmony_ci i = dividend_size - 1; 1568c2ecf20Sopenharmony_ci r = dividend_ptr[i]; 1578c2ecf20Sopenharmony_ci 1588c2ecf20Sopenharmony_ci if (r >= divisor_limb) 1598c2ecf20Sopenharmony_ci r = 0; 1608c2ecf20Sopenharmony_ci else 1618c2ecf20Sopenharmony_ci i--; 1628c2ecf20Sopenharmony_ci 1638c2ecf20Sopenharmony_ci for (; i >= 0; i--) { 1648c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 1658c2ecf20Sopenharmony_ci udiv_qrnnd(dummy, r, r, n0, divisor_limb); 1668c2ecf20Sopenharmony_ci } 1678c2ecf20Sopenharmony_ci return r; 1688c2ecf20Sopenharmony_ci } 1698c2ecf20Sopenharmony_ci} 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write 1728c2ecf20Sopenharmony_ci * the NSIZE-DSIZE least significant quotient limbs at QP 1738c2ecf20Sopenharmony_ci * and the DSIZE long remainder at NP. If QEXTRA_LIMBS is 1748c2ecf20Sopenharmony_ci * non-zero, generate that many fraction bits and append them after the 1758c2ecf20Sopenharmony_ci * other quotient limbs. 1768c2ecf20Sopenharmony_ci * Return the most significant limb of the quotient, this is always 0 or 1. 1778c2ecf20Sopenharmony_ci * 1788c2ecf20Sopenharmony_ci * Preconditions: 1798c2ecf20Sopenharmony_ci * 0. NSIZE >= DSIZE. 1808c2ecf20Sopenharmony_ci * 1. The most significant bit of the divisor must be set. 1818c2ecf20Sopenharmony_ci * 2. QP must either not overlap with the input operands at all, or 1828c2ecf20Sopenharmony_ci * QP + DSIZE >= NP must hold true. (This means that it's 1838c2ecf20Sopenharmony_ci * possible to put the quotient in the high part of NUM, right after the 1848c2ecf20Sopenharmony_ci * remainder in NUM. 1858c2ecf20Sopenharmony_ci * 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. 1868c2ecf20Sopenharmony_ci */ 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_cimpi_limb_t 1898c2ecf20Sopenharmony_cimpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs, 1908c2ecf20Sopenharmony_ci mpi_ptr_t np, mpi_size_t nsize, mpi_ptr_t dp, mpi_size_t dsize) 1918c2ecf20Sopenharmony_ci{ 1928c2ecf20Sopenharmony_ci mpi_limb_t most_significant_q_limb = 0; 1938c2ecf20Sopenharmony_ci 1948c2ecf20Sopenharmony_ci switch (dsize) { 1958c2ecf20Sopenharmony_ci case 0: 1968c2ecf20Sopenharmony_ci /* We are asked to divide by zero, so go ahead and do it! (To make 1978c2ecf20Sopenharmony_ci the compiler not remove this statement, return the value.) */ 1988c2ecf20Sopenharmony_ci /* 1998c2ecf20Sopenharmony_ci * existing clients of this function have been modified 2008c2ecf20Sopenharmony_ci * not to call it with dsize == 0, so this should not happen 2018c2ecf20Sopenharmony_ci */ 2028c2ecf20Sopenharmony_ci return 1 / dsize; 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci case 1: 2058c2ecf20Sopenharmony_ci { 2068c2ecf20Sopenharmony_ci mpi_size_t i; 2078c2ecf20Sopenharmony_ci mpi_limb_t n1; 2088c2ecf20Sopenharmony_ci mpi_limb_t d; 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci d = dp[0]; 2118c2ecf20Sopenharmony_ci n1 = np[nsize - 1]; 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci if (n1 >= d) { 2148c2ecf20Sopenharmony_ci n1 -= d; 2158c2ecf20Sopenharmony_ci most_significant_q_limb = 1; 2168c2ecf20Sopenharmony_ci } 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_ci qp += qextra_limbs; 2198c2ecf20Sopenharmony_ci for (i = nsize - 2; i >= 0; i--) 2208c2ecf20Sopenharmony_ci udiv_qrnnd(qp[i], n1, n1, np[i], d); 2218c2ecf20Sopenharmony_ci qp -= qextra_limbs; 2228c2ecf20Sopenharmony_ci 2238c2ecf20Sopenharmony_ci for (i = qextra_limbs - 1; i >= 0; i--) 2248c2ecf20Sopenharmony_ci udiv_qrnnd(qp[i], n1, n1, 0, d); 2258c2ecf20Sopenharmony_ci 2268c2ecf20Sopenharmony_ci np[0] = n1; 2278c2ecf20Sopenharmony_ci } 2288c2ecf20Sopenharmony_ci break; 2298c2ecf20Sopenharmony_ci 2308c2ecf20Sopenharmony_ci case 2: 2318c2ecf20Sopenharmony_ci { 2328c2ecf20Sopenharmony_ci mpi_size_t i; 2338c2ecf20Sopenharmony_ci mpi_limb_t n1, n0, n2; 2348c2ecf20Sopenharmony_ci mpi_limb_t d1, d0; 2358c2ecf20Sopenharmony_ci 2368c2ecf20Sopenharmony_ci np += nsize - 2; 2378c2ecf20Sopenharmony_ci d1 = dp[1]; 2388c2ecf20Sopenharmony_ci d0 = dp[0]; 2398c2ecf20Sopenharmony_ci n1 = np[1]; 2408c2ecf20Sopenharmony_ci n0 = np[0]; 2418c2ecf20Sopenharmony_ci 2428c2ecf20Sopenharmony_ci if (n1 >= d1 && (n1 > d1 || n0 >= d0)) { 2438c2ecf20Sopenharmony_ci sub_ddmmss(n1, n0, n1, n0, d1, d0); 2448c2ecf20Sopenharmony_ci most_significant_q_limb = 1; 2458c2ecf20Sopenharmony_ci } 2468c2ecf20Sopenharmony_ci 2478c2ecf20Sopenharmony_ci for (i = qextra_limbs + nsize - 2 - 1; i >= 0; i--) { 2488c2ecf20Sopenharmony_ci mpi_limb_t q; 2498c2ecf20Sopenharmony_ci mpi_limb_t r; 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_ci if (i >= qextra_limbs) 2528c2ecf20Sopenharmony_ci np--; 2538c2ecf20Sopenharmony_ci else 2548c2ecf20Sopenharmony_ci np[0] = 0; 2558c2ecf20Sopenharmony_ci 2568c2ecf20Sopenharmony_ci if (n1 == d1) { 2578c2ecf20Sopenharmony_ci /* Q should be either 111..111 or 111..110. Need special 2588c2ecf20Sopenharmony_ci * treatment of this rare case as normal division would 2598c2ecf20Sopenharmony_ci * give overflow. */ 2608c2ecf20Sopenharmony_ci q = ~(mpi_limb_t) 0; 2618c2ecf20Sopenharmony_ci 2628c2ecf20Sopenharmony_ci r = n0 + d1; 2638c2ecf20Sopenharmony_ci if (r < d1) { /* Carry in the addition? */ 2648c2ecf20Sopenharmony_ci add_ssaaaa(n1, n0, r - d0, 2658c2ecf20Sopenharmony_ci np[0], 0, d0); 2668c2ecf20Sopenharmony_ci qp[i] = q; 2678c2ecf20Sopenharmony_ci continue; 2688c2ecf20Sopenharmony_ci } 2698c2ecf20Sopenharmony_ci n1 = d0 - (d0 != 0 ? 1 : 0); 2708c2ecf20Sopenharmony_ci n0 = -d0; 2718c2ecf20Sopenharmony_ci } else { 2728c2ecf20Sopenharmony_ci udiv_qrnnd(q, r, n1, n0, d1); 2738c2ecf20Sopenharmony_ci umul_ppmm(n1, n0, d0, q); 2748c2ecf20Sopenharmony_ci } 2758c2ecf20Sopenharmony_ci 2768c2ecf20Sopenharmony_ci n2 = np[0]; 2778c2ecf20Sopenharmony_ciq_test: 2788c2ecf20Sopenharmony_ci if (n1 > r || (n1 == r && n0 > n2)) { 2798c2ecf20Sopenharmony_ci /* The estimated Q was too large. */ 2808c2ecf20Sopenharmony_ci q--; 2818c2ecf20Sopenharmony_ci sub_ddmmss(n1, n0, n1, n0, 0, d0); 2828c2ecf20Sopenharmony_ci r += d1; 2838c2ecf20Sopenharmony_ci if (r >= d1) /* If not carry, test Q again. */ 2848c2ecf20Sopenharmony_ci goto q_test; 2858c2ecf20Sopenharmony_ci } 2868c2ecf20Sopenharmony_ci 2878c2ecf20Sopenharmony_ci qp[i] = q; 2888c2ecf20Sopenharmony_ci sub_ddmmss(n1, n0, r, n2, n1, n0); 2898c2ecf20Sopenharmony_ci } 2908c2ecf20Sopenharmony_ci np[1] = n1; 2918c2ecf20Sopenharmony_ci np[0] = n0; 2928c2ecf20Sopenharmony_ci } 2938c2ecf20Sopenharmony_ci break; 2948c2ecf20Sopenharmony_ci 2958c2ecf20Sopenharmony_ci default: 2968c2ecf20Sopenharmony_ci { 2978c2ecf20Sopenharmony_ci mpi_size_t i; 2988c2ecf20Sopenharmony_ci mpi_limb_t dX, d1, n0; 2998c2ecf20Sopenharmony_ci 3008c2ecf20Sopenharmony_ci np += nsize - dsize; 3018c2ecf20Sopenharmony_ci dX = dp[dsize - 1]; 3028c2ecf20Sopenharmony_ci d1 = dp[dsize - 2]; 3038c2ecf20Sopenharmony_ci n0 = np[dsize - 1]; 3048c2ecf20Sopenharmony_ci 3058c2ecf20Sopenharmony_ci if (n0 >= dX) { 3068c2ecf20Sopenharmony_ci if (n0 > dX 3078c2ecf20Sopenharmony_ci || mpihelp_cmp(np, dp, dsize - 1) >= 0) { 3088c2ecf20Sopenharmony_ci mpihelp_sub_n(np, np, dp, dsize); 3098c2ecf20Sopenharmony_ci n0 = np[dsize - 1]; 3108c2ecf20Sopenharmony_ci most_significant_q_limb = 1; 3118c2ecf20Sopenharmony_ci } 3128c2ecf20Sopenharmony_ci } 3138c2ecf20Sopenharmony_ci 3148c2ecf20Sopenharmony_ci for (i = qextra_limbs + nsize - dsize - 1; i >= 0; i--) { 3158c2ecf20Sopenharmony_ci mpi_limb_t q; 3168c2ecf20Sopenharmony_ci mpi_limb_t n1, n2; 3178c2ecf20Sopenharmony_ci mpi_limb_t cy_limb; 3188c2ecf20Sopenharmony_ci 3198c2ecf20Sopenharmony_ci if (i >= qextra_limbs) { 3208c2ecf20Sopenharmony_ci np--; 3218c2ecf20Sopenharmony_ci n2 = np[dsize]; 3228c2ecf20Sopenharmony_ci } else { 3238c2ecf20Sopenharmony_ci n2 = np[dsize - 1]; 3248c2ecf20Sopenharmony_ci MPN_COPY_DECR(np + 1, np, dsize - 1); 3258c2ecf20Sopenharmony_ci np[0] = 0; 3268c2ecf20Sopenharmony_ci } 3278c2ecf20Sopenharmony_ci 3288c2ecf20Sopenharmony_ci if (n0 == dX) { 3298c2ecf20Sopenharmony_ci /* This might over-estimate q, but it's probably not worth 3308c2ecf20Sopenharmony_ci * the extra code here to find out. */ 3318c2ecf20Sopenharmony_ci q = ~(mpi_limb_t) 0; 3328c2ecf20Sopenharmony_ci } else { 3338c2ecf20Sopenharmony_ci mpi_limb_t r; 3348c2ecf20Sopenharmony_ci 3358c2ecf20Sopenharmony_ci udiv_qrnnd(q, r, n0, np[dsize - 1], dX); 3368c2ecf20Sopenharmony_ci umul_ppmm(n1, n0, d1, q); 3378c2ecf20Sopenharmony_ci 3388c2ecf20Sopenharmony_ci while (n1 > r 3398c2ecf20Sopenharmony_ci || (n1 == r 3408c2ecf20Sopenharmony_ci && n0 > np[dsize - 2])) { 3418c2ecf20Sopenharmony_ci q--; 3428c2ecf20Sopenharmony_ci r += dX; 3438c2ecf20Sopenharmony_ci if (r < dX) /* I.e. "carry in previous addition?" */ 3448c2ecf20Sopenharmony_ci break; 3458c2ecf20Sopenharmony_ci n1 -= n0 < d1; 3468c2ecf20Sopenharmony_ci n0 -= d1; 3478c2ecf20Sopenharmony_ci } 3488c2ecf20Sopenharmony_ci } 3498c2ecf20Sopenharmony_ci 3508c2ecf20Sopenharmony_ci /* Possible optimization: We already have (q * n0) and (1 * n1) 3518c2ecf20Sopenharmony_ci * after the calculation of q. Taking advantage of that, we 3528c2ecf20Sopenharmony_ci * could make this loop make two iterations less. */ 3538c2ecf20Sopenharmony_ci cy_limb = mpihelp_submul_1(np, dp, dsize, q); 3548c2ecf20Sopenharmony_ci 3558c2ecf20Sopenharmony_ci if (n2 != cy_limb) { 3568c2ecf20Sopenharmony_ci mpihelp_add_n(np, np, dp, dsize); 3578c2ecf20Sopenharmony_ci q--; 3588c2ecf20Sopenharmony_ci } 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci qp[i] = q; 3618c2ecf20Sopenharmony_ci n0 = np[dsize - 1]; 3628c2ecf20Sopenharmony_ci } 3638c2ecf20Sopenharmony_ci } 3648c2ecf20Sopenharmony_ci } 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci return most_significant_q_limb; 3678c2ecf20Sopenharmony_ci} 3688c2ecf20Sopenharmony_ci 3698c2ecf20Sopenharmony_ci/**************** 3708c2ecf20Sopenharmony_ci * Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. 3718c2ecf20Sopenharmony_ci * Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. 3728c2ecf20Sopenharmony_ci * Return the single-limb remainder. 3738c2ecf20Sopenharmony_ci * There are no constraints on the value of the divisor. 3748c2ecf20Sopenharmony_ci * 3758c2ecf20Sopenharmony_ci * QUOT_PTR and DIVIDEND_PTR might point to the same limb. 3768c2ecf20Sopenharmony_ci */ 3778c2ecf20Sopenharmony_ci 3788c2ecf20Sopenharmony_cimpi_limb_t 3798c2ecf20Sopenharmony_cimpihelp_divmod_1(mpi_ptr_t quot_ptr, 3808c2ecf20Sopenharmony_ci mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, 3818c2ecf20Sopenharmony_ci mpi_limb_t divisor_limb) 3828c2ecf20Sopenharmony_ci{ 3838c2ecf20Sopenharmony_ci mpi_size_t i; 3848c2ecf20Sopenharmony_ci mpi_limb_t n1, n0, r; 3858c2ecf20Sopenharmony_ci mpi_limb_t dummy __maybe_unused; 3868c2ecf20Sopenharmony_ci 3878c2ecf20Sopenharmony_ci if (!dividend_size) 3888c2ecf20Sopenharmony_ci return 0; 3898c2ecf20Sopenharmony_ci 3908c2ecf20Sopenharmony_ci /* If multiplication is much faster than division, and the 3918c2ecf20Sopenharmony_ci * dividend is large, pre-invert the divisor, and use 3928c2ecf20Sopenharmony_ci * only multiplications in the inner loop. 3938c2ecf20Sopenharmony_ci * 3948c2ecf20Sopenharmony_ci * This test should be read: 3958c2ecf20Sopenharmony_ci * Does it ever help to use udiv_qrnnd_preinv? 3968c2ecf20Sopenharmony_ci * && Does what we save compensate for the inversion overhead? 3978c2ecf20Sopenharmony_ci */ 3988c2ecf20Sopenharmony_ci if (UDIV_TIME > (2 * UMUL_TIME + 6) 3998c2ecf20Sopenharmony_ci && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) { 4008c2ecf20Sopenharmony_ci int normalization_steps; 4018c2ecf20Sopenharmony_ci 4028c2ecf20Sopenharmony_ci normalization_steps = count_leading_zeros(divisor_limb); 4038c2ecf20Sopenharmony_ci if (normalization_steps) { 4048c2ecf20Sopenharmony_ci mpi_limb_t divisor_limb_inverted; 4058c2ecf20Sopenharmony_ci 4068c2ecf20Sopenharmony_ci divisor_limb <<= normalization_steps; 4078c2ecf20Sopenharmony_ci 4088c2ecf20Sopenharmony_ci /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The 4098c2ecf20Sopenharmony_ci * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the 4108c2ecf20Sopenharmony_ci * most significant bit (with weight 2**N) implicit. 4118c2ecf20Sopenharmony_ci */ 4128c2ecf20Sopenharmony_ci /* Special case for DIVISOR_LIMB == 100...000. */ 4138c2ecf20Sopenharmony_ci if (!(divisor_limb << 1)) 4148c2ecf20Sopenharmony_ci divisor_limb_inverted = ~(mpi_limb_t)0; 4158c2ecf20Sopenharmony_ci else 4168c2ecf20Sopenharmony_ci udiv_qrnnd(divisor_limb_inverted, dummy, 4178c2ecf20Sopenharmony_ci -divisor_limb, 0, divisor_limb); 4188c2ecf20Sopenharmony_ci 4198c2ecf20Sopenharmony_ci n1 = dividend_ptr[dividend_size - 1]; 4208c2ecf20Sopenharmony_ci r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); 4218c2ecf20Sopenharmony_ci 4228c2ecf20Sopenharmony_ci /* Possible optimization: 4238c2ecf20Sopenharmony_ci * if (r == 0 4248c2ecf20Sopenharmony_ci * && divisor_limb > ((n1 << normalization_steps) 4258c2ecf20Sopenharmony_ci * | (dividend_ptr[dividend_size - 2] >> ...))) 4268c2ecf20Sopenharmony_ci * ...one division less... 4278c2ecf20Sopenharmony_ci */ 4288c2ecf20Sopenharmony_ci for (i = dividend_size - 2; i >= 0; i--) { 4298c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 4308c2ecf20Sopenharmony_ci UDIV_QRNND_PREINV(quot_ptr[i + 1], r, r, 4318c2ecf20Sopenharmony_ci ((n1 << normalization_steps) 4328c2ecf20Sopenharmony_ci | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), 4338c2ecf20Sopenharmony_ci divisor_limb, divisor_limb_inverted); 4348c2ecf20Sopenharmony_ci n1 = n0; 4358c2ecf20Sopenharmony_ci } 4368c2ecf20Sopenharmony_ci UDIV_QRNND_PREINV(quot_ptr[0], r, r, 4378c2ecf20Sopenharmony_ci n1 << normalization_steps, 4388c2ecf20Sopenharmony_ci divisor_limb, divisor_limb_inverted); 4398c2ecf20Sopenharmony_ci return r >> normalization_steps; 4408c2ecf20Sopenharmony_ci } else { 4418c2ecf20Sopenharmony_ci mpi_limb_t divisor_limb_inverted; 4428c2ecf20Sopenharmony_ci 4438c2ecf20Sopenharmony_ci /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The 4448c2ecf20Sopenharmony_ci * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the 4458c2ecf20Sopenharmony_ci * most significant bit (with weight 2**N) implicit. 4468c2ecf20Sopenharmony_ci */ 4478c2ecf20Sopenharmony_ci /* Special case for DIVISOR_LIMB == 100...000. */ 4488c2ecf20Sopenharmony_ci if (!(divisor_limb << 1)) 4498c2ecf20Sopenharmony_ci divisor_limb_inverted = ~(mpi_limb_t) 0; 4508c2ecf20Sopenharmony_ci else 4518c2ecf20Sopenharmony_ci udiv_qrnnd(divisor_limb_inverted, dummy, 4528c2ecf20Sopenharmony_ci -divisor_limb, 0, divisor_limb); 4538c2ecf20Sopenharmony_ci 4548c2ecf20Sopenharmony_ci i = dividend_size - 1; 4558c2ecf20Sopenharmony_ci r = dividend_ptr[i]; 4568c2ecf20Sopenharmony_ci 4578c2ecf20Sopenharmony_ci if (r >= divisor_limb) 4588c2ecf20Sopenharmony_ci r = 0; 4598c2ecf20Sopenharmony_ci else 4608c2ecf20Sopenharmony_ci quot_ptr[i--] = 0; 4618c2ecf20Sopenharmony_ci 4628c2ecf20Sopenharmony_ci for ( ; i >= 0; i--) { 4638c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 4648c2ecf20Sopenharmony_ci UDIV_QRNND_PREINV(quot_ptr[i], r, r, 4658c2ecf20Sopenharmony_ci n0, divisor_limb, divisor_limb_inverted); 4668c2ecf20Sopenharmony_ci } 4678c2ecf20Sopenharmony_ci return r; 4688c2ecf20Sopenharmony_ci } 4698c2ecf20Sopenharmony_ci } else { 4708c2ecf20Sopenharmony_ci if (UDIV_NEEDS_NORMALIZATION) { 4718c2ecf20Sopenharmony_ci int normalization_steps; 4728c2ecf20Sopenharmony_ci 4738c2ecf20Sopenharmony_ci normalization_steps = count_leading_zeros(divisor_limb); 4748c2ecf20Sopenharmony_ci if (normalization_steps) { 4758c2ecf20Sopenharmony_ci divisor_limb <<= normalization_steps; 4768c2ecf20Sopenharmony_ci 4778c2ecf20Sopenharmony_ci n1 = dividend_ptr[dividend_size - 1]; 4788c2ecf20Sopenharmony_ci r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_ci /* Possible optimization: 4818c2ecf20Sopenharmony_ci * if (r == 0 4828c2ecf20Sopenharmony_ci * && divisor_limb > ((n1 << normalization_steps) 4838c2ecf20Sopenharmony_ci * | (dividend_ptr[dividend_size - 2] >> ...))) 4848c2ecf20Sopenharmony_ci * ...one division less... 4858c2ecf20Sopenharmony_ci */ 4868c2ecf20Sopenharmony_ci for (i = dividend_size - 2; i >= 0; i--) { 4878c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 4888c2ecf20Sopenharmony_ci udiv_qrnnd(quot_ptr[i + 1], r, r, 4898c2ecf20Sopenharmony_ci ((n1 << normalization_steps) 4908c2ecf20Sopenharmony_ci | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), 4918c2ecf20Sopenharmony_ci divisor_limb); 4928c2ecf20Sopenharmony_ci n1 = n0; 4938c2ecf20Sopenharmony_ci } 4948c2ecf20Sopenharmony_ci udiv_qrnnd(quot_ptr[0], r, r, 4958c2ecf20Sopenharmony_ci n1 << normalization_steps, 4968c2ecf20Sopenharmony_ci divisor_limb); 4978c2ecf20Sopenharmony_ci return r >> normalization_steps; 4988c2ecf20Sopenharmony_ci } 4998c2ecf20Sopenharmony_ci } 5008c2ecf20Sopenharmony_ci /* No normalization needed, either because udiv_qrnnd doesn't require 5018c2ecf20Sopenharmony_ci * it, or because DIVISOR_LIMB is already normalized. 5028c2ecf20Sopenharmony_ci */ 5038c2ecf20Sopenharmony_ci i = dividend_size - 1; 5048c2ecf20Sopenharmony_ci r = dividend_ptr[i]; 5058c2ecf20Sopenharmony_ci 5068c2ecf20Sopenharmony_ci if (r >= divisor_limb) 5078c2ecf20Sopenharmony_ci r = 0; 5088c2ecf20Sopenharmony_ci else 5098c2ecf20Sopenharmony_ci quot_ptr[i--] = 0; 5108c2ecf20Sopenharmony_ci 5118c2ecf20Sopenharmony_ci for (; i >= 0; i--) { 5128c2ecf20Sopenharmony_ci n0 = dividend_ptr[i]; 5138c2ecf20Sopenharmony_ci udiv_qrnnd(quot_ptr[i], r, r, n0, divisor_limb); 5148c2ecf20Sopenharmony_ci } 5158c2ecf20Sopenharmony_ci return r; 5168c2ecf20Sopenharmony_ci } 5178c2ecf20Sopenharmony_ci} 518