11cb0ef41Sopenharmony_ci// Copyright 2021 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci// FFT-based multiplication, due to Schönhage and Strassen. 61cb0ef41Sopenharmony_ci// This implementation mostly follows the description given in: 71cb0ef41Sopenharmony_ci// Christoph Lüders: Fast Multiplication of Large Integers, 81cb0ef41Sopenharmony_ci// http://arxiv.org/abs/1503.04955 91cb0ef41Sopenharmony_ci 101cb0ef41Sopenharmony_ci#include "src/bigint/bigint-internal.h" 111cb0ef41Sopenharmony_ci#include "src/bigint/digit-arithmetic.h" 121cb0ef41Sopenharmony_ci#include "src/bigint/util.h" 131cb0ef41Sopenharmony_ci#include "src/bigint/vector-arithmetic.h" 141cb0ef41Sopenharmony_ci 151cb0ef41Sopenharmony_cinamespace v8 { 161cb0ef41Sopenharmony_cinamespace bigint { 171cb0ef41Sopenharmony_ci 181cb0ef41Sopenharmony_cinamespace { 191cb0ef41Sopenharmony_ci 201cb0ef41Sopenharmony_ci//////////////////////////////////////////////////////////////////////////////// 211cb0ef41Sopenharmony_ci// Part 1: Functions for "mod F_n" arithmetic. 221cb0ef41Sopenharmony_ci// F_n is of the shape 2^K + 1, and for convenience we use K to count the 231cb0ef41Sopenharmony_ci// number of digits rather than the number of bits, so F_n (or K) are implicit 241cb0ef41Sopenharmony_ci// and deduced from the length {len} of the digits array. 251cb0ef41Sopenharmony_ci 261cb0ef41Sopenharmony_ci// Helper function for {ModFn} below. 271cb0ef41Sopenharmony_civoid ModFn_Helper(digit_t* x, int len, signed_digit_t high) { 281cb0ef41Sopenharmony_ci if (high > 0) { 291cb0ef41Sopenharmony_ci digit_t borrow = high; 301cb0ef41Sopenharmony_ci x[len - 1] = 0; 311cb0ef41Sopenharmony_ci for (int i = 0; i < len; i++) { 321cb0ef41Sopenharmony_ci x[i] = digit_sub(x[i], borrow, &borrow); 331cb0ef41Sopenharmony_ci if (borrow == 0) break; 341cb0ef41Sopenharmony_ci } 351cb0ef41Sopenharmony_ci } else { 361cb0ef41Sopenharmony_ci digit_t carry = -high; 371cb0ef41Sopenharmony_ci x[len - 1] = 0; 381cb0ef41Sopenharmony_ci for (int i = 0; i < len; i++) { 391cb0ef41Sopenharmony_ci x[i] = digit_add2(x[i], carry, &carry); 401cb0ef41Sopenharmony_ci if (carry == 0) break; 411cb0ef41Sopenharmony_ci } 421cb0ef41Sopenharmony_ci } 431cb0ef41Sopenharmony_ci} 441cb0ef41Sopenharmony_ci 451cb0ef41Sopenharmony_ci// {x} := {x} mod F_n, assuming that {x} is "slightly" larger than F_n (e.g. 461cb0ef41Sopenharmony_ci// after addition of two numbers that were mod-F_n-normalized before). 471cb0ef41Sopenharmony_civoid ModFn(digit_t* x, int len) { 481cb0ef41Sopenharmony_ci int K = len - 1; 491cb0ef41Sopenharmony_ci signed_digit_t high = x[K]; 501cb0ef41Sopenharmony_ci if (high == 0) return; 511cb0ef41Sopenharmony_ci ModFn_Helper(x, len, high); 521cb0ef41Sopenharmony_ci high = x[K]; 531cb0ef41Sopenharmony_ci if (high == 0) return; 541cb0ef41Sopenharmony_ci DCHECK(high == 1 || high == -1); 551cb0ef41Sopenharmony_ci ModFn_Helper(x, len, high); 561cb0ef41Sopenharmony_ci high = x[K]; 571cb0ef41Sopenharmony_ci if (high == -1) ModFn_Helper(x, len, high); 581cb0ef41Sopenharmony_ci} 591cb0ef41Sopenharmony_ci 601cb0ef41Sopenharmony_ci// {dest} := {src} mod F_n, assuming that {src} is about twice as long as F_n 611cb0ef41Sopenharmony_ci// (e.g. after multiplication of two numbers that were mod-F_n-normalized 621cb0ef41Sopenharmony_ci// before). 631cb0ef41Sopenharmony_ci// {len} is length of {dest}; {src} is twice as long. 641cb0ef41Sopenharmony_civoid ModFnDoubleWidth(digit_t* dest, const digit_t* src, int len) { 651cb0ef41Sopenharmony_ci int K = len - 1; 661cb0ef41Sopenharmony_ci digit_t borrow = 0; 671cb0ef41Sopenharmony_ci for (int i = 0; i < K; i++) { 681cb0ef41Sopenharmony_ci dest[i] = digit_sub2(src[i], src[i + K], borrow, &borrow); 691cb0ef41Sopenharmony_ci } 701cb0ef41Sopenharmony_ci dest[K] = digit_sub2(0, src[2 * K], borrow, &borrow); 711cb0ef41Sopenharmony_ci // {borrow} may be non-zero here, that's OK as {ModFn} will take care of it. 721cb0ef41Sopenharmony_ci ModFn(dest, len); 731cb0ef41Sopenharmony_ci} 741cb0ef41Sopenharmony_ci 751cb0ef41Sopenharmony_ci// Sets {sum} := {a} + {b} and {diff} := {a} - {b}, which is more efficient 761cb0ef41Sopenharmony_ci// than computing sum and difference separately. Applies "mod F_n" normalization 771cb0ef41Sopenharmony_ci// to both results. 781cb0ef41Sopenharmony_civoid SumDiff(digit_t* sum, digit_t* diff, const digit_t* a, const digit_t* b, 791cb0ef41Sopenharmony_ci int len) { 801cb0ef41Sopenharmony_ci digit_t carry = 0; 811cb0ef41Sopenharmony_ci digit_t borrow = 0; 821cb0ef41Sopenharmony_ci for (int i = 0; i < len; i++) { 831cb0ef41Sopenharmony_ci // Read both values first, because inputs and outputs can overlap. 841cb0ef41Sopenharmony_ci digit_t ai = a[i]; 851cb0ef41Sopenharmony_ci digit_t bi = b[i]; 861cb0ef41Sopenharmony_ci sum[i] = digit_add3(ai, bi, carry, &carry); 871cb0ef41Sopenharmony_ci diff[i] = digit_sub2(ai, bi, borrow, &borrow); 881cb0ef41Sopenharmony_ci } 891cb0ef41Sopenharmony_ci ModFn(sum, len); 901cb0ef41Sopenharmony_ci ModFn(diff, len); 911cb0ef41Sopenharmony_ci} 921cb0ef41Sopenharmony_ci 931cb0ef41Sopenharmony_ci// {result} := ({input} << shift) mod F_n, where shift >= K. 941cb0ef41Sopenharmony_civoid ShiftModFn_Large(digit_t* result, const digit_t* input, int digit_shift, 951cb0ef41Sopenharmony_ci int bits_shift, int K) { 961cb0ef41Sopenharmony_ci // If {digit_shift} is greater than K, we use the following transformation 971cb0ef41Sopenharmony_ci // (where, since everything is mod 2^K + 1, we are allowed to add or 981cb0ef41Sopenharmony_ci // subtract any multiple of 2^K + 1 at any time): 991cb0ef41Sopenharmony_ci // x * 2^{K+m} mod 2^K + 1 1001cb0ef41Sopenharmony_ci // == x * 2^K * 2^m - (2^K + 1)*(x * 2^m) mod 2^K + 1 1011cb0ef41Sopenharmony_ci // == x * 2^K * 2^m - x * 2^K * 2^m - x * 2^m mod 2^K + 1 1021cb0ef41Sopenharmony_ci // == -x * 2^m mod 2^K + 1 1031cb0ef41Sopenharmony_ci // So the flow is the same as for m < K, but we invert the subtraction's 1041cb0ef41Sopenharmony_ci // operands. In order to avoid underflow, we virtually initialize the 1051cb0ef41Sopenharmony_ci // result to 2^K + 1: 1061cb0ef41Sopenharmony_ci // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ] 1071cb0ef41Sopenharmony_ci // result = [ 1][0000] .... .... [0000][0001] 1081cb0ef41Sopenharmony_ci // + [ iK ] .... [ iX ] 1091cb0ef41Sopenharmony_ci // - [iX-1] .... [ i0 ] 1101cb0ef41Sopenharmony_ci DCHECK(digit_shift >= K); 1111cb0ef41Sopenharmony_ci digit_shift -= K; 1121cb0ef41Sopenharmony_ci digit_t borrow = 0; 1131cb0ef41Sopenharmony_ci if (bits_shift == 0) { 1141cb0ef41Sopenharmony_ci digit_t carry = 1; 1151cb0ef41Sopenharmony_ci for (int i = 0; i < digit_shift; i++) { 1161cb0ef41Sopenharmony_ci result[i] = digit_add2(input[i + K - digit_shift], carry, &carry); 1171cb0ef41Sopenharmony_ci } 1181cb0ef41Sopenharmony_ci result[digit_shift] = digit_sub(input[K] + carry, input[0], &borrow); 1191cb0ef41Sopenharmony_ci for (int i = digit_shift + 1; i < K; i++) { 1201cb0ef41Sopenharmony_ci digit_t d = input[i - digit_shift]; 1211cb0ef41Sopenharmony_ci result[i] = digit_sub2(0, d, borrow, &borrow); 1221cb0ef41Sopenharmony_ci } 1231cb0ef41Sopenharmony_ci } else { 1241cb0ef41Sopenharmony_ci digit_t add_carry = 1; 1251cb0ef41Sopenharmony_ci digit_t input_carry = 1261cb0ef41Sopenharmony_ci input[K - digit_shift - 1] >> (kDigitBits - bits_shift); 1271cb0ef41Sopenharmony_ci for (int i = 0; i < digit_shift; i++) { 1281cb0ef41Sopenharmony_ci digit_t d = input[i + K - digit_shift]; 1291cb0ef41Sopenharmony_ci digit_t summand = (d << bits_shift) | input_carry; 1301cb0ef41Sopenharmony_ci result[i] = digit_add2(summand, add_carry, &add_carry); 1311cb0ef41Sopenharmony_ci input_carry = d >> (kDigitBits - bits_shift); 1321cb0ef41Sopenharmony_ci } 1331cb0ef41Sopenharmony_ci { 1341cb0ef41Sopenharmony_ci // result[digit_shift] = (add_carry + iK_part) - i0_part 1351cb0ef41Sopenharmony_ci digit_t d = input[K]; 1361cb0ef41Sopenharmony_ci digit_t iK_part = (d << bits_shift) | input_carry; 1371cb0ef41Sopenharmony_ci digit_t iK_carry = d >> (kDigitBits - bits_shift); 1381cb0ef41Sopenharmony_ci digit_t sum = digit_add2(add_carry, iK_part, &add_carry); 1391cb0ef41Sopenharmony_ci // {iK_carry} is less than a full digit, so we can merge {add_carry} 1401cb0ef41Sopenharmony_ci // into it without overflow. 1411cb0ef41Sopenharmony_ci iK_carry += add_carry; 1421cb0ef41Sopenharmony_ci d = input[0]; 1431cb0ef41Sopenharmony_ci digit_t i0_part = d << bits_shift; 1441cb0ef41Sopenharmony_ci result[digit_shift] = digit_sub(sum, i0_part, &borrow); 1451cb0ef41Sopenharmony_ci input_carry = d >> (kDigitBits - bits_shift); 1461cb0ef41Sopenharmony_ci if (digit_shift + 1 < K) { 1471cb0ef41Sopenharmony_ci d = input[1]; 1481cb0ef41Sopenharmony_ci digit_t subtrahend = (d << bits_shift) | input_carry; 1491cb0ef41Sopenharmony_ci result[digit_shift + 1] = 1501cb0ef41Sopenharmony_ci digit_sub2(iK_carry, subtrahend, borrow, &borrow); 1511cb0ef41Sopenharmony_ci input_carry = d >> (kDigitBits - bits_shift); 1521cb0ef41Sopenharmony_ci } 1531cb0ef41Sopenharmony_ci } 1541cb0ef41Sopenharmony_ci for (int i = digit_shift + 2; i < K; i++) { 1551cb0ef41Sopenharmony_ci digit_t d = input[i - digit_shift]; 1561cb0ef41Sopenharmony_ci digit_t subtrahend = (d << bits_shift) | input_carry; 1571cb0ef41Sopenharmony_ci result[i] = digit_sub2(0, subtrahend, borrow, &borrow); 1581cb0ef41Sopenharmony_ci input_carry = d >> (kDigitBits - bits_shift); 1591cb0ef41Sopenharmony_ci } 1601cb0ef41Sopenharmony_ci } 1611cb0ef41Sopenharmony_ci // The virtual 1 in result[K] should be eliminated by {borrow}. If there 1621cb0ef41Sopenharmony_ci // is no borrow, then the virtual initialization was too much. Subtract 1631cb0ef41Sopenharmony_ci // 2^K + 1. 1641cb0ef41Sopenharmony_ci result[K] = 0; 1651cb0ef41Sopenharmony_ci if (borrow != 1) { 1661cb0ef41Sopenharmony_ci borrow = 1; 1671cb0ef41Sopenharmony_ci for (int i = 0; i < K; i++) { 1681cb0ef41Sopenharmony_ci result[i] = digit_sub(result[i], borrow, &borrow); 1691cb0ef41Sopenharmony_ci if (borrow == 0) break; 1701cb0ef41Sopenharmony_ci } 1711cb0ef41Sopenharmony_ci if (borrow != 0) { 1721cb0ef41Sopenharmony_ci // The result must be 2^K. 1731cb0ef41Sopenharmony_ci for (int i = 0; i < K; i++) result[i] = 0; 1741cb0ef41Sopenharmony_ci result[K] = 1; 1751cb0ef41Sopenharmony_ci } 1761cb0ef41Sopenharmony_ci } 1771cb0ef41Sopenharmony_ci} 1781cb0ef41Sopenharmony_ci 1791cb0ef41Sopenharmony_ci// Sets {result} := {input} * 2^{power_of_two} mod 2^{K} + 1. 1801cb0ef41Sopenharmony_ci// This function is highly relevant for overall performance. 1811cb0ef41Sopenharmony_civoid ShiftModFn(digit_t* result, const digit_t* input, int power_of_two, int K, 1821cb0ef41Sopenharmony_ci int zero_above = 0x7FFFFFFF) { 1831cb0ef41Sopenharmony_ci // The modulo-reduction amounts to a subtraction, which we combine 1841cb0ef41Sopenharmony_ci // with the shift as follows: 1851cb0ef41Sopenharmony_ci // input = [ iK ][iK-1] .... .... [ i1 ][ i0 ] 1861cb0ef41Sopenharmony_ci // result = [iX-1] .... [ i0 ] <---------- shift by {power_of_two} 1871cb0ef41Sopenharmony_ci // - [ iK ] .... [ iX ] 1881cb0ef41Sopenharmony_ci // where "X" is the index "K - digit_shift". 1891cb0ef41Sopenharmony_ci int digit_shift = power_of_two / kDigitBits; 1901cb0ef41Sopenharmony_ci int bits_shift = power_of_two % kDigitBits; 1911cb0ef41Sopenharmony_ci // By an analogous construction to the "digit_shift >= K" case, 1921cb0ef41Sopenharmony_ci // it turns out that: 1931cb0ef41Sopenharmony_ci // x * 2^{2K+m} == x * 2^m mod 2^K + 1. 1941cb0ef41Sopenharmony_ci while (digit_shift >= 2 * K) digit_shift -= 2 * K; // Faster than '%'! 1951cb0ef41Sopenharmony_ci if (digit_shift >= K) { 1961cb0ef41Sopenharmony_ci return ShiftModFn_Large(result, input, digit_shift, bits_shift, K); 1971cb0ef41Sopenharmony_ci } 1981cb0ef41Sopenharmony_ci digit_t borrow = 0; 1991cb0ef41Sopenharmony_ci if (bits_shift == 0) { 2001cb0ef41Sopenharmony_ci // We do a single pass over {input}, starting by copying digits [i1] to 2011cb0ef41Sopenharmony_ci // [iX-1] to result indices digit_shift+1 to K-1. 2021cb0ef41Sopenharmony_ci int i = 1; 2031cb0ef41Sopenharmony_ci // Read input digits unless we know they are zero. 2041cb0ef41Sopenharmony_ci int cap = std::min(K - digit_shift, zero_above); 2051cb0ef41Sopenharmony_ci for (; i < cap; i++) { 2061cb0ef41Sopenharmony_ci result[i + digit_shift] = input[i]; 2071cb0ef41Sopenharmony_ci } 2081cb0ef41Sopenharmony_ci // Any remaining work can hard-code the knowledge that input[i] == 0. 2091cb0ef41Sopenharmony_ci for (; i < K - digit_shift; i++) { 2101cb0ef41Sopenharmony_ci DCHECK(input[i] == 0); 2111cb0ef41Sopenharmony_ci result[i + digit_shift] = 0; 2121cb0ef41Sopenharmony_ci } 2131cb0ef41Sopenharmony_ci // Second phase: subtract input digits [iX] to [iK] from (virtually) zero- 2141cb0ef41Sopenharmony_ci // initialized result indices 0 to digit_shift-1. 2151cb0ef41Sopenharmony_ci cap = std::min(K, zero_above); 2161cb0ef41Sopenharmony_ci for (; i < cap; i++) { 2171cb0ef41Sopenharmony_ci digit_t d = input[i]; 2181cb0ef41Sopenharmony_ci result[i - K + digit_shift] = digit_sub2(0, d, borrow, &borrow); 2191cb0ef41Sopenharmony_ci } 2201cb0ef41Sopenharmony_ci // Any remaining work can hard-code the knowledge that input[i] == 0. 2211cb0ef41Sopenharmony_ci for (; i < K; i++) { 2221cb0ef41Sopenharmony_ci DCHECK(input[i] == 0); 2231cb0ef41Sopenharmony_ci result[i - K + digit_shift] = digit_sub(0, borrow, &borrow); 2241cb0ef41Sopenharmony_ci } 2251cb0ef41Sopenharmony_ci // Last step: subtract [iK] from [i0] and store at result index digit_shift. 2261cb0ef41Sopenharmony_ci result[digit_shift] = digit_sub2(input[0], input[K], borrow, &borrow); 2271cb0ef41Sopenharmony_ci } else { 2281cb0ef41Sopenharmony_ci // Same flow as before, but taking bits_shift != 0 into account. 2291cb0ef41Sopenharmony_ci // First phase: result indices digit_shift+1 to K. 2301cb0ef41Sopenharmony_ci digit_t carry = 0; 2311cb0ef41Sopenharmony_ci int i = 0; 2321cb0ef41Sopenharmony_ci // Read input digits unless we know they are zero. 2331cb0ef41Sopenharmony_ci int cap = std::min(K - digit_shift, zero_above); 2341cb0ef41Sopenharmony_ci for (; i < cap; i++) { 2351cb0ef41Sopenharmony_ci digit_t d = input[i]; 2361cb0ef41Sopenharmony_ci result[i + digit_shift] = (d << bits_shift) | carry; 2371cb0ef41Sopenharmony_ci carry = d >> (kDigitBits - bits_shift); 2381cb0ef41Sopenharmony_ci } 2391cb0ef41Sopenharmony_ci // Any remaining work can hard-code the knowledge that input[i] == 0. 2401cb0ef41Sopenharmony_ci for (; i < K - digit_shift; i++) { 2411cb0ef41Sopenharmony_ci DCHECK(input[i] == 0); 2421cb0ef41Sopenharmony_ci result[i + digit_shift] = carry; 2431cb0ef41Sopenharmony_ci carry = 0; 2441cb0ef41Sopenharmony_ci } 2451cb0ef41Sopenharmony_ci // Second phase: result indices 0 to digit_shift - 1. 2461cb0ef41Sopenharmony_ci cap = std::min(K, zero_above); 2471cb0ef41Sopenharmony_ci for (; i < cap; i++) { 2481cb0ef41Sopenharmony_ci digit_t d = input[i]; 2491cb0ef41Sopenharmony_ci result[i - K + digit_shift] = 2501cb0ef41Sopenharmony_ci digit_sub2(0, (d << bits_shift) | carry, borrow, &borrow); 2511cb0ef41Sopenharmony_ci carry = d >> (kDigitBits - bits_shift); 2521cb0ef41Sopenharmony_ci } 2531cb0ef41Sopenharmony_ci // Any remaining work can hard-code the knowledge that input[i] == 0. 2541cb0ef41Sopenharmony_ci if (i < K) { 2551cb0ef41Sopenharmony_ci DCHECK(input[i] == 0); 2561cb0ef41Sopenharmony_ci result[i - K + digit_shift] = digit_sub2(0, carry, borrow, &borrow); 2571cb0ef41Sopenharmony_ci carry = 0; 2581cb0ef41Sopenharmony_ci i++; 2591cb0ef41Sopenharmony_ci } 2601cb0ef41Sopenharmony_ci for (; i < K; i++) { 2611cb0ef41Sopenharmony_ci DCHECK(input[i] == 0); 2621cb0ef41Sopenharmony_ci result[i - K + digit_shift] = digit_sub(0, borrow, &borrow); 2631cb0ef41Sopenharmony_ci } 2641cb0ef41Sopenharmony_ci // Last step: compute result[digit_shift]. 2651cb0ef41Sopenharmony_ci digit_t d = input[K]; 2661cb0ef41Sopenharmony_ci result[digit_shift] = digit_sub2( 2671cb0ef41Sopenharmony_ci result[digit_shift], (d << bits_shift) | carry, borrow, &borrow); 2681cb0ef41Sopenharmony_ci // No carry left. 2691cb0ef41Sopenharmony_ci DCHECK((d >> (kDigitBits - bits_shift)) == 0); 2701cb0ef41Sopenharmony_ci } 2711cb0ef41Sopenharmony_ci result[K] = 0; 2721cb0ef41Sopenharmony_ci for (int i = digit_shift + 1; i <= K && borrow > 0; i++) { 2731cb0ef41Sopenharmony_ci result[i] = digit_sub(result[i], borrow, &borrow); 2741cb0ef41Sopenharmony_ci } 2751cb0ef41Sopenharmony_ci if (borrow > 0) { 2761cb0ef41Sopenharmony_ci // Underflow means we subtracted too much. Add 2^K + 1. 2771cb0ef41Sopenharmony_ci digit_t carry = 1; 2781cb0ef41Sopenharmony_ci for (int i = 0; i <= K; i++) { 2791cb0ef41Sopenharmony_ci result[i] = digit_add2(result[i], carry, &carry); 2801cb0ef41Sopenharmony_ci if (carry == 0) break; 2811cb0ef41Sopenharmony_ci } 2821cb0ef41Sopenharmony_ci result[K] = digit_add2(result[K], 1, &carry); 2831cb0ef41Sopenharmony_ci } 2841cb0ef41Sopenharmony_ci} 2851cb0ef41Sopenharmony_ci 2861cb0ef41Sopenharmony_ci//////////////////////////////////////////////////////////////////////////////// 2871cb0ef41Sopenharmony_ci// Part 2: FFT-based multiplication is very sensitive to appropriate choice 2881cb0ef41Sopenharmony_ci// of parameters. The following functions choose the parameters that the 2891cb0ef41Sopenharmony_ci// subsequent actual computation will use. This is partially based on formal 2901cb0ef41Sopenharmony_ci// constraints and partially on experimentally-determined heuristics. 2911cb0ef41Sopenharmony_ci 2921cb0ef41Sopenharmony_cistruct Parameters { 2931cb0ef41Sopenharmony_ci // We never use the default values, but skipping zero-initialization 2941cb0ef41Sopenharmony_ci // of these fields saddens and confuses MSan. 2951cb0ef41Sopenharmony_ci int m{0}; 2961cb0ef41Sopenharmony_ci int K{0}; 2971cb0ef41Sopenharmony_ci int n{0}; 2981cb0ef41Sopenharmony_ci int s{0}; 2991cb0ef41Sopenharmony_ci int r{0}; 3001cb0ef41Sopenharmony_ci}; 3011cb0ef41Sopenharmony_ci 3021cb0ef41Sopenharmony_ci// Computes parameters for the main calculation, given a bit length {N} and 3031cb0ef41Sopenharmony_ci// an {m}. See the paper for details. 3041cb0ef41Sopenharmony_civoid ComputeParameters(int N, int m, Parameters* params) { 3051cb0ef41Sopenharmony_ci N *= kDigitBits; 3061cb0ef41Sopenharmony_ci int n = 1 << m; // 2^m 3071cb0ef41Sopenharmony_ci int nhalf = n >> 1; 3081cb0ef41Sopenharmony_ci int s = (N + n - 1) >> m; // ceil(N/n) 3091cb0ef41Sopenharmony_ci s = RoundUp(s, kDigitBits); 3101cb0ef41Sopenharmony_ci int K = m + 2 * s + 1; // K must be at least this big... 3111cb0ef41Sopenharmony_ci K = RoundUp(K, nhalf); // ...and a multiple of n/2. 3121cb0ef41Sopenharmony_ci int r = K >> (m - 1); // Which multiple? 3131cb0ef41Sopenharmony_ci 3141cb0ef41Sopenharmony_ci // We want recursive calls to make progress, so force K to be a multiple 3151cb0ef41Sopenharmony_ci // of 8 if it's above the recursion threshold. Otherwise, K must be a 3161cb0ef41Sopenharmony_ci // multiple of kDigitBits. 3171cb0ef41Sopenharmony_ci const int threshold = (K + 1 >= kFftInnerThreshold * kDigitBits) 3181cb0ef41Sopenharmony_ci ? 3 + kLog2DigitBits 3191cb0ef41Sopenharmony_ci : kLog2DigitBits; 3201cb0ef41Sopenharmony_ci int K_tz = CountTrailingZeros(K); 3211cb0ef41Sopenharmony_ci while (K_tz < threshold) { 3221cb0ef41Sopenharmony_ci K += (1 << K_tz); 3231cb0ef41Sopenharmony_ci r = K >> (m - 1); 3241cb0ef41Sopenharmony_ci K_tz = CountTrailingZeros(K); 3251cb0ef41Sopenharmony_ci } 3261cb0ef41Sopenharmony_ci 3271cb0ef41Sopenharmony_ci DCHECK(K % kDigitBits == 0); 3281cb0ef41Sopenharmony_ci DCHECK(s % kDigitBits == 0); 3291cb0ef41Sopenharmony_ci params->K = K / kDigitBits; 3301cb0ef41Sopenharmony_ci params->s = s / kDigitBits; 3311cb0ef41Sopenharmony_ci params->n = n; 3321cb0ef41Sopenharmony_ci params->r = r; 3331cb0ef41Sopenharmony_ci} 3341cb0ef41Sopenharmony_ci 3351cb0ef41Sopenharmony_ci// Computes parameters for recursive invocations ("inner layer"). 3361cb0ef41Sopenharmony_civoid ComputeParameters_Inner(int N, Parameters* params) { 3371cb0ef41Sopenharmony_ci int max_m = CountTrailingZeros(N); 3381cb0ef41Sopenharmony_ci int N_bits = BitLength(N); 3391cb0ef41Sopenharmony_ci int m = N_bits - 4; // Don't let s get too small. 3401cb0ef41Sopenharmony_ci m = std::min(max_m, m); 3411cb0ef41Sopenharmony_ci N *= kDigitBits; 3421cb0ef41Sopenharmony_ci int n = 1 << m; // 2^m 3431cb0ef41Sopenharmony_ci // We can't round up s in the inner layer, because N = n*s is fixed. 3441cb0ef41Sopenharmony_ci int s = N >> m; 3451cb0ef41Sopenharmony_ci DCHECK(N == s * n); 3461cb0ef41Sopenharmony_ci int K = m + 2 * s + 1; // K must be at least this big... 3471cb0ef41Sopenharmony_ci K = RoundUp(K, n); // ...and a multiple of n and kDigitBits. 3481cb0ef41Sopenharmony_ci K = RoundUp(K, kDigitBits); 3491cb0ef41Sopenharmony_ci params->r = K >> m; // Which multiple? 3501cb0ef41Sopenharmony_ci DCHECK(K % kDigitBits == 0); 3511cb0ef41Sopenharmony_ci DCHECK(s % kDigitBits == 0); 3521cb0ef41Sopenharmony_ci params->K = K / kDigitBits; 3531cb0ef41Sopenharmony_ci params->s = s / kDigitBits; 3541cb0ef41Sopenharmony_ci params->n = n; 3551cb0ef41Sopenharmony_ci params->m = m; 3561cb0ef41Sopenharmony_ci} 3571cb0ef41Sopenharmony_ci 3581cb0ef41Sopenharmony_ciint PredictInnerK(int N) { 3591cb0ef41Sopenharmony_ci Parameters params; 3601cb0ef41Sopenharmony_ci ComputeParameters_Inner(N, ¶ms); 3611cb0ef41Sopenharmony_ci return params.K; 3621cb0ef41Sopenharmony_ci} 3631cb0ef41Sopenharmony_ci 3641cb0ef41Sopenharmony_ci// Applies heuristics to decide whether {m} should be decremented, by looking 3651cb0ef41Sopenharmony_ci// at what would happen to {K} and {s} if {m} was decremented. 3661cb0ef41Sopenharmony_cibool ShouldDecrementM(const Parameters& current, const Parameters& next, 3671cb0ef41Sopenharmony_ci const Parameters& after_next) { 3681cb0ef41Sopenharmony_ci // K == 64 seems to work particularly well. 3691cb0ef41Sopenharmony_ci if (current.K == 64 && next.K >= 112) return false; 3701cb0ef41Sopenharmony_ci // Small values for s are never efficient. 3711cb0ef41Sopenharmony_ci if (current.s < 6) return true; 3721cb0ef41Sopenharmony_ci // The time is roughly determined by K * n. When we decrement m, then 3731cb0ef41Sopenharmony_ci // n always halves, and K usually gets bigger, by up to 2x. 3741cb0ef41Sopenharmony_ci // For not-quite-so-small s, look at how much bigger K would get: if 3751cb0ef41Sopenharmony_ci // the K increase is small enough, making n smaller is worth it. 3761cb0ef41Sopenharmony_ci // Empirically, it's most meaningful to look at the K *after* next. 3771cb0ef41Sopenharmony_ci // The specific threshold values have been chosen by running many 3781cb0ef41Sopenharmony_ci // benchmarks on inputs of many sizes, and manually selecting thresholds 3791cb0ef41Sopenharmony_ci // that seemed to produce good results. 3801cb0ef41Sopenharmony_ci double factor = static_cast<double>(after_next.K) / current.K; 3811cb0ef41Sopenharmony_ci if ((current.s == 6 && factor < 3.85) || // -- 3821cb0ef41Sopenharmony_ci (current.s == 7 && factor < 3.73) || // -- 3831cb0ef41Sopenharmony_ci (current.s == 8 && factor < 3.55) || // -- 3841cb0ef41Sopenharmony_ci (current.s == 9 && factor < 3.50) || // -- 3851cb0ef41Sopenharmony_ci factor < 3.4) { 3861cb0ef41Sopenharmony_ci return true; 3871cb0ef41Sopenharmony_ci } 3881cb0ef41Sopenharmony_ci // If K is just below the recursion threshold, make sure we do recurse, 3891cb0ef41Sopenharmony_ci // unless doing so would be particularly inefficient (large inner_K). 3901cb0ef41Sopenharmony_ci // If K is just above the recursion threshold, doubling it often makes 3911cb0ef41Sopenharmony_ci // the inner call more efficient. 3921cb0ef41Sopenharmony_ci if (current.K >= 160 && current.K < 250 && PredictInnerK(next.K) < 28) { 3931cb0ef41Sopenharmony_ci return true; 3941cb0ef41Sopenharmony_ci } 3951cb0ef41Sopenharmony_ci // If we found no reason to decrement, keep m as large as possible. 3961cb0ef41Sopenharmony_ci return false; 3971cb0ef41Sopenharmony_ci} 3981cb0ef41Sopenharmony_ci 3991cb0ef41Sopenharmony_ci// Decides what parameters to use for a given input bit length {N}. 4001cb0ef41Sopenharmony_ci// Returns the chosen m. 4011cb0ef41Sopenharmony_ciint GetParameters(int N, Parameters* params) { 4021cb0ef41Sopenharmony_ci int N_bits = BitLength(N); 4031cb0ef41Sopenharmony_ci int max_m = N_bits - 3; // Larger m make s too small. 4041cb0ef41Sopenharmony_ci max_m = std::max(kLog2DigitBits, max_m); // Smaller m break the logic below. 4051cb0ef41Sopenharmony_ci int m = max_m; 4061cb0ef41Sopenharmony_ci Parameters current; 4071cb0ef41Sopenharmony_ci ComputeParameters(N, m, ¤t); 4081cb0ef41Sopenharmony_ci Parameters next; 4091cb0ef41Sopenharmony_ci ComputeParameters(N, m - 1, &next); 4101cb0ef41Sopenharmony_ci while (m > 2) { 4111cb0ef41Sopenharmony_ci Parameters after_next; 4121cb0ef41Sopenharmony_ci ComputeParameters(N, m - 2, &after_next); 4131cb0ef41Sopenharmony_ci if (ShouldDecrementM(current, next, after_next)) { 4141cb0ef41Sopenharmony_ci m--; 4151cb0ef41Sopenharmony_ci current = next; 4161cb0ef41Sopenharmony_ci next = after_next; 4171cb0ef41Sopenharmony_ci } else { 4181cb0ef41Sopenharmony_ci break; 4191cb0ef41Sopenharmony_ci } 4201cb0ef41Sopenharmony_ci } 4211cb0ef41Sopenharmony_ci *params = current; 4221cb0ef41Sopenharmony_ci return m; 4231cb0ef41Sopenharmony_ci} 4241cb0ef41Sopenharmony_ci 4251cb0ef41Sopenharmony_ci//////////////////////////////////////////////////////////////////////////////// 4261cb0ef41Sopenharmony_ci// Part 3: Fast Fourier Transformation. 4271cb0ef41Sopenharmony_ci 4281cb0ef41Sopenharmony_ciclass FFTContainer { 4291cb0ef41Sopenharmony_ci public: 4301cb0ef41Sopenharmony_ci // {n} is the number of chunks, whose length is {K}+1. 4311cb0ef41Sopenharmony_ci // {K} determines F_n = 2^(K * kDigitBits) + 1. 4321cb0ef41Sopenharmony_ci FFTContainer(int n, int K, ProcessorImpl* processor) 4331cb0ef41Sopenharmony_ci : n_(n), K_(K), length_(K + 1), processor_(processor) { 4341cb0ef41Sopenharmony_ci storage_ = new digit_t[length_ * n_]; 4351cb0ef41Sopenharmony_ci part_ = new digit_t*[n_]; 4361cb0ef41Sopenharmony_ci digit_t* ptr = storage_; 4371cb0ef41Sopenharmony_ci for (int i = 0; i < n; i++, ptr += length_) { 4381cb0ef41Sopenharmony_ci part_[i] = ptr; 4391cb0ef41Sopenharmony_ci } 4401cb0ef41Sopenharmony_ci temp_ = new digit_t[length_ * 2]; 4411cb0ef41Sopenharmony_ci } 4421cb0ef41Sopenharmony_ci FFTContainer() = delete; 4431cb0ef41Sopenharmony_ci FFTContainer(const FFTContainer&) = delete; 4441cb0ef41Sopenharmony_ci FFTContainer& operator=(const FFTContainer&) = delete; 4451cb0ef41Sopenharmony_ci 4461cb0ef41Sopenharmony_ci ~FFTContainer() { 4471cb0ef41Sopenharmony_ci delete[] storage_; 4481cb0ef41Sopenharmony_ci delete[] part_; 4491cb0ef41Sopenharmony_ci delete[] temp_; 4501cb0ef41Sopenharmony_ci } 4511cb0ef41Sopenharmony_ci 4521cb0ef41Sopenharmony_ci void Start_Default(Digits X, int chunk_size, int theta, int omega); 4531cb0ef41Sopenharmony_ci void Start(Digits X, int chunk_size, int theta, int omega); 4541cb0ef41Sopenharmony_ci 4551cb0ef41Sopenharmony_ci void NormalizeAndRecombine(int omega, int m, RWDigits Z, int chunk_size); 4561cb0ef41Sopenharmony_ci void CounterWeightAndRecombine(int theta, int m, RWDigits Z, int chunk_size); 4571cb0ef41Sopenharmony_ci 4581cb0ef41Sopenharmony_ci void FFT_ReturnShuffledThreadsafe(int start, int len, int omega, 4591cb0ef41Sopenharmony_ci digit_t* temp); 4601cb0ef41Sopenharmony_ci void FFT_Recurse(int start, int half, int omega, digit_t* temp); 4611cb0ef41Sopenharmony_ci 4621cb0ef41Sopenharmony_ci void BackwardFFT(int start, int len, int omega); 4631cb0ef41Sopenharmony_ci void BackwardFFT_Threadsafe(int start, int len, int omega, digit_t* temp); 4641cb0ef41Sopenharmony_ci 4651cb0ef41Sopenharmony_ci void PointwiseMultiply(const FFTContainer& other); 4661cb0ef41Sopenharmony_ci void DoPointwiseMultiplication(const FFTContainer& other, int start, int end, 4671cb0ef41Sopenharmony_ci digit_t* temp); 4681cb0ef41Sopenharmony_ci 4691cb0ef41Sopenharmony_ci int length() const { return length_; } 4701cb0ef41Sopenharmony_ci 4711cb0ef41Sopenharmony_ci private: 4721cb0ef41Sopenharmony_ci const int n_; // Number of parts. 4731cb0ef41Sopenharmony_ci const int K_; // Always length_ - 1. 4741cb0ef41Sopenharmony_ci const int length_; // Length of each part, in digits. 4751cb0ef41Sopenharmony_ci ProcessorImpl* processor_; 4761cb0ef41Sopenharmony_ci digit_t* storage_; // Combined storage of all parts. 4771cb0ef41Sopenharmony_ci digit_t** part_; // Pointers to each part. 4781cb0ef41Sopenharmony_ci digit_t* temp_; // Temporary storage with size 2 * length_. 4791cb0ef41Sopenharmony_ci}; 4801cb0ef41Sopenharmony_ci 4811cb0ef41Sopenharmony_ciinline void CopyAndZeroExtend(digit_t* dst, const digit_t* src, 4821cb0ef41Sopenharmony_ci int digits_to_copy, size_t total_bytes) { 4831cb0ef41Sopenharmony_ci size_t bytes_to_copy = digits_to_copy * sizeof(digit_t); 4841cb0ef41Sopenharmony_ci memcpy(dst, src, bytes_to_copy); 4851cb0ef41Sopenharmony_ci memset(dst + digits_to_copy, 0, total_bytes - bytes_to_copy); 4861cb0ef41Sopenharmony_ci} 4871cb0ef41Sopenharmony_ci 4881cb0ef41Sopenharmony_ci// Reads {X} into the FFTContainer's internal storage, dividing it into chunks 4891cb0ef41Sopenharmony_ci// while doing so; then performs the forward FFT. 4901cb0ef41Sopenharmony_civoid FFTContainer::Start_Default(Digits X, int chunk_size, int theta, 4911cb0ef41Sopenharmony_ci int omega) { 4921cb0ef41Sopenharmony_ci int len = X.len(); 4931cb0ef41Sopenharmony_ci const digit_t* pointer = X.digits(); 4941cb0ef41Sopenharmony_ci const size_t part_length_in_bytes = length_ * sizeof(digit_t); 4951cb0ef41Sopenharmony_ci int current_theta = 0; 4961cb0ef41Sopenharmony_ci int i = 0; 4971cb0ef41Sopenharmony_ci for (; i < n_ && len > 0; i++, current_theta += theta) { 4981cb0ef41Sopenharmony_ci chunk_size = std::min(chunk_size, len); 4991cb0ef41Sopenharmony_ci // For invocations via MultiplyFFT_Inner, X.len() == n_ * chunk_size + 1, 5001cb0ef41Sopenharmony_ci // because the outer layer's "K" is passed as the inner layer's "N". 5011cb0ef41Sopenharmony_ci // Since X is (mod Fn)-normalized on the outer layer, there is the rare 5021cb0ef41Sopenharmony_ci // corner case where X[n_ * chunk_size] == 1. Detect that case, and handle 5031cb0ef41Sopenharmony_ci // the extra bit as part of the last chunk; we always have the space. 5041cb0ef41Sopenharmony_ci if (i == n_ - 1 && len == chunk_size + 1) { 5051cb0ef41Sopenharmony_ci DCHECK(X[n_ * chunk_size] <= 1); 5061cb0ef41Sopenharmony_ci DCHECK(length_ >= chunk_size + 1); 5071cb0ef41Sopenharmony_ci chunk_size++; 5081cb0ef41Sopenharmony_ci } 5091cb0ef41Sopenharmony_ci if (current_theta != 0) { 5101cb0ef41Sopenharmony_ci // Multiply with theta^i, and reduce modulo 2^K + 1. 5111cb0ef41Sopenharmony_ci // We pass theta as a shift amount; it really means 2^theta. 5121cb0ef41Sopenharmony_ci CopyAndZeroExtend(temp_, pointer, chunk_size, part_length_in_bytes); 5131cb0ef41Sopenharmony_ci ShiftModFn(part_[i], temp_, current_theta, K_, chunk_size); 5141cb0ef41Sopenharmony_ci } else { 5151cb0ef41Sopenharmony_ci CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes); 5161cb0ef41Sopenharmony_ci } 5171cb0ef41Sopenharmony_ci pointer += chunk_size; 5181cb0ef41Sopenharmony_ci len -= chunk_size; 5191cb0ef41Sopenharmony_ci } 5201cb0ef41Sopenharmony_ci DCHECK(len == 0); 5211cb0ef41Sopenharmony_ci for (; i < n_; i++) { 5221cb0ef41Sopenharmony_ci memset(part_[i], 0, part_length_in_bytes); 5231cb0ef41Sopenharmony_ci } 5241cb0ef41Sopenharmony_ci FFT_ReturnShuffledThreadsafe(0, n_, omega, temp_); 5251cb0ef41Sopenharmony_ci} 5261cb0ef41Sopenharmony_ci 5271cb0ef41Sopenharmony_ci// This version of Start is optimized for the case where ~half of the 5281cb0ef41Sopenharmony_ci// container will be filled with padding zeros. 5291cb0ef41Sopenharmony_civoid FFTContainer::Start(Digits X, int chunk_size, int theta, int omega) { 5301cb0ef41Sopenharmony_ci int len = X.len(); 5311cb0ef41Sopenharmony_ci if (len > n_ * chunk_size / 2) { 5321cb0ef41Sopenharmony_ci return Start_Default(X, chunk_size, theta, omega); 5331cb0ef41Sopenharmony_ci } 5341cb0ef41Sopenharmony_ci DCHECK(theta == 0); 5351cb0ef41Sopenharmony_ci const digit_t* pointer = X.digits(); 5361cb0ef41Sopenharmony_ci const size_t part_length_in_bytes = length_ * sizeof(digit_t); 5371cb0ef41Sopenharmony_ci int nhalf = n_ / 2; 5381cb0ef41Sopenharmony_ci // Unrolled first iteration. 5391cb0ef41Sopenharmony_ci CopyAndZeroExtend(part_[0], pointer, chunk_size, part_length_in_bytes); 5401cb0ef41Sopenharmony_ci CopyAndZeroExtend(part_[nhalf], pointer, chunk_size, part_length_in_bytes); 5411cb0ef41Sopenharmony_ci pointer += chunk_size; 5421cb0ef41Sopenharmony_ci len -= chunk_size; 5431cb0ef41Sopenharmony_ci int i = 1; 5441cb0ef41Sopenharmony_ci for (; i < nhalf && len > 0; i++) { 5451cb0ef41Sopenharmony_ci chunk_size = std::min(chunk_size, len); 5461cb0ef41Sopenharmony_ci CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes); 5471cb0ef41Sopenharmony_ci int w = omega * i; 5481cb0ef41Sopenharmony_ci ShiftModFn(part_[i + nhalf], part_[i], w, K_, chunk_size); 5491cb0ef41Sopenharmony_ci pointer += chunk_size; 5501cb0ef41Sopenharmony_ci len -= chunk_size; 5511cb0ef41Sopenharmony_ci } 5521cb0ef41Sopenharmony_ci for (; i < nhalf; i++) { 5531cb0ef41Sopenharmony_ci memset(part_[i], 0, part_length_in_bytes); 5541cb0ef41Sopenharmony_ci memset(part_[i + nhalf], 0, part_length_in_bytes); 5551cb0ef41Sopenharmony_ci } 5561cb0ef41Sopenharmony_ci FFT_Recurse(0, nhalf, omega, temp_); 5571cb0ef41Sopenharmony_ci} 5581cb0ef41Sopenharmony_ci 5591cb0ef41Sopenharmony_ci// Forward transformation. 5601cb0ef41Sopenharmony_ci// We use the "DIF" aka "decimation in frequency" transform, because it 5611cb0ef41Sopenharmony_ci// leaves the result in "bit reversed" order, which is precisely what we 5621cb0ef41Sopenharmony_ci// need as input for the "DIT" aka "decimation in time" backwards transform. 5631cb0ef41Sopenharmony_civoid FFTContainer::FFT_ReturnShuffledThreadsafe(int start, int len, int omega, 5641cb0ef41Sopenharmony_ci digit_t* temp) { 5651cb0ef41Sopenharmony_ci DCHECK((len & 1) == 0); // {len} must be even. 5661cb0ef41Sopenharmony_ci int half = len / 2; 5671cb0ef41Sopenharmony_ci SumDiff(part_[start], part_[start + half], part_[start], part_[start + half], 5681cb0ef41Sopenharmony_ci length_); 5691cb0ef41Sopenharmony_ci for (int k = 1; k < half; k++) { 5701cb0ef41Sopenharmony_ci SumDiff(part_[start + k], temp, part_[start + k], part_[start + half + k], 5711cb0ef41Sopenharmony_ci length_); 5721cb0ef41Sopenharmony_ci int w = omega * k; 5731cb0ef41Sopenharmony_ci ShiftModFn(part_[start + half + k], temp, w, K_); 5741cb0ef41Sopenharmony_ci } 5751cb0ef41Sopenharmony_ci FFT_Recurse(start, half, omega, temp); 5761cb0ef41Sopenharmony_ci} 5771cb0ef41Sopenharmony_ci 5781cb0ef41Sopenharmony_ci// Recursive step of the above, factored out for additional callers. 5791cb0ef41Sopenharmony_civoid FFTContainer::FFT_Recurse(int start, int half, int omega, digit_t* temp) { 5801cb0ef41Sopenharmony_ci if (half > 1) { 5811cb0ef41Sopenharmony_ci FFT_ReturnShuffledThreadsafe(start, half, 2 * omega, temp); 5821cb0ef41Sopenharmony_ci FFT_ReturnShuffledThreadsafe(start + half, half, 2 * omega, temp); 5831cb0ef41Sopenharmony_ci } 5841cb0ef41Sopenharmony_ci} 5851cb0ef41Sopenharmony_ci 5861cb0ef41Sopenharmony_ci// Backward transformation. 5871cb0ef41Sopenharmony_ci// We use the "DIT" aka "decimation in time" transform here, because it 5881cb0ef41Sopenharmony_ci// turns bit-reversed input into normally sorted output. 5891cb0ef41Sopenharmony_civoid FFTContainer::BackwardFFT(int start, int len, int omega) { 5901cb0ef41Sopenharmony_ci BackwardFFT_Threadsafe(start, len, omega, temp_); 5911cb0ef41Sopenharmony_ci} 5921cb0ef41Sopenharmony_ci 5931cb0ef41Sopenharmony_civoid FFTContainer::BackwardFFT_Threadsafe(int start, int len, int omega, 5941cb0ef41Sopenharmony_ci digit_t* temp) { 5951cb0ef41Sopenharmony_ci DCHECK((len & 1) == 0); // {len} must be even. 5961cb0ef41Sopenharmony_ci int half = len / 2; 5971cb0ef41Sopenharmony_ci // Don't recurse for half == 2, as PointwiseMultiply already performed 5981cb0ef41Sopenharmony_ci // the first level of the backwards FFT. 5991cb0ef41Sopenharmony_ci if (half > 2) { 6001cb0ef41Sopenharmony_ci BackwardFFT_Threadsafe(start, half, 2 * omega, temp); 6011cb0ef41Sopenharmony_ci BackwardFFT_Threadsafe(start + half, half, 2 * omega, temp); 6021cb0ef41Sopenharmony_ci } 6031cb0ef41Sopenharmony_ci SumDiff(part_[start], part_[start + half], part_[start], part_[start + half], 6041cb0ef41Sopenharmony_ci length_); 6051cb0ef41Sopenharmony_ci for (int k = 1; k < half; k++) { 6061cb0ef41Sopenharmony_ci int w = omega * (len - k); 6071cb0ef41Sopenharmony_ci ShiftModFn(temp, part_[start + half + k], w, K_); 6081cb0ef41Sopenharmony_ci SumDiff(part_[start + k], part_[start + half + k], part_[start + k], temp, 6091cb0ef41Sopenharmony_ci length_); 6101cb0ef41Sopenharmony_ci } 6111cb0ef41Sopenharmony_ci} 6121cb0ef41Sopenharmony_ci 6131cb0ef41Sopenharmony_ci// Recombines the result's parts into {Z}, after backwards FFT. 6141cb0ef41Sopenharmony_civoid FFTContainer::NormalizeAndRecombine(int omega, int m, RWDigits Z, 6151cb0ef41Sopenharmony_ci int chunk_size) { 6161cb0ef41Sopenharmony_ci Z.Clear(); 6171cb0ef41Sopenharmony_ci int z_index = 0; 6181cb0ef41Sopenharmony_ci const int shift = n_ * omega - m; 6191cb0ef41Sopenharmony_ci for (int i = 0; i < n_; i++, z_index += chunk_size) { 6201cb0ef41Sopenharmony_ci digit_t* part = part_[i]; 6211cb0ef41Sopenharmony_ci ShiftModFn(temp_, part, shift, K_); 6221cb0ef41Sopenharmony_ci digit_t carry = 0; 6231cb0ef41Sopenharmony_ci int zi = z_index; 6241cb0ef41Sopenharmony_ci int j = 0; 6251cb0ef41Sopenharmony_ci for (; j < length_ && zi < Z.len(); j++, zi++) { 6261cb0ef41Sopenharmony_ci Z[zi] = digit_add3(Z[zi], temp_[j], carry, &carry); 6271cb0ef41Sopenharmony_ci } 6281cb0ef41Sopenharmony_ci for (; j < length_; j++) { 6291cb0ef41Sopenharmony_ci DCHECK(temp_[j] == 0); 6301cb0ef41Sopenharmony_ci } 6311cb0ef41Sopenharmony_ci if (carry != 0) { 6321cb0ef41Sopenharmony_ci DCHECK(zi < Z.len()); 6331cb0ef41Sopenharmony_ci Z[zi] = carry; 6341cb0ef41Sopenharmony_ci } 6351cb0ef41Sopenharmony_ci } 6361cb0ef41Sopenharmony_ci} 6371cb0ef41Sopenharmony_ci 6381cb0ef41Sopenharmony_ci// Helper function for {CounterWeightAndRecombine} below. 6391cb0ef41Sopenharmony_cibool ShouldBeNegative(const digit_t* x, int xlen, digit_t threshold, int s) { 6401cb0ef41Sopenharmony_ci if (x[2 * s] >= threshold) return true; 6411cb0ef41Sopenharmony_ci for (int i = 2 * s + 1; i < xlen; i++) { 6421cb0ef41Sopenharmony_ci if (x[i] > 0) return true; 6431cb0ef41Sopenharmony_ci } 6441cb0ef41Sopenharmony_ci return false; 6451cb0ef41Sopenharmony_ci} 6461cb0ef41Sopenharmony_ci 6471cb0ef41Sopenharmony_ci// Same as {NormalizeAndRecombine} above, but for the needs of the recursive 6481cb0ef41Sopenharmony_ci// invocation ("inner layer") of FFT multiplication, where an additional 6491cb0ef41Sopenharmony_ci// counter-weighting step is required. 6501cb0ef41Sopenharmony_civoid FFTContainer::CounterWeightAndRecombine(int theta, int m, RWDigits Z, 6511cb0ef41Sopenharmony_ci int s) { 6521cb0ef41Sopenharmony_ci Z.Clear(); 6531cb0ef41Sopenharmony_ci int z_index = 0; 6541cb0ef41Sopenharmony_ci for (int k = 0; k < n_; k++, z_index += s) { 6551cb0ef41Sopenharmony_ci int shift = -theta * k - m; 6561cb0ef41Sopenharmony_ci if (shift < 0) shift += 2 * n_ * theta; 6571cb0ef41Sopenharmony_ci DCHECK(shift >= 0); 6581cb0ef41Sopenharmony_ci digit_t* input = part_[k]; 6591cb0ef41Sopenharmony_ci ShiftModFn(temp_, input, shift, K_); 6601cb0ef41Sopenharmony_ci int remaining_z = Z.len() - z_index; 6611cb0ef41Sopenharmony_ci if (ShouldBeNegative(temp_, length_, k + 1, s)) { 6621cb0ef41Sopenharmony_ci // Subtract F_n from input before adding to result. We use the following 6631cb0ef41Sopenharmony_ci // transformation (knowing that X < F_n): 6641cb0ef41Sopenharmony_ci // Z + (X - F_n) == Z - (F_n - X) 6651cb0ef41Sopenharmony_ci digit_t borrow_z = 0; 6661cb0ef41Sopenharmony_ci digit_t borrow_Fn = 0; 6671cb0ef41Sopenharmony_ci { 6681cb0ef41Sopenharmony_ci // i == 0: 6691cb0ef41Sopenharmony_ci digit_t d = digit_sub(1, temp_[0], &borrow_Fn); 6701cb0ef41Sopenharmony_ci Z[z_index] = digit_sub(Z[z_index], d, &borrow_z); 6711cb0ef41Sopenharmony_ci } 6721cb0ef41Sopenharmony_ci int i = 1; 6731cb0ef41Sopenharmony_ci for (; i < K_ && i < remaining_z; i++) { 6741cb0ef41Sopenharmony_ci digit_t d = digit_sub2(0, temp_[i], borrow_Fn, &borrow_Fn); 6751cb0ef41Sopenharmony_ci Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z); 6761cb0ef41Sopenharmony_ci } 6771cb0ef41Sopenharmony_ci DCHECK(i == K_ && K_ == length_ - 1); 6781cb0ef41Sopenharmony_ci for (; i < length_ && i < remaining_z; i++) { 6791cb0ef41Sopenharmony_ci digit_t d = digit_sub2(1, temp_[i], borrow_Fn, &borrow_Fn); 6801cb0ef41Sopenharmony_ci Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z); 6811cb0ef41Sopenharmony_ci } 6821cb0ef41Sopenharmony_ci DCHECK(borrow_Fn == 0); 6831cb0ef41Sopenharmony_ci for (; borrow_z > 0 && i < remaining_z; i++) { 6841cb0ef41Sopenharmony_ci Z[z_index + i] = digit_sub(Z[z_index + i], borrow_z, &borrow_z); 6851cb0ef41Sopenharmony_ci } 6861cb0ef41Sopenharmony_ci } else { 6871cb0ef41Sopenharmony_ci digit_t carry = 0; 6881cb0ef41Sopenharmony_ci int i = 0; 6891cb0ef41Sopenharmony_ci for (; i < length_ && i < remaining_z; i++) { 6901cb0ef41Sopenharmony_ci Z[z_index + i] = digit_add3(Z[z_index + i], temp_[i], carry, &carry); 6911cb0ef41Sopenharmony_ci } 6921cb0ef41Sopenharmony_ci for (; i < length_; i++) { 6931cb0ef41Sopenharmony_ci DCHECK(temp_[i] == 0); 6941cb0ef41Sopenharmony_ci } 6951cb0ef41Sopenharmony_ci for (; carry > 0 && i < remaining_z; i++) { 6961cb0ef41Sopenharmony_ci Z[z_index + i] = digit_add2(Z[z_index + i], carry, &carry); 6971cb0ef41Sopenharmony_ci } 6981cb0ef41Sopenharmony_ci // {carry} might be != 0 here if Z was negative before. That's fine. 6991cb0ef41Sopenharmony_ci } 7001cb0ef41Sopenharmony_ci } 7011cb0ef41Sopenharmony_ci} 7021cb0ef41Sopenharmony_ci 7031cb0ef41Sopenharmony_ci// Main FFT function for recursive invocations ("inner layer"). 7041cb0ef41Sopenharmony_civoid MultiplyFFT_Inner(RWDigits Z, Digits X, Digits Y, const Parameters& params, 7051cb0ef41Sopenharmony_ci ProcessorImpl* processor) { 7061cb0ef41Sopenharmony_ci int omega = 2 * params.r; // really: 2^(2r) 7071cb0ef41Sopenharmony_ci int theta = params.r; // really: 2^r 7081cb0ef41Sopenharmony_ci 7091cb0ef41Sopenharmony_ci FFTContainer a(params.n, params.K, processor); 7101cb0ef41Sopenharmony_ci a.Start_Default(X, params.s, theta, omega); 7111cb0ef41Sopenharmony_ci FFTContainer b(params.n, params.K, processor); 7121cb0ef41Sopenharmony_ci b.Start_Default(Y, params.s, theta, omega); 7131cb0ef41Sopenharmony_ci 7141cb0ef41Sopenharmony_ci a.PointwiseMultiply(b); 7151cb0ef41Sopenharmony_ci if (processor->should_terminate()) return; 7161cb0ef41Sopenharmony_ci 7171cb0ef41Sopenharmony_ci FFTContainer& c = a; 7181cb0ef41Sopenharmony_ci c.BackwardFFT(0, params.n, omega); 7191cb0ef41Sopenharmony_ci 7201cb0ef41Sopenharmony_ci c.CounterWeightAndRecombine(theta, params.m, Z, params.s); 7211cb0ef41Sopenharmony_ci} 7221cb0ef41Sopenharmony_ci 7231cb0ef41Sopenharmony_ci// Actual implementation of pointwise multiplications. 7241cb0ef41Sopenharmony_civoid FFTContainer::DoPointwiseMultiplication(const FFTContainer& other, 7251cb0ef41Sopenharmony_ci int start, int end, 7261cb0ef41Sopenharmony_ci digit_t* temp) { 7271cb0ef41Sopenharmony_ci // The (K_ & 3) != 0 condition makes sure that the inner FFT gets 7281cb0ef41Sopenharmony_ci // to split the work into at least 4 chunks. 7291cb0ef41Sopenharmony_ci bool use_fft = length_ >= kFftInnerThreshold && (K_ & 3) == 0; 7301cb0ef41Sopenharmony_ci Parameters params; 7311cb0ef41Sopenharmony_ci if (use_fft) ComputeParameters_Inner(K_, ¶ms); 7321cb0ef41Sopenharmony_ci RWDigits result(temp, 2 * length_); 7331cb0ef41Sopenharmony_ci for (int i = start; i < end; i++) { 7341cb0ef41Sopenharmony_ci Digits A(part_[i], length_); 7351cb0ef41Sopenharmony_ci Digits B(other.part_[i], length_); 7361cb0ef41Sopenharmony_ci if (use_fft) { 7371cb0ef41Sopenharmony_ci MultiplyFFT_Inner(result, A, B, params, processor_); 7381cb0ef41Sopenharmony_ci } else { 7391cb0ef41Sopenharmony_ci processor_->Multiply(result, A, B); 7401cb0ef41Sopenharmony_ci } 7411cb0ef41Sopenharmony_ci if (processor_->should_terminate()) return; 7421cb0ef41Sopenharmony_ci ModFnDoubleWidth(part_[i], result.digits(), length_); 7431cb0ef41Sopenharmony_ci // To improve cache friendliness, we perform the first level of the 7441cb0ef41Sopenharmony_ci // backwards FFT here. 7451cb0ef41Sopenharmony_ci if ((i & 1) == 1) { 7461cb0ef41Sopenharmony_ci SumDiff(part_[i - 1], part_[i], part_[i - 1], part_[i], length_); 7471cb0ef41Sopenharmony_ci } 7481cb0ef41Sopenharmony_ci } 7491cb0ef41Sopenharmony_ci} 7501cb0ef41Sopenharmony_ci 7511cb0ef41Sopenharmony_ci// Convenient entry point for pointwise multiplications. 7521cb0ef41Sopenharmony_civoid FFTContainer::PointwiseMultiply(const FFTContainer& other) { 7531cb0ef41Sopenharmony_ci DCHECK(n_ == other.n_); 7541cb0ef41Sopenharmony_ci DoPointwiseMultiplication(other, 0, n_, temp_); 7551cb0ef41Sopenharmony_ci} 7561cb0ef41Sopenharmony_ci 7571cb0ef41Sopenharmony_ci} // namespace 7581cb0ef41Sopenharmony_ci 7591cb0ef41Sopenharmony_ci//////////////////////////////////////////////////////////////////////////////// 7601cb0ef41Sopenharmony_ci// Part 4: Tying everything together into a multiplication algorithm. 7611cb0ef41Sopenharmony_ci 7621cb0ef41Sopenharmony_ci// TODO(jkummerow): Consider doing a "Mersenne transform" and CRT reconstruction 7631cb0ef41Sopenharmony_ci// of the final result. Might yield a few percent of perf improvement. 7641cb0ef41Sopenharmony_ci 7651cb0ef41Sopenharmony_ci// TODO(jkummerow): Consider implementing the "sqrt(2) trick". 7661cb0ef41Sopenharmony_ci// Gaudry/Kruppa/Zimmerman report that it saved them around 10%. 7671cb0ef41Sopenharmony_ci 7681cb0ef41Sopenharmony_civoid ProcessorImpl::MultiplyFFT(RWDigits Z, Digits X, Digits Y) { 7691cb0ef41Sopenharmony_ci Parameters params; 7701cb0ef41Sopenharmony_ci int m = GetParameters(X.len() + Y.len(), ¶ms); 7711cb0ef41Sopenharmony_ci int omega = params.r; // really: 2^r 7721cb0ef41Sopenharmony_ci 7731cb0ef41Sopenharmony_ci FFTContainer a(params.n, params.K, this); 7741cb0ef41Sopenharmony_ci a.Start(X, params.s, 0, omega); 7751cb0ef41Sopenharmony_ci if (X == Y) { 7761cb0ef41Sopenharmony_ci // Squaring. 7771cb0ef41Sopenharmony_ci a.PointwiseMultiply(a); 7781cb0ef41Sopenharmony_ci } else { 7791cb0ef41Sopenharmony_ci FFTContainer b(params.n, params.K, this); 7801cb0ef41Sopenharmony_ci b.Start(Y, params.s, 0, omega); 7811cb0ef41Sopenharmony_ci a.PointwiseMultiply(b); 7821cb0ef41Sopenharmony_ci } 7831cb0ef41Sopenharmony_ci if (should_terminate()) return; 7841cb0ef41Sopenharmony_ci 7851cb0ef41Sopenharmony_ci a.BackwardFFT(0, params.n, omega); 7861cb0ef41Sopenharmony_ci a.NormalizeAndRecombine(omega, m, Z, params.s); 7871cb0ef41Sopenharmony_ci} 7881cb0ef41Sopenharmony_ci 7891cb0ef41Sopenharmony_ci} // namespace bigint 7901cb0ef41Sopenharmony_ci} // namespace v8 791