11cb0ef41Sopenharmony_ci// Copyright 2021 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
51cb0ef41Sopenharmony_ci// FFT-based multiplication, due to Schönhage and Strassen.
61cb0ef41Sopenharmony_ci// This implementation mostly follows the description given in:
71cb0ef41Sopenharmony_ci// Christoph Lüders: Fast Multiplication of Large Integers,
81cb0ef41Sopenharmony_ci// http://arxiv.org/abs/1503.04955
91cb0ef41Sopenharmony_ci
101cb0ef41Sopenharmony_ci#include "src/bigint/bigint-internal.h"
111cb0ef41Sopenharmony_ci#include "src/bigint/digit-arithmetic.h"
121cb0ef41Sopenharmony_ci#include "src/bigint/util.h"
131cb0ef41Sopenharmony_ci#include "src/bigint/vector-arithmetic.h"
141cb0ef41Sopenharmony_ci
151cb0ef41Sopenharmony_cinamespace v8 {
161cb0ef41Sopenharmony_cinamespace bigint {
171cb0ef41Sopenharmony_ci
181cb0ef41Sopenharmony_cinamespace {
191cb0ef41Sopenharmony_ci
201cb0ef41Sopenharmony_ci////////////////////////////////////////////////////////////////////////////////
211cb0ef41Sopenharmony_ci// Part 1: Functions for "mod F_n" arithmetic.
221cb0ef41Sopenharmony_ci// F_n is of the shape 2^K + 1, and for convenience we use K to count the
231cb0ef41Sopenharmony_ci// number of digits rather than the number of bits, so F_n (or K) are implicit
241cb0ef41Sopenharmony_ci// and deduced from the length {len} of the digits array.
251cb0ef41Sopenharmony_ci
261cb0ef41Sopenharmony_ci// Helper function for {ModFn} below.
271cb0ef41Sopenharmony_civoid ModFn_Helper(digit_t* x, int len, signed_digit_t high) {
281cb0ef41Sopenharmony_ci  if (high > 0) {
291cb0ef41Sopenharmony_ci    digit_t borrow = high;
301cb0ef41Sopenharmony_ci    x[len - 1] = 0;
311cb0ef41Sopenharmony_ci    for (int i = 0; i < len; i++) {
321cb0ef41Sopenharmony_ci      x[i] = digit_sub(x[i], borrow, &borrow);
331cb0ef41Sopenharmony_ci      if (borrow == 0) break;
341cb0ef41Sopenharmony_ci    }
351cb0ef41Sopenharmony_ci  } else {
361cb0ef41Sopenharmony_ci    digit_t carry = -high;
371cb0ef41Sopenharmony_ci    x[len - 1] = 0;
381cb0ef41Sopenharmony_ci    for (int i = 0; i < len; i++) {
391cb0ef41Sopenharmony_ci      x[i] = digit_add2(x[i], carry, &carry);
401cb0ef41Sopenharmony_ci      if (carry == 0) break;
411cb0ef41Sopenharmony_ci    }
421cb0ef41Sopenharmony_ci  }
431cb0ef41Sopenharmony_ci}
441cb0ef41Sopenharmony_ci
451cb0ef41Sopenharmony_ci// {x} := {x} mod F_n, assuming that {x} is "slightly" larger than F_n (e.g.
461cb0ef41Sopenharmony_ci// after addition of two numbers that were mod-F_n-normalized before).
471cb0ef41Sopenharmony_civoid ModFn(digit_t* x, int len) {
481cb0ef41Sopenharmony_ci  int K = len - 1;
491cb0ef41Sopenharmony_ci  signed_digit_t high = x[K];
501cb0ef41Sopenharmony_ci  if (high == 0) return;
511cb0ef41Sopenharmony_ci  ModFn_Helper(x, len, high);
521cb0ef41Sopenharmony_ci  high = x[K];
531cb0ef41Sopenharmony_ci  if (high == 0) return;
541cb0ef41Sopenharmony_ci  DCHECK(high == 1 || high == -1);
551cb0ef41Sopenharmony_ci  ModFn_Helper(x, len, high);
561cb0ef41Sopenharmony_ci  high = x[K];
571cb0ef41Sopenharmony_ci  if (high == -1) ModFn_Helper(x, len, high);
581cb0ef41Sopenharmony_ci}
591cb0ef41Sopenharmony_ci
601cb0ef41Sopenharmony_ci// {dest} := {src} mod F_n, assuming that {src} is about twice as long as F_n
611cb0ef41Sopenharmony_ci// (e.g. after multiplication of two numbers that were mod-F_n-normalized
621cb0ef41Sopenharmony_ci// before).
631cb0ef41Sopenharmony_ci// {len} is length of {dest}; {src} is twice as long.
641cb0ef41Sopenharmony_civoid ModFnDoubleWidth(digit_t* dest, const digit_t* src, int len) {
651cb0ef41Sopenharmony_ci  int K = len - 1;
661cb0ef41Sopenharmony_ci  digit_t borrow = 0;
671cb0ef41Sopenharmony_ci  for (int i = 0; i < K; i++) {
681cb0ef41Sopenharmony_ci    dest[i] = digit_sub2(src[i], src[i + K], borrow, &borrow);
691cb0ef41Sopenharmony_ci  }
701cb0ef41Sopenharmony_ci  dest[K] = digit_sub2(0, src[2 * K], borrow, &borrow);
711cb0ef41Sopenharmony_ci  // {borrow} may be non-zero here, that's OK as {ModFn} will take care of it.
721cb0ef41Sopenharmony_ci  ModFn(dest, len);
731cb0ef41Sopenharmony_ci}
741cb0ef41Sopenharmony_ci
751cb0ef41Sopenharmony_ci// Sets {sum} := {a} + {b} and {diff} := {a} - {b}, which is more efficient
761cb0ef41Sopenharmony_ci// than computing sum and difference separately. Applies "mod F_n" normalization
771cb0ef41Sopenharmony_ci// to both results.
781cb0ef41Sopenharmony_civoid SumDiff(digit_t* sum, digit_t* diff, const digit_t* a, const digit_t* b,
791cb0ef41Sopenharmony_ci             int len) {
801cb0ef41Sopenharmony_ci  digit_t carry = 0;
811cb0ef41Sopenharmony_ci  digit_t borrow = 0;
821cb0ef41Sopenharmony_ci  for (int i = 0; i < len; i++) {
831cb0ef41Sopenharmony_ci    // Read both values first, because inputs and outputs can overlap.
841cb0ef41Sopenharmony_ci    digit_t ai = a[i];
851cb0ef41Sopenharmony_ci    digit_t bi = b[i];
861cb0ef41Sopenharmony_ci    sum[i] = digit_add3(ai, bi, carry, &carry);
871cb0ef41Sopenharmony_ci    diff[i] = digit_sub2(ai, bi, borrow, &borrow);
881cb0ef41Sopenharmony_ci  }
891cb0ef41Sopenharmony_ci  ModFn(sum, len);
901cb0ef41Sopenharmony_ci  ModFn(diff, len);
911cb0ef41Sopenharmony_ci}
921cb0ef41Sopenharmony_ci
931cb0ef41Sopenharmony_ci// {result} := ({input} << shift) mod F_n, where shift >= K.
941cb0ef41Sopenharmony_civoid ShiftModFn_Large(digit_t* result, const digit_t* input, int digit_shift,
951cb0ef41Sopenharmony_ci                      int bits_shift, int K) {
961cb0ef41Sopenharmony_ci  // If {digit_shift} is greater than K, we use the following transformation
971cb0ef41Sopenharmony_ci  // (where, since everything is mod 2^K + 1, we are allowed to add or
981cb0ef41Sopenharmony_ci  // subtract any multiple of 2^K + 1 at any time):
991cb0ef41Sopenharmony_ci  //      x * 2^{K+m}   mod 2^K + 1
1001cb0ef41Sopenharmony_ci  //   == x * 2^K * 2^m - (2^K + 1)*(x * 2^m)   mod 2^K + 1
1011cb0ef41Sopenharmony_ci  //   == x * 2^K * 2^m - x * 2^K * 2^m - x * 2^m   mod 2^K + 1
1021cb0ef41Sopenharmony_ci  //   == -x * 2^m   mod 2^K + 1
1031cb0ef41Sopenharmony_ci  // So the flow is the same as for m < K, but we invert the subtraction's
1041cb0ef41Sopenharmony_ci  // operands. In order to avoid underflow, we virtually initialize the
1051cb0ef41Sopenharmony_ci  // result to 2^K + 1:
1061cb0ef41Sopenharmony_ci  //   input  =  [ iK ][iK-1] ....  .... [ i1 ][ i0 ]
1071cb0ef41Sopenharmony_ci  //   result =  [   1][0000] ....  .... [0000][0001]
1081cb0ef41Sopenharmony_ci  //            +                  [ iK ] .... [ iX ]
1091cb0ef41Sopenharmony_ci  //            -      [iX-1] .... [ i0 ]
1101cb0ef41Sopenharmony_ci  DCHECK(digit_shift >= K);
1111cb0ef41Sopenharmony_ci  digit_shift -= K;
1121cb0ef41Sopenharmony_ci  digit_t borrow = 0;
1131cb0ef41Sopenharmony_ci  if (bits_shift == 0) {
1141cb0ef41Sopenharmony_ci    digit_t carry = 1;
1151cb0ef41Sopenharmony_ci    for (int i = 0; i < digit_shift; i++) {
1161cb0ef41Sopenharmony_ci      result[i] = digit_add2(input[i + K - digit_shift], carry, &carry);
1171cb0ef41Sopenharmony_ci    }
1181cb0ef41Sopenharmony_ci    result[digit_shift] = digit_sub(input[K] + carry, input[0], &borrow);
1191cb0ef41Sopenharmony_ci    for (int i = digit_shift + 1; i < K; i++) {
1201cb0ef41Sopenharmony_ci      digit_t d = input[i - digit_shift];
1211cb0ef41Sopenharmony_ci      result[i] = digit_sub2(0, d, borrow, &borrow);
1221cb0ef41Sopenharmony_ci    }
1231cb0ef41Sopenharmony_ci  } else {
1241cb0ef41Sopenharmony_ci    digit_t add_carry = 1;
1251cb0ef41Sopenharmony_ci    digit_t input_carry =
1261cb0ef41Sopenharmony_ci        input[K - digit_shift - 1] >> (kDigitBits - bits_shift);
1271cb0ef41Sopenharmony_ci    for (int i = 0; i < digit_shift; i++) {
1281cb0ef41Sopenharmony_ci      digit_t d = input[i + K - digit_shift];
1291cb0ef41Sopenharmony_ci      digit_t summand = (d << bits_shift) | input_carry;
1301cb0ef41Sopenharmony_ci      result[i] = digit_add2(summand, add_carry, &add_carry);
1311cb0ef41Sopenharmony_ci      input_carry = d >> (kDigitBits - bits_shift);
1321cb0ef41Sopenharmony_ci    }
1331cb0ef41Sopenharmony_ci    {
1341cb0ef41Sopenharmony_ci      // result[digit_shift] = (add_carry + iK_part) - i0_part
1351cb0ef41Sopenharmony_ci      digit_t d = input[K];
1361cb0ef41Sopenharmony_ci      digit_t iK_part = (d << bits_shift) | input_carry;
1371cb0ef41Sopenharmony_ci      digit_t iK_carry = d >> (kDigitBits - bits_shift);
1381cb0ef41Sopenharmony_ci      digit_t sum = digit_add2(add_carry, iK_part, &add_carry);
1391cb0ef41Sopenharmony_ci      // {iK_carry} is less than a full digit, so we can merge {add_carry}
1401cb0ef41Sopenharmony_ci      // into it without overflow.
1411cb0ef41Sopenharmony_ci      iK_carry += add_carry;
1421cb0ef41Sopenharmony_ci      d = input[0];
1431cb0ef41Sopenharmony_ci      digit_t i0_part = d << bits_shift;
1441cb0ef41Sopenharmony_ci      result[digit_shift] = digit_sub(sum, i0_part, &borrow);
1451cb0ef41Sopenharmony_ci      input_carry = d >> (kDigitBits - bits_shift);
1461cb0ef41Sopenharmony_ci      if (digit_shift + 1 < K) {
1471cb0ef41Sopenharmony_ci        d = input[1];
1481cb0ef41Sopenharmony_ci        digit_t subtrahend = (d << bits_shift) | input_carry;
1491cb0ef41Sopenharmony_ci        result[digit_shift + 1] =
1501cb0ef41Sopenharmony_ci            digit_sub2(iK_carry, subtrahend, borrow, &borrow);
1511cb0ef41Sopenharmony_ci        input_carry = d >> (kDigitBits - bits_shift);
1521cb0ef41Sopenharmony_ci      }
1531cb0ef41Sopenharmony_ci    }
1541cb0ef41Sopenharmony_ci    for (int i = digit_shift + 2; i < K; i++) {
1551cb0ef41Sopenharmony_ci      digit_t d = input[i - digit_shift];
1561cb0ef41Sopenharmony_ci      digit_t subtrahend = (d << bits_shift) | input_carry;
1571cb0ef41Sopenharmony_ci      result[i] = digit_sub2(0, subtrahend, borrow, &borrow);
1581cb0ef41Sopenharmony_ci      input_carry = d >> (kDigitBits - bits_shift);
1591cb0ef41Sopenharmony_ci    }
1601cb0ef41Sopenharmony_ci  }
1611cb0ef41Sopenharmony_ci  // The virtual 1 in result[K] should be eliminated by {borrow}. If there
1621cb0ef41Sopenharmony_ci  // is no borrow, then the virtual initialization was too much. Subtract
1631cb0ef41Sopenharmony_ci  // 2^K + 1.
1641cb0ef41Sopenharmony_ci  result[K] = 0;
1651cb0ef41Sopenharmony_ci  if (borrow != 1) {
1661cb0ef41Sopenharmony_ci    borrow = 1;
1671cb0ef41Sopenharmony_ci    for (int i = 0; i < K; i++) {
1681cb0ef41Sopenharmony_ci      result[i] = digit_sub(result[i], borrow, &borrow);
1691cb0ef41Sopenharmony_ci      if (borrow == 0) break;
1701cb0ef41Sopenharmony_ci    }
1711cb0ef41Sopenharmony_ci    if (borrow != 0) {
1721cb0ef41Sopenharmony_ci      // The result must be 2^K.
1731cb0ef41Sopenharmony_ci      for (int i = 0; i < K; i++) result[i] = 0;
1741cb0ef41Sopenharmony_ci      result[K] = 1;
1751cb0ef41Sopenharmony_ci    }
1761cb0ef41Sopenharmony_ci  }
1771cb0ef41Sopenharmony_ci}
1781cb0ef41Sopenharmony_ci
1791cb0ef41Sopenharmony_ci// Sets {result} := {input} * 2^{power_of_two} mod 2^{K} + 1.
1801cb0ef41Sopenharmony_ci// This function is highly relevant for overall performance.
1811cb0ef41Sopenharmony_civoid ShiftModFn(digit_t* result, const digit_t* input, int power_of_two, int K,
1821cb0ef41Sopenharmony_ci                int zero_above = 0x7FFFFFFF) {
1831cb0ef41Sopenharmony_ci  // The modulo-reduction amounts to a subtraction, which we combine
1841cb0ef41Sopenharmony_ci  // with the shift as follows:
1851cb0ef41Sopenharmony_ci  //   input  =  [ iK ][iK-1] ....  .... [ i1 ][ i0 ]
1861cb0ef41Sopenharmony_ci  //   result =        [iX-1] .... [ i0 ] <---------- shift by {power_of_two}
1871cb0ef41Sopenharmony_ci  //            -                  [ iK ] .... [ iX ]
1881cb0ef41Sopenharmony_ci  // where "X" is the index "K - digit_shift".
1891cb0ef41Sopenharmony_ci  int digit_shift = power_of_two / kDigitBits;
1901cb0ef41Sopenharmony_ci  int bits_shift = power_of_two % kDigitBits;
1911cb0ef41Sopenharmony_ci  // By an analogous construction to the "digit_shift >= K" case,
1921cb0ef41Sopenharmony_ci  // it turns out that:
1931cb0ef41Sopenharmony_ci  //    x * 2^{2K+m} == x * 2^m   mod 2^K + 1.
1941cb0ef41Sopenharmony_ci  while (digit_shift >= 2 * K) digit_shift -= 2 * K;  // Faster than '%'!
1951cb0ef41Sopenharmony_ci  if (digit_shift >= K) {
1961cb0ef41Sopenharmony_ci    return ShiftModFn_Large(result, input, digit_shift, bits_shift, K);
1971cb0ef41Sopenharmony_ci  }
1981cb0ef41Sopenharmony_ci  digit_t borrow = 0;
1991cb0ef41Sopenharmony_ci  if (bits_shift == 0) {
2001cb0ef41Sopenharmony_ci    // We do a single pass over {input}, starting by copying digits [i1] to
2011cb0ef41Sopenharmony_ci    // [iX-1] to result indices digit_shift+1 to K-1.
2021cb0ef41Sopenharmony_ci    int i = 1;
2031cb0ef41Sopenharmony_ci    // Read input digits unless we know they are zero.
2041cb0ef41Sopenharmony_ci    int cap = std::min(K - digit_shift, zero_above);
2051cb0ef41Sopenharmony_ci    for (; i < cap; i++) {
2061cb0ef41Sopenharmony_ci      result[i + digit_shift] = input[i];
2071cb0ef41Sopenharmony_ci    }
2081cb0ef41Sopenharmony_ci    // Any remaining work can hard-code the knowledge that input[i] == 0.
2091cb0ef41Sopenharmony_ci    for (; i < K - digit_shift; i++) {
2101cb0ef41Sopenharmony_ci      DCHECK(input[i] == 0);
2111cb0ef41Sopenharmony_ci      result[i + digit_shift] = 0;
2121cb0ef41Sopenharmony_ci    }
2131cb0ef41Sopenharmony_ci    // Second phase: subtract input digits [iX] to [iK] from (virtually) zero-
2141cb0ef41Sopenharmony_ci    // initialized result indices 0 to digit_shift-1.
2151cb0ef41Sopenharmony_ci    cap = std::min(K, zero_above);
2161cb0ef41Sopenharmony_ci    for (; i < cap; i++) {
2171cb0ef41Sopenharmony_ci      digit_t d = input[i];
2181cb0ef41Sopenharmony_ci      result[i - K + digit_shift] = digit_sub2(0, d, borrow, &borrow);
2191cb0ef41Sopenharmony_ci    }
2201cb0ef41Sopenharmony_ci    // Any remaining work can hard-code the knowledge that input[i] == 0.
2211cb0ef41Sopenharmony_ci    for (; i < K; i++) {
2221cb0ef41Sopenharmony_ci      DCHECK(input[i] == 0);
2231cb0ef41Sopenharmony_ci      result[i - K + digit_shift] = digit_sub(0, borrow, &borrow);
2241cb0ef41Sopenharmony_ci    }
2251cb0ef41Sopenharmony_ci    // Last step: subtract [iK] from [i0] and store at result index digit_shift.
2261cb0ef41Sopenharmony_ci    result[digit_shift] = digit_sub2(input[0], input[K], borrow, &borrow);
2271cb0ef41Sopenharmony_ci  } else {
2281cb0ef41Sopenharmony_ci    // Same flow as before, but taking bits_shift != 0 into account.
2291cb0ef41Sopenharmony_ci    // First phase: result indices digit_shift+1 to K.
2301cb0ef41Sopenharmony_ci    digit_t carry = 0;
2311cb0ef41Sopenharmony_ci    int i = 0;
2321cb0ef41Sopenharmony_ci    // Read input digits unless we know they are zero.
2331cb0ef41Sopenharmony_ci    int cap = std::min(K - digit_shift, zero_above);
2341cb0ef41Sopenharmony_ci    for (; i < cap; i++) {
2351cb0ef41Sopenharmony_ci      digit_t d = input[i];
2361cb0ef41Sopenharmony_ci      result[i + digit_shift] = (d << bits_shift) | carry;
2371cb0ef41Sopenharmony_ci      carry = d >> (kDigitBits - bits_shift);
2381cb0ef41Sopenharmony_ci    }
2391cb0ef41Sopenharmony_ci    // Any remaining work can hard-code the knowledge that input[i] == 0.
2401cb0ef41Sopenharmony_ci    for (; i < K - digit_shift; i++) {
2411cb0ef41Sopenharmony_ci      DCHECK(input[i] == 0);
2421cb0ef41Sopenharmony_ci      result[i + digit_shift] = carry;
2431cb0ef41Sopenharmony_ci      carry = 0;
2441cb0ef41Sopenharmony_ci    }
2451cb0ef41Sopenharmony_ci    // Second phase: result indices 0 to digit_shift - 1.
2461cb0ef41Sopenharmony_ci    cap = std::min(K, zero_above);
2471cb0ef41Sopenharmony_ci    for (; i < cap; i++) {
2481cb0ef41Sopenharmony_ci      digit_t d = input[i];
2491cb0ef41Sopenharmony_ci      result[i - K + digit_shift] =
2501cb0ef41Sopenharmony_ci          digit_sub2(0, (d << bits_shift) | carry, borrow, &borrow);
2511cb0ef41Sopenharmony_ci      carry = d >> (kDigitBits - bits_shift);
2521cb0ef41Sopenharmony_ci    }
2531cb0ef41Sopenharmony_ci    // Any remaining work can hard-code the knowledge that input[i] == 0.
2541cb0ef41Sopenharmony_ci    if (i < K) {
2551cb0ef41Sopenharmony_ci      DCHECK(input[i] == 0);
2561cb0ef41Sopenharmony_ci      result[i - K + digit_shift] = digit_sub2(0, carry, borrow, &borrow);
2571cb0ef41Sopenharmony_ci      carry = 0;
2581cb0ef41Sopenharmony_ci      i++;
2591cb0ef41Sopenharmony_ci    }
2601cb0ef41Sopenharmony_ci    for (; i < K; i++) {
2611cb0ef41Sopenharmony_ci      DCHECK(input[i] == 0);
2621cb0ef41Sopenharmony_ci      result[i - K + digit_shift] = digit_sub(0, borrow, &borrow);
2631cb0ef41Sopenharmony_ci    }
2641cb0ef41Sopenharmony_ci    // Last step: compute result[digit_shift].
2651cb0ef41Sopenharmony_ci    digit_t d = input[K];
2661cb0ef41Sopenharmony_ci    result[digit_shift] = digit_sub2(
2671cb0ef41Sopenharmony_ci        result[digit_shift], (d << bits_shift) | carry, borrow, &borrow);
2681cb0ef41Sopenharmony_ci    // No carry left.
2691cb0ef41Sopenharmony_ci    DCHECK((d >> (kDigitBits - bits_shift)) == 0);
2701cb0ef41Sopenharmony_ci  }
2711cb0ef41Sopenharmony_ci  result[K] = 0;
2721cb0ef41Sopenharmony_ci  for (int i = digit_shift + 1; i <= K && borrow > 0; i++) {
2731cb0ef41Sopenharmony_ci    result[i] = digit_sub(result[i], borrow, &borrow);
2741cb0ef41Sopenharmony_ci  }
2751cb0ef41Sopenharmony_ci  if (borrow > 0) {
2761cb0ef41Sopenharmony_ci    // Underflow means we subtracted too much. Add 2^K + 1.
2771cb0ef41Sopenharmony_ci    digit_t carry = 1;
2781cb0ef41Sopenharmony_ci    for (int i = 0; i <= K; i++) {
2791cb0ef41Sopenharmony_ci      result[i] = digit_add2(result[i], carry, &carry);
2801cb0ef41Sopenharmony_ci      if (carry == 0) break;
2811cb0ef41Sopenharmony_ci    }
2821cb0ef41Sopenharmony_ci    result[K] = digit_add2(result[K], 1, &carry);
2831cb0ef41Sopenharmony_ci  }
2841cb0ef41Sopenharmony_ci}
2851cb0ef41Sopenharmony_ci
2861cb0ef41Sopenharmony_ci////////////////////////////////////////////////////////////////////////////////
2871cb0ef41Sopenharmony_ci// Part 2: FFT-based multiplication is very sensitive to appropriate choice
2881cb0ef41Sopenharmony_ci// of parameters. The following functions choose the parameters that the
2891cb0ef41Sopenharmony_ci// subsequent actual computation will use. This is partially based on formal
2901cb0ef41Sopenharmony_ci// constraints and partially on experimentally-determined heuristics.
2911cb0ef41Sopenharmony_ci
2921cb0ef41Sopenharmony_cistruct Parameters {
2931cb0ef41Sopenharmony_ci  // We never use the default values, but skipping zero-initialization
2941cb0ef41Sopenharmony_ci  // of these fields saddens and confuses MSan.
2951cb0ef41Sopenharmony_ci  int m{0};
2961cb0ef41Sopenharmony_ci  int K{0};
2971cb0ef41Sopenharmony_ci  int n{0};
2981cb0ef41Sopenharmony_ci  int s{0};
2991cb0ef41Sopenharmony_ci  int r{0};
3001cb0ef41Sopenharmony_ci};
3011cb0ef41Sopenharmony_ci
3021cb0ef41Sopenharmony_ci// Computes parameters for the main calculation, given a bit length {N} and
3031cb0ef41Sopenharmony_ci// an {m}. See the paper for details.
3041cb0ef41Sopenharmony_civoid ComputeParameters(int N, int m, Parameters* params) {
3051cb0ef41Sopenharmony_ci  N *= kDigitBits;
3061cb0ef41Sopenharmony_ci  int n = 1 << m;  // 2^m
3071cb0ef41Sopenharmony_ci  int nhalf = n >> 1;
3081cb0ef41Sopenharmony_ci  int s = (N + n - 1) >> m;  // ceil(N/n)
3091cb0ef41Sopenharmony_ci  s = RoundUp(s, kDigitBits);
3101cb0ef41Sopenharmony_ci  int K = m + 2 * s + 1;  // K must be at least this big...
3111cb0ef41Sopenharmony_ci  K = RoundUp(K, nhalf);  // ...and a multiple of n/2.
3121cb0ef41Sopenharmony_ci  int r = K >> (m - 1);   // Which multiple?
3131cb0ef41Sopenharmony_ci
3141cb0ef41Sopenharmony_ci  // We want recursive calls to make progress, so force K to be a multiple
3151cb0ef41Sopenharmony_ci  // of 8 if it's above the recursion threshold. Otherwise, K must be a
3161cb0ef41Sopenharmony_ci  // multiple of kDigitBits.
3171cb0ef41Sopenharmony_ci  const int threshold = (K + 1 >= kFftInnerThreshold * kDigitBits)
3181cb0ef41Sopenharmony_ci                            ? 3 + kLog2DigitBits
3191cb0ef41Sopenharmony_ci                            : kLog2DigitBits;
3201cb0ef41Sopenharmony_ci  int K_tz = CountTrailingZeros(K);
3211cb0ef41Sopenharmony_ci  while (K_tz < threshold) {
3221cb0ef41Sopenharmony_ci    K += (1 << K_tz);
3231cb0ef41Sopenharmony_ci    r = K >> (m - 1);
3241cb0ef41Sopenharmony_ci    K_tz = CountTrailingZeros(K);
3251cb0ef41Sopenharmony_ci  }
3261cb0ef41Sopenharmony_ci
3271cb0ef41Sopenharmony_ci  DCHECK(K % kDigitBits == 0);
3281cb0ef41Sopenharmony_ci  DCHECK(s % kDigitBits == 0);
3291cb0ef41Sopenharmony_ci  params->K = K / kDigitBits;
3301cb0ef41Sopenharmony_ci  params->s = s / kDigitBits;
3311cb0ef41Sopenharmony_ci  params->n = n;
3321cb0ef41Sopenharmony_ci  params->r = r;
3331cb0ef41Sopenharmony_ci}
3341cb0ef41Sopenharmony_ci
3351cb0ef41Sopenharmony_ci// Computes parameters for recursive invocations ("inner layer").
3361cb0ef41Sopenharmony_civoid ComputeParameters_Inner(int N, Parameters* params) {
3371cb0ef41Sopenharmony_ci  int max_m = CountTrailingZeros(N);
3381cb0ef41Sopenharmony_ci  int N_bits = BitLength(N);
3391cb0ef41Sopenharmony_ci  int m = N_bits - 4;  // Don't let s get too small.
3401cb0ef41Sopenharmony_ci  m = std::min(max_m, m);
3411cb0ef41Sopenharmony_ci  N *= kDigitBits;
3421cb0ef41Sopenharmony_ci  int n = 1 << m;  // 2^m
3431cb0ef41Sopenharmony_ci  // We can't round up s in the inner layer, because N = n*s is fixed.
3441cb0ef41Sopenharmony_ci  int s = N >> m;
3451cb0ef41Sopenharmony_ci  DCHECK(N == s * n);
3461cb0ef41Sopenharmony_ci  int K = m + 2 * s + 1;  // K must be at least this big...
3471cb0ef41Sopenharmony_ci  K = RoundUp(K, n);      // ...and a multiple of n and kDigitBits.
3481cb0ef41Sopenharmony_ci  K = RoundUp(K, kDigitBits);
3491cb0ef41Sopenharmony_ci  params->r = K >> m;           // Which multiple?
3501cb0ef41Sopenharmony_ci  DCHECK(K % kDigitBits == 0);
3511cb0ef41Sopenharmony_ci  DCHECK(s % kDigitBits == 0);
3521cb0ef41Sopenharmony_ci  params->K = K / kDigitBits;
3531cb0ef41Sopenharmony_ci  params->s = s / kDigitBits;
3541cb0ef41Sopenharmony_ci  params->n = n;
3551cb0ef41Sopenharmony_ci  params->m = m;
3561cb0ef41Sopenharmony_ci}
3571cb0ef41Sopenharmony_ci
3581cb0ef41Sopenharmony_ciint PredictInnerK(int N) {
3591cb0ef41Sopenharmony_ci  Parameters params;
3601cb0ef41Sopenharmony_ci  ComputeParameters_Inner(N, &params);
3611cb0ef41Sopenharmony_ci  return params.K;
3621cb0ef41Sopenharmony_ci}
3631cb0ef41Sopenharmony_ci
3641cb0ef41Sopenharmony_ci// Applies heuristics to decide whether {m} should be decremented, by looking
3651cb0ef41Sopenharmony_ci// at what would happen to {K} and {s} if {m} was decremented.
3661cb0ef41Sopenharmony_cibool ShouldDecrementM(const Parameters& current, const Parameters& next,
3671cb0ef41Sopenharmony_ci                      const Parameters& after_next) {
3681cb0ef41Sopenharmony_ci  // K == 64 seems to work particularly well.
3691cb0ef41Sopenharmony_ci  if (current.K == 64 && next.K >= 112) return false;
3701cb0ef41Sopenharmony_ci  // Small values for s are never efficient.
3711cb0ef41Sopenharmony_ci  if (current.s < 6) return true;
3721cb0ef41Sopenharmony_ci  // The time is roughly determined by K * n. When we decrement m, then
3731cb0ef41Sopenharmony_ci  // n always halves, and K usually gets bigger, by up to 2x.
3741cb0ef41Sopenharmony_ci  // For not-quite-so-small s, look at how much bigger K would get: if
3751cb0ef41Sopenharmony_ci  // the K increase is small enough, making n smaller is worth it.
3761cb0ef41Sopenharmony_ci  // Empirically, it's most meaningful to look at the K *after* next.
3771cb0ef41Sopenharmony_ci  // The specific threshold values have been chosen by running many
3781cb0ef41Sopenharmony_ci  // benchmarks on inputs of many sizes, and manually selecting thresholds
3791cb0ef41Sopenharmony_ci  // that seemed to produce good results.
3801cb0ef41Sopenharmony_ci  double factor = static_cast<double>(after_next.K) / current.K;
3811cb0ef41Sopenharmony_ci  if ((current.s == 6 && factor < 3.85) ||  // --
3821cb0ef41Sopenharmony_ci      (current.s == 7 && factor < 3.73) ||  // --
3831cb0ef41Sopenharmony_ci      (current.s == 8 && factor < 3.55) ||  // --
3841cb0ef41Sopenharmony_ci      (current.s == 9 && factor < 3.50) ||  // --
3851cb0ef41Sopenharmony_ci      factor < 3.4) {
3861cb0ef41Sopenharmony_ci    return true;
3871cb0ef41Sopenharmony_ci  }
3881cb0ef41Sopenharmony_ci  // If K is just below the recursion threshold, make sure we do recurse,
3891cb0ef41Sopenharmony_ci  // unless doing so would be particularly inefficient (large inner_K).
3901cb0ef41Sopenharmony_ci  // If K is just above the recursion threshold, doubling it often makes
3911cb0ef41Sopenharmony_ci  // the inner call more efficient.
3921cb0ef41Sopenharmony_ci  if (current.K >= 160 && current.K < 250 && PredictInnerK(next.K) < 28) {
3931cb0ef41Sopenharmony_ci    return true;
3941cb0ef41Sopenharmony_ci  }
3951cb0ef41Sopenharmony_ci  // If we found no reason to decrement, keep m as large as possible.
3961cb0ef41Sopenharmony_ci  return false;
3971cb0ef41Sopenharmony_ci}
3981cb0ef41Sopenharmony_ci
3991cb0ef41Sopenharmony_ci// Decides what parameters to use for a given input bit length {N}.
4001cb0ef41Sopenharmony_ci// Returns the chosen m.
4011cb0ef41Sopenharmony_ciint GetParameters(int N, Parameters* params) {
4021cb0ef41Sopenharmony_ci  int N_bits = BitLength(N);
4031cb0ef41Sopenharmony_ci  int max_m = N_bits - 3;                   // Larger m make s too small.
4041cb0ef41Sopenharmony_ci  max_m = std::max(kLog2DigitBits, max_m);  // Smaller m break the logic below.
4051cb0ef41Sopenharmony_ci  int m = max_m;
4061cb0ef41Sopenharmony_ci  Parameters current;
4071cb0ef41Sopenharmony_ci  ComputeParameters(N, m, &current);
4081cb0ef41Sopenharmony_ci  Parameters next;
4091cb0ef41Sopenharmony_ci  ComputeParameters(N, m - 1, &next);
4101cb0ef41Sopenharmony_ci  while (m > 2) {
4111cb0ef41Sopenharmony_ci    Parameters after_next;
4121cb0ef41Sopenharmony_ci    ComputeParameters(N, m - 2, &after_next);
4131cb0ef41Sopenharmony_ci    if (ShouldDecrementM(current, next, after_next)) {
4141cb0ef41Sopenharmony_ci      m--;
4151cb0ef41Sopenharmony_ci      current = next;
4161cb0ef41Sopenharmony_ci      next = after_next;
4171cb0ef41Sopenharmony_ci    } else {
4181cb0ef41Sopenharmony_ci      break;
4191cb0ef41Sopenharmony_ci    }
4201cb0ef41Sopenharmony_ci  }
4211cb0ef41Sopenharmony_ci  *params = current;
4221cb0ef41Sopenharmony_ci  return m;
4231cb0ef41Sopenharmony_ci}
4241cb0ef41Sopenharmony_ci
4251cb0ef41Sopenharmony_ci////////////////////////////////////////////////////////////////////////////////
4261cb0ef41Sopenharmony_ci// Part 3: Fast Fourier Transformation.
4271cb0ef41Sopenharmony_ci
4281cb0ef41Sopenharmony_ciclass FFTContainer {
4291cb0ef41Sopenharmony_ci public:
4301cb0ef41Sopenharmony_ci  // {n} is the number of chunks, whose length is {K}+1.
4311cb0ef41Sopenharmony_ci  // {K} determines F_n = 2^(K * kDigitBits) + 1.
4321cb0ef41Sopenharmony_ci  FFTContainer(int n, int K, ProcessorImpl* processor)
4331cb0ef41Sopenharmony_ci      : n_(n), K_(K), length_(K + 1), processor_(processor) {
4341cb0ef41Sopenharmony_ci    storage_ = new digit_t[length_ * n_];
4351cb0ef41Sopenharmony_ci    part_ = new digit_t*[n_];
4361cb0ef41Sopenharmony_ci    digit_t* ptr = storage_;
4371cb0ef41Sopenharmony_ci    for (int i = 0; i < n; i++, ptr += length_) {
4381cb0ef41Sopenharmony_ci      part_[i] = ptr;
4391cb0ef41Sopenharmony_ci    }
4401cb0ef41Sopenharmony_ci    temp_ = new digit_t[length_ * 2];
4411cb0ef41Sopenharmony_ci  }
4421cb0ef41Sopenharmony_ci  FFTContainer() = delete;
4431cb0ef41Sopenharmony_ci  FFTContainer(const FFTContainer&) = delete;
4441cb0ef41Sopenharmony_ci  FFTContainer& operator=(const FFTContainer&) = delete;
4451cb0ef41Sopenharmony_ci
4461cb0ef41Sopenharmony_ci  ~FFTContainer() {
4471cb0ef41Sopenharmony_ci    delete[] storage_;
4481cb0ef41Sopenharmony_ci    delete[] part_;
4491cb0ef41Sopenharmony_ci    delete[] temp_;
4501cb0ef41Sopenharmony_ci  }
4511cb0ef41Sopenharmony_ci
4521cb0ef41Sopenharmony_ci  void Start_Default(Digits X, int chunk_size, int theta, int omega);
4531cb0ef41Sopenharmony_ci  void Start(Digits X, int chunk_size, int theta, int omega);
4541cb0ef41Sopenharmony_ci
4551cb0ef41Sopenharmony_ci  void NormalizeAndRecombine(int omega, int m, RWDigits Z, int chunk_size);
4561cb0ef41Sopenharmony_ci  void CounterWeightAndRecombine(int theta, int m, RWDigits Z, int chunk_size);
4571cb0ef41Sopenharmony_ci
4581cb0ef41Sopenharmony_ci  void FFT_ReturnShuffledThreadsafe(int start, int len, int omega,
4591cb0ef41Sopenharmony_ci                                    digit_t* temp);
4601cb0ef41Sopenharmony_ci  void FFT_Recurse(int start, int half, int omega, digit_t* temp);
4611cb0ef41Sopenharmony_ci
4621cb0ef41Sopenharmony_ci  void BackwardFFT(int start, int len, int omega);
4631cb0ef41Sopenharmony_ci  void BackwardFFT_Threadsafe(int start, int len, int omega, digit_t* temp);
4641cb0ef41Sopenharmony_ci
4651cb0ef41Sopenharmony_ci  void PointwiseMultiply(const FFTContainer& other);
4661cb0ef41Sopenharmony_ci  void DoPointwiseMultiplication(const FFTContainer& other, int start, int end,
4671cb0ef41Sopenharmony_ci                                 digit_t* temp);
4681cb0ef41Sopenharmony_ci
4691cb0ef41Sopenharmony_ci  int length() const { return length_; }
4701cb0ef41Sopenharmony_ci
4711cb0ef41Sopenharmony_ci private:
4721cb0ef41Sopenharmony_ci  const int n_;       // Number of parts.
4731cb0ef41Sopenharmony_ci  const int K_;       // Always length_ - 1.
4741cb0ef41Sopenharmony_ci  const int length_;  // Length of each part, in digits.
4751cb0ef41Sopenharmony_ci  ProcessorImpl* processor_;
4761cb0ef41Sopenharmony_ci  digit_t* storage_;  // Combined storage of all parts.
4771cb0ef41Sopenharmony_ci  digit_t** part_;    // Pointers to each part.
4781cb0ef41Sopenharmony_ci  digit_t* temp_;     // Temporary storage with size 2 * length_.
4791cb0ef41Sopenharmony_ci};
4801cb0ef41Sopenharmony_ci
4811cb0ef41Sopenharmony_ciinline void CopyAndZeroExtend(digit_t* dst, const digit_t* src,
4821cb0ef41Sopenharmony_ci                              int digits_to_copy, size_t total_bytes) {
4831cb0ef41Sopenharmony_ci  size_t bytes_to_copy = digits_to_copy * sizeof(digit_t);
4841cb0ef41Sopenharmony_ci  memcpy(dst, src, bytes_to_copy);
4851cb0ef41Sopenharmony_ci  memset(dst + digits_to_copy, 0, total_bytes - bytes_to_copy);
4861cb0ef41Sopenharmony_ci}
4871cb0ef41Sopenharmony_ci
4881cb0ef41Sopenharmony_ci// Reads {X} into the FFTContainer's internal storage, dividing it into chunks
4891cb0ef41Sopenharmony_ci// while doing so; then performs the forward FFT.
4901cb0ef41Sopenharmony_civoid FFTContainer::Start_Default(Digits X, int chunk_size, int theta,
4911cb0ef41Sopenharmony_ci                                 int omega) {
4921cb0ef41Sopenharmony_ci  int len = X.len();
4931cb0ef41Sopenharmony_ci  const digit_t* pointer = X.digits();
4941cb0ef41Sopenharmony_ci  const size_t part_length_in_bytes = length_ * sizeof(digit_t);
4951cb0ef41Sopenharmony_ci  int current_theta = 0;
4961cb0ef41Sopenharmony_ci  int i = 0;
4971cb0ef41Sopenharmony_ci  for (; i < n_ && len > 0; i++, current_theta += theta) {
4981cb0ef41Sopenharmony_ci    chunk_size = std::min(chunk_size, len);
4991cb0ef41Sopenharmony_ci    // For invocations via MultiplyFFT_Inner, X.len() == n_ * chunk_size + 1,
5001cb0ef41Sopenharmony_ci    // because the outer layer's "K" is passed as the inner layer's "N".
5011cb0ef41Sopenharmony_ci    // Since X is (mod Fn)-normalized on the outer layer, there is the rare
5021cb0ef41Sopenharmony_ci    // corner case where X[n_ * chunk_size] == 1. Detect that case, and handle
5031cb0ef41Sopenharmony_ci    // the extra bit as part of the last chunk; we always have the space.
5041cb0ef41Sopenharmony_ci    if (i == n_ - 1 && len == chunk_size + 1) {
5051cb0ef41Sopenharmony_ci      DCHECK(X[n_ * chunk_size] <= 1);
5061cb0ef41Sopenharmony_ci      DCHECK(length_ >= chunk_size + 1);
5071cb0ef41Sopenharmony_ci      chunk_size++;
5081cb0ef41Sopenharmony_ci    }
5091cb0ef41Sopenharmony_ci    if (current_theta != 0) {
5101cb0ef41Sopenharmony_ci      // Multiply with theta^i, and reduce modulo 2^K + 1.
5111cb0ef41Sopenharmony_ci      // We pass theta as a shift amount; it really means 2^theta.
5121cb0ef41Sopenharmony_ci      CopyAndZeroExtend(temp_, pointer, chunk_size, part_length_in_bytes);
5131cb0ef41Sopenharmony_ci      ShiftModFn(part_[i], temp_, current_theta, K_, chunk_size);
5141cb0ef41Sopenharmony_ci    } else {
5151cb0ef41Sopenharmony_ci      CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes);
5161cb0ef41Sopenharmony_ci    }
5171cb0ef41Sopenharmony_ci    pointer += chunk_size;
5181cb0ef41Sopenharmony_ci    len -= chunk_size;
5191cb0ef41Sopenharmony_ci  }
5201cb0ef41Sopenharmony_ci  DCHECK(len == 0);
5211cb0ef41Sopenharmony_ci  for (; i < n_; i++) {
5221cb0ef41Sopenharmony_ci    memset(part_[i], 0, part_length_in_bytes);
5231cb0ef41Sopenharmony_ci  }
5241cb0ef41Sopenharmony_ci  FFT_ReturnShuffledThreadsafe(0, n_, omega, temp_);
5251cb0ef41Sopenharmony_ci}
5261cb0ef41Sopenharmony_ci
5271cb0ef41Sopenharmony_ci// This version of Start is optimized for the case where ~half of the
5281cb0ef41Sopenharmony_ci// container will be filled with padding zeros.
5291cb0ef41Sopenharmony_civoid FFTContainer::Start(Digits X, int chunk_size, int theta, int omega) {
5301cb0ef41Sopenharmony_ci  int len = X.len();
5311cb0ef41Sopenharmony_ci  if (len > n_ * chunk_size / 2) {
5321cb0ef41Sopenharmony_ci    return Start_Default(X, chunk_size, theta, omega);
5331cb0ef41Sopenharmony_ci  }
5341cb0ef41Sopenharmony_ci  DCHECK(theta == 0);
5351cb0ef41Sopenharmony_ci  const digit_t* pointer = X.digits();
5361cb0ef41Sopenharmony_ci  const size_t part_length_in_bytes = length_ * sizeof(digit_t);
5371cb0ef41Sopenharmony_ci  int nhalf = n_ / 2;
5381cb0ef41Sopenharmony_ci  // Unrolled first iteration.
5391cb0ef41Sopenharmony_ci  CopyAndZeroExtend(part_[0], pointer, chunk_size, part_length_in_bytes);
5401cb0ef41Sopenharmony_ci  CopyAndZeroExtend(part_[nhalf], pointer, chunk_size, part_length_in_bytes);
5411cb0ef41Sopenharmony_ci  pointer += chunk_size;
5421cb0ef41Sopenharmony_ci  len -= chunk_size;
5431cb0ef41Sopenharmony_ci  int i = 1;
5441cb0ef41Sopenharmony_ci  for (; i < nhalf && len > 0; i++) {
5451cb0ef41Sopenharmony_ci    chunk_size = std::min(chunk_size, len);
5461cb0ef41Sopenharmony_ci    CopyAndZeroExtend(part_[i], pointer, chunk_size, part_length_in_bytes);
5471cb0ef41Sopenharmony_ci    int w = omega * i;
5481cb0ef41Sopenharmony_ci    ShiftModFn(part_[i + nhalf], part_[i], w, K_, chunk_size);
5491cb0ef41Sopenharmony_ci    pointer += chunk_size;
5501cb0ef41Sopenharmony_ci    len -= chunk_size;
5511cb0ef41Sopenharmony_ci  }
5521cb0ef41Sopenharmony_ci  for (; i < nhalf; i++) {
5531cb0ef41Sopenharmony_ci    memset(part_[i], 0, part_length_in_bytes);
5541cb0ef41Sopenharmony_ci    memset(part_[i + nhalf], 0, part_length_in_bytes);
5551cb0ef41Sopenharmony_ci  }
5561cb0ef41Sopenharmony_ci  FFT_Recurse(0, nhalf, omega, temp_);
5571cb0ef41Sopenharmony_ci}
5581cb0ef41Sopenharmony_ci
5591cb0ef41Sopenharmony_ci// Forward transformation.
5601cb0ef41Sopenharmony_ci// We use the "DIF" aka "decimation in frequency" transform, because it
5611cb0ef41Sopenharmony_ci// leaves the result in "bit reversed" order, which is precisely what we
5621cb0ef41Sopenharmony_ci// need as input for the "DIT" aka "decimation in time" backwards transform.
5631cb0ef41Sopenharmony_civoid FFTContainer::FFT_ReturnShuffledThreadsafe(int start, int len, int omega,
5641cb0ef41Sopenharmony_ci                                                digit_t* temp) {
5651cb0ef41Sopenharmony_ci  DCHECK((len & 1) == 0);  // {len} must be even.
5661cb0ef41Sopenharmony_ci  int half = len / 2;
5671cb0ef41Sopenharmony_ci  SumDiff(part_[start], part_[start + half], part_[start], part_[start + half],
5681cb0ef41Sopenharmony_ci          length_);
5691cb0ef41Sopenharmony_ci  for (int k = 1; k < half; k++) {
5701cb0ef41Sopenharmony_ci    SumDiff(part_[start + k], temp, part_[start + k], part_[start + half + k],
5711cb0ef41Sopenharmony_ci            length_);
5721cb0ef41Sopenharmony_ci    int w = omega * k;
5731cb0ef41Sopenharmony_ci    ShiftModFn(part_[start + half + k], temp, w, K_);
5741cb0ef41Sopenharmony_ci  }
5751cb0ef41Sopenharmony_ci  FFT_Recurse(start, half, omega, temp);
5761cb0ef41Sopenharmony_ci}
5771cb0ef41Sopenharmony_ci
5781cb0ef41Sopenharmony_ci// Recursive step of the above, factored out for additional callers.
5791cb0ef41Sopenharmony_civoid FFTContainer::FFT_Recurse(int start, int half, int omega, digit_t* temp) {
5801cb0ef41Sopenharmony_ci  if (half > 1) {
5811cb0ef41Sopenharmony_ci    FFT_ReturnShuffledThreadsafe(start, half, 2 * omega, temp);
5821cb0ef41Sopenharmony_ci    FFT_ReturnShuffledThreadsafe(start + half, half, 2 * omega, temp);
5831cb0ef41Sopenharmony_ci  }
5841cb0ef41Sopenharmony_ci}
5851cb0ef41Sopenharmony_ci
5861cb0ef41Sopenharmony_ci// Backward transformation.
5871cb0ef41Sopenharmony_ci// We use the "DIT" aka "decimation in time" transform here, because it
5881cb0ef41Sopenharmony_ci// turns bit-reversed input into normally sorted output.
5891cb0ef41Sopenharmony_civoid FFTContainer::BackwardFFT(int start, int len, int omega) {
5901cb0ef41Sopenharmony_ci  BackwardFFT_Threadsafe(start, len, omega, temp_);
5911cb0ef41Sopenharmony_ci}
5921cb0ef41Sopenharmony_ci
5931cb0ef41Sopenharmony_civoid FFTContainer::BackwardFFT_Threadsafe(int start, int len, int omega,
5941cb0ef41Sopenharmony_ci                                          digit_t* temp) {
5951cb0ef41Sopenharmony_ci  DCHECK((len & 1) == 0);  // {len} must be even.
5961cb0ef41Sopenharmony_ci  int half = len / 2;
5971cb0ef41Sopenharmony_ci  // Don't recurse for half == 2, as PointwiseMultiply already performed
5981cb0ef41Sopenharmony_ci  // the first level of the backwards FFT.
5991cb0ef41Sopenharmony_ci  if (half > 2) {
6001cb0ef41Sopenharmony_ci    BackwardFFT_Threadsafe(start, half, 2 * omega, temp);
6011cb0ef41Sopenharmony_ci    BackwardFFT_Threadsafe(start + half, half, 2 * omega, temp);
6021cb0ef41Sopenharmony_ci  }
6031cb0ef41Sopenharmony_ci  SumDiff(part_[start], part_[start + half], part_[start], part_[start + half],
6041cb0ef41Sopenharmony_ci          length_);
6051cb0ef41Sopenharmony_ci  for (int k = 1; k < half; k++) {
6061cb0ef41Sopenharmony_ci    int w = omega * (len - k);
6071cb0ef41Sopenharmony_ci    ShiftModFn(temp, part_[start + half + k], w, K_);
6081cb0ef41Sopenharmony_ci    SumDiff(part_[start + k], part_[start + half + k], part_[start + k], temp,
6091cb0ef41Sopenharmony_ci            length_);
6101cb0ef41Sopenharmony_ci  }
6111cb0ef41Sopenharmony_ci}
6121cb0ef41Sopenharmony_ci
6131cb0ef41Sopenharmony_ci// Recombines the result's parts into {Z}, after backwards FFT.
6141cb0ef41Sopenharmony_civoid FFTContainer::NormalizeAndRecombine(int omega, int m, RWDigits Z,
6151cb0ef41Sopenharmony_ci                                         int chunk_size) {
6161cb0ef41Sopenharmony_ci  Z.Clear();
6171cb0ef41Sopenharmony_ci  int z_index = 0;
6181cb0ef41Sopenharmony_ci  const int shift = n_ * omega - m;
6191cb0ef41Sopenharmony_ci  for (int i = 0; i < n_; i++, z_index += chunk_size) {
6201cb0ef41Sopenharmony_ci    digit_t* part = part_[i];
6211cb0ef41Sopenharmony_ci    ShiftModFn(temp_, part, shift, K_);
6221cb0ef41Sopenharmony_ci    digit_t carry = 0;
6231cb0ef41Sopenharmony_ci    int zi = z_index;
6241cb0ef41Sopenharmony_ci    int j = 0;
6251cb0ef41Sopenharmony_ci    for (; j < length_ && zi < Z.len(); j++, zi++) {
6261cb0ef41Sopenharmony_ci      Z[zi] = digit_add3(Z[zi], temp_[j], carry, &carry);
6271cb0ef41Sopenharmony_ci    }
6281cb0ef41Sopenharmony_ci    for (; j < length_; j++) {
6291cb0ef41Sopenharmony_ci      DCHECK(temp_[j] == 0);
6301cb0ef41Sopenharmony_ci    }
6311cb0ef41Sopenharmony_ci    if (carry != 0) {
6321cb0ef41Sopenharmony_ci      DCHECK(zi < Z.len());
6331cb0ef41Sopenharmony_ci      Z[zi] = carry;
6341cb0ef41Sopenharmony_ci    }
6351cb0ef41Sopenharmony_ci  }
6361cb0ef41Sopenharmony_ci}
6371cb0ef41Sopenharmony_ci
6381cb0ef41Sopenharmony_ci// Helper function for {CounterWeightAndRecombine} below.
6391cb0ef41Sopenharmony_cibool ShouldBeNegative(const digit_t* x, int xlen, digit_t threshold, int s) {
6401cb0ef41Sopenharmony_ci  if (x[2 * s] >= threshold) return true;
6411cb0ef41Sopenharmony_ci  for (int i = 2 * s + 1; i < xlen; i++) {
6421cb0ef41Sopenharmony_ci    if (x[i] > 0) return true;
6431cb0ef41Sopenharmony_ci  }
6441cb0ef41Sopenharmony_ci  return false;
6451cb0ef41Sopenharmony_ci}
6461cb0ef41Sopenharmony_ci
6471cb0ef41Sopenharmony_ci// Same as {NormalizeAndRecombine} above, but for the needs of the recursive
6481cb0ef41Sopenharmony_ci// invocation ("inner layer") of FFT multiplication, where an additional
6491cb0ef41Sopenharmony_ci// counter-weighting step is required.
6501cb0ef41Sopenharmony_civoid FFTContainer::CounterWeightAndRecombine(int theta, int m, RWDigits Z,
6511cb0ef41Sopenharmony_ci                                             int s) {
6521cb0ef41Sopenharmony_ci  Z.Clear();
6531cb0ef41Sopenharmony_ci  int z_index = 0;
6541cb0ef41Sopenharmony_ci  for (int k = 0; k < n_; k++, z_index += s) {
6551cb0ef41Sopenharmony_ci    int shift = -theta * k - m;
6561cb0ef41Sopenharmony_ci    if (shift < 0) shift += 2 * n_ * theta;
6571cb0ef41Sopenharmony_ci    DCHECK(shift >= 0);
6581cb0ef41Sopenharmony_ci    digit_t* input = part_[k];
6591cb0ef41Sopenharmony_ci    ShiftModFn(temp_, input, shift, K_);
6601cb0ef41Sopenharmony_ci    int remaining_z = Z.len() - z_index;
6611cb0ef41Sopenharmony_ci    if (ShouldBeNegative(temp_, length_, k + 1, s)) {
6621cb0ef41Sopenharmony_ci      // Subtract F_n from input before adding to result. We use the following
6631cb0ef41Sopenharmony_ci      // transformation (knowing that X < F_n):
6641cb0ef41Sopenharmony_ci      // Z + (X - F_n) == Z - (F_n - X)
6651cb0ef41Sopenharmony_ci      digit_t borrow_z = 0;
6661cb0ef41Sopenharmony_ci      digit_t borrow_Fn = 0;
6671cb0ef41Sopenharmony_ci      {
6681cb0ef41Sopenharmony_ci        // i == 0:
6691cb0ef41Sopenharmony_ci        digit_t d = digit_sub(1, temp_[0], &borrow_Fn);
6701cb0ef41Sopenharmony_ci        Z[z_index] = digit_sub(Z[z_index], d, &borrow_z);
6711cb0ef41Sopenharmony_ci      }
6721cb0ef41Sopenharmony_ci      int i = 1;
6731cb0ef41Sopenharmony_ci      for (; i < K_ && i < remaining_z; i++) {
6741cb0ef41Sopenharmony_ci        digit_t d = digit_sub2(0, temp_[i], borrow_Fn, &borrow_Fn);
6751cb0ef41Sopenharmony_ci        Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z);
6761cb0ef41Sopenharmony_ci      }
6771cb0ef41Sopenharmony_ci      DCHECK(i == K_ && K_ == length_ - 1);
6781cb0ef41Sopenharmony_ci      for (; i < length_ && i < remaining_z; i++) {
6791cb0ef41Sopenharmony_ci        digit_t d = digit_sub2(1, temp_[i], borrow_Fn, &borrow_Fn);
6801cb0ef41Sopenharmony_ci        Z[z_index + i] = digit_sub2(Z[z_index + i], d, borrow_z, &borrow_z);
6811cb0ef41Sopenharmony_ci      }
6821cb0ef41Sopenharmony_ci      DCHECK(borrow_Fn == 0);
6831cb0ef41Sopenharmony_ci      for (; borrow_z > 0 && i < remaining_z; i++) {
6841cb0ef41Sopenharmony_ci        Z[z_index + i] = digit_sub(Z[z_index + i], borrow_z, &borrow_z);
6851cb0ef41Sopenharmony_ci      }
6861cb0ef41Sopenharmony_ci    } else {
6871cb0ef41Sopenharmony_ci      digit_t carry = 0;
6881cb0ef41Sopenharmony_ci      int i = 0;
6891cb0ef41Sopenharmony_ci      for (; i < length_ && i < remaining_z; i++) {
6901cb0ef41Sopenharmony_ci        Z[z_index + i] = digit_add3(Z[z_index + i], temp_[i], carry, &carry);
6911cb0ef41Sopenharmony_ci      }
6921cb0ef41Sopenharmony_ci      for (; i < length_; i++) {
6931cb0ef41Sopenharmony_ci        DCHECK(temp_[i] == 0);
6941cb0ef41Sopenharmony_ci      }
6951cb0ef41Sopenharmony_ci      for (; carry > 0 && i < remaining_z; i++) {
6961cb0ef41Sopenharmony_ci        Z[z_index + i] = digit_add2(Z[z_index + i], carry, &carry);
6971cb0ef41Sopenharmony_ci      }
6981cb0ef41Sopenharmony_ci      // {carry} might be != 0 here if Z was negative before. That's fine.
6991cb0ef41Sopenharmony_ci    }
7001cb0ef41Sopenharmony_ci  }
7011cb0ef41Sopenharmony_ci}
7021cb0ef41Sopenharmony_ci
7031cb0ef41Sopenharmony_ci// Main FFT function for recursive invocations ("inner layer").
7041cb0ef41Sopenharmony_civoid MultiplyFFT_Inner(RWDigits Z, Digits X, Digits Y, const Parameters& params,
7051cb0ef41Sopenharmony_ci                       ProcessorImpl* processor) {
7061cb0ef41Sopenharmony_ci  int omega = 2 * params.r;  // really: 2^(2r)
7071cb0ef41Sopenharmony_ci  int theta = params.r;      // really: 2^r
7081cb0ef41Sopenharmony_ci
7091cb0ef41Sopenharmony_ci  FFTContainer a(params.n, params.K, processor);
7101cb0ef41Sopenharmony_ci  a.Start_Default(X, params.s, theta, omega);
7111cb0ef41Sopenharmony_ci  FFTContainer b(params.n, params.K, processor);
7121cb0ef41Sopenharmony_ci  b.Start_Default(Y, params.s, theta, omega);
7131cb0ef41Sopenharmony_ci
7141cb0ef41Sopenharmony_ci  a.PointwiseMultiply(b);
7151cb0ef41Sopenharmony_ci  if (processor->should_terminate()) return;
7161cb0ef41Sopenharmony_ci
7171cb0ef41Sopenharmony_ci  FFTContainer& c = a;
7181cb0ef41Sopenharmony_ci  c.BackwardFFT(0, params.n, omega);
7191cb0ef41Sopenharmony_ci
7201cb0ef41Sopenharmony_ci  c.CounterWeightAndRecombine(theta, params.m, Z, params.s);
7211cb0ef41Sopenharmony_ci}
7221cb0ef41Sopenharmony_ci
7231cb0ef41Sopenharmony_ci// Actual implementation of pointwise multiplications.
7241cb0ef41Sopenharmony_civoid FFTContainer::DoPointwiseMultiplication(const FFTContainer& other,
7251cb0ef41Sopenharmony_ci                                             int start, int end,
7261cb0ef41Sopenharmony_ci                                             digit_t* temp) {
7271cb0ef41Sopenharmony_ci  // The (K_ & 3) != 0 condition makes sure that the inner FFT gets
7281cb0ef41Sopenharmony_ci  // to split the work into at least 4 chunks.
7291cb0ef41Sopenharmony_ci  bool use_fft = length_ >= kFftInnerThreshold && (K_ & 3) == 0;
7301cb0ef41Sopenharmony_ci  Parameters params;
7311cb0ef41Sopenharmony_ci  if (use_fft) ComputeParameters_Inner(K_, &params);
7321cb0ef41Sopenharmony_ci  RWDigits result(temp, 2 * length_);
7331cb0ef41Sopenharmony_ci  for (int i = start; i < end; i++) {
7341cb0ef41Sopenharmony_ci    Digits A(part_[i], length_);
7351cb0ef41Sopenharmony_ci    Digits B(other.part_[i], length_);
7361cb0ef41Sopenharmony_ci    if (use_fft) {
7371cb0ef41Sopenharmony_ci      MultiplyFFT_Inner(result, A, B, params, processor_);
7381cb0ef41Sopenharmony_ci    } else {
7391cb0ef41Sopenharmony_ci      processor_->Multiply(result, A, B);
7401cb0ef41Sopenharmony_ci    }
7411cb0ef41Sopenharmony_ci    if (processor_->should_terminate()) return;
7421cb0ef41Sopenharmony_ci    ModFnDoubleWidth(part_[i], result.digits(), length_);
7431cb0ef41Sopenharmony_ci    // To improve cache friendliness, we perform the first level of the
7441cb0ef41Sopenharmony_ci    // backwards FFT here.
7451cb0ef41Sopenharmony_ci    if ((i & 1) == 1) {
7461cb0ef41Sopenharmony_ci      SumDiff(part_[i - 1], part_[i], part_[i - 1], part_[i], length_);
7471cb0ef41Sopenharmony_ci    }
7481cb0ef41Sopenharmony_ci  }
7491cb0ef41Sopenharmony_ci}
7501cb0ef41Sopenharmony_ci
7511cb0ef41Sopenharmony_ci// Convenient entry point for pointwise multiplications.
7521cb0ef41Sopenharmony_civoid FFTContainer::PointwiseMultiply(const FFTContainer& other) {
7531cb0ef41Sopenharmony_ci  DCHECK(n_ == other.n_);
7541cb0ef41Sopenharmony_ci  DoPointwiseMultiplication(other, 0, n_, temp_);
7551cb0ef41Sopenharmony_ci}
7561cb0ef41Sopenharmony_ci
7571cb0ef41Sopenharmony_ci}  // namespace
7581cb0ef41Sopenharmony_ci
7591cb0ef41Sopenharmony_ci////////////////////////////////////////////////////////////////////////////////
7601cb0ef41Sopenharmony_ci// Part 4: Tying everything together into a multiplication algorithm.
7611cb0ef41Sopenharmony_ci
7621cb0ef41Sopenharmony_ci// TODO(jkummerow): Consider doing a "Mersenne transform" and CRT reconstruction
7631cb0ef41Sopenharmony_ci// of the final result. Might yield a few percent of perf improvement.
7641cb0ef41Sopenharmony_ci
7651cb0ef41Sopenharmony_ci// TODO(jkummerow): Consider implementing the "sqrt(2) trick".
7661cb0ef41Sopenharmony_ci// Gaudry/Kruppa/Zimmerman report that it saved them around 10%.
7671cb0ef41Sopenharmony_ci
7681cb0ef41Sopenharmony_civoid ProcessorImpl::MultiplyFFT(RWDigits Z, Digits X, Digits Y) {
7691cb0ef41Sopenharmony_ci  Parameters params;
7701cb0ef41Sopenharmony_ci  int m = GetParameters(X.len() + Y.len(), &params);
7711cb0ef41Sopenharmony_ci  int omega = params.r;  // really: 2^r
7721cb0ef41Sopenharmony_ci
7731cb0ef41Sopenharmony_ci  FFTContainer a(params.n, params.K, this);
7741cb0ef41Sopenharmony_ci  a.Start(X, params.s, 0, omega);
7751cb0ef41Sopenharmony_ci  if (X == Y) {
7761cb0ef41Sopenharmony_ci    // Squaring.
7771cb0ef41Sopenharmony_ci    a.PointwiseMultiply(a);
7781cb0ef41Sopenharmony_ci  } else {
7791cb0ef41Sopenharmony_ci    FFTContainer b(params.n, params.K, this);
7801cb0ef41Sopenharmony_ci    b.Start(Y, params.s, 0, omega);
7811cb0ef41Sopenharmony_ci    a.PointwiseMultiply(b);
7821cb0ef41Sopenharmony_ci  }
7831cb0ef41Sopenharmony_ci  if (should_terminate()) return;
7841cb0ef41Sopenharmony_ci
7851cb0ef41Sopenharmony_ci  a.BackwardFFT(0, params.n, omega);
7861cb0ef41Sopenharmony_ci  a.NormalizeAndRecombine(omega, m, Z, params.s);
7871cb0ef41Sopenharmony_ci}
7881cb0ef41Sopenharmony_ci
7891cb0ef41Sopenharmony_ci}  // namespace bigint
7901cb0ef41Sopenharmony_ci}  // namespace v8
791