17db96d56Sopenharmony_ci/*
27db96d56Sopenharmony_ci * Copyright (c) 2008-2020 Stefan Krah. All rights reserved.
37db96d56Sopenharmony_ci *
47db96d56Sopenharmony_ci * Redistribution and use in source and binary forms, with or without
57db96d56Sopenharmony_ci * modification, are permitted provided that the following conditions
67db96d56Sopenharmony_ci * are met:
77db96d56Sopenharmony_ci *
87db96d56Sopenharmony_ci * 1. Redistributions of source code must retain the above copyright
97db96d56Sopenharmony_ci *    notice, this list of conditions and the following disclaimer.
107db96d56Sopenharmony_ci *
117db96d56Sopenharmony_ci * 2. Redistributions in binary form must reproduce the above copyright
127db96d56Sopenharmony_ci *    notice, this list of conditions and the following disclaimer in the
137db96d56Sopenharmony_ci *    documentation and/or other materials provided with the distribution.
147db96d56Sopenharmony_ci *
157db96d56Sopenharmony_ci * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
167db96d56Sopenharmony_ci * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
177db96d56Sopenharmony_ci * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
187db96d56Sopenharmony_ci * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
197db96d56Sopenharmony_ci * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
207db96d56Sopenharmony_ci * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
217db96d56Sopenharmony_ci * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
227db96d56Sopenharmony_ci * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
237db96d56Sopenharmony_ci * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
247db96d56Sopenharmony_ci * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
257db96d56Sopenharmony_ci * SUCH DAMAGE.
267db96d56Sopenharmony_ci */
277db96d56Sopenharmony_ci
287db96d56Sopenharmony_ci
297db96d56Sopenharmony_ci#include "mpdecimal.h"
307db96d56Sopenharmony_ci
317db96d56Sopenharmony_ci#include <assert.h>
327db96d56Sopenharmony_ci
337db96d56Sopenharmony_ci#include "constants.h"
347db96d56Sopenharmony_ci#include "fourstep.h"
357db96d56Sopenharmony_ci#include "numbertheory.h"
367db96d56Sopenharmony_ci#include "sixstep.h"
377db96d56Sopenharmony_ci#include "umodarith.h"
387db96d56Sopenharmony_ci
397db96d56Sopenharmony_ci
407db96d56Sopenharmony_ci/* Bignum: Cache efficient Matrix Fourier Transform for arrays of the
417db96d56Sopenharmony_ci   form 3 * 2**n (See literature/matrix-transform.txt). */
427db96d56Sopenharmony_ci
437db96d56Sopenharmony_ci
447db96d56Sopenharmony_ci#ifndef PPRO
457db96d56Sopenharmony_cistatic inline void
467db96d56Sopenharmony_cistd_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3,
477db96d56Sopenharmony_ci              mpd_uint_t w3table[3], mpd_uint_t umod)
487db96d56Sopenharmony_ci{
497db96d56Sopenharmony_ci    mpd_uint_t r1, r2;
507db96d56Sopenharmony_ci    mpd_uint_t w;
517db96d56Sopenharmony_ci    mpd_uint_t s, tmp;
527db96d56Sopenharmony_ci
537db96d56Sopenharmony_ci
547db96d56Sopenharmony_ci    /* k = 0 -> w = 1 */
557db96d56Sopenharmony_ci    s = *x1;
567db96d56Sopenharmony_ci    s = addmod(s, *x2, umod);
577db96d56Sopenharmony_ci    s = addmod(s, *x3, umod);
587db96d56Sopenharmony_ci
597db96d56Sopenharmony_ci    r1 = s;
607db96d56Sopenharmony_ci
617db96d56Sopenharmony_ci    /* k = 1 */
627db96d56Sopenharmony_ci    s = *x1;
637db96d56Sopenharmony_ci
647db96d56Sopenharmony_ci    w = w3table[1];
657db96d56Sopenharmony_ci    tmp = MULMOD(*x2, w);
667db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
677db96d56Sopenharmony_ci
687db96d56Sopenharmony_ci    w = w3table[2];
697db96d56Sopenharmony_ci    tmp = MULMOD(*x3, w);
707db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
717db96d56Sopenharmony_ci
727db96d56Sopenharmony_ci    r2 = s;
737db96d56Sopenharmony_ci
747db96d56Sopenharmony_ci    /* k = 2 */
757db96d56Sopenharmony_ci    s = *x1;
767db96d56Sopenharmony_ci
777db96d56Sopenharmony_ci    w = w3table[2];
787db96d56Sopenharmony_ci    tmp = MULMOD(*x2, w);
797db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
807db96d56Sopenharmony_ci
817db96d56Sopenharmony_ci    w = w3table[1];
827db96d56Sopenharmony_ci    tmp = MULMOD(*x3, w);
837db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
847db96d56Sopenharmony_ci
857db96d56Sopenharmony_ci    *x3 = s;
867db96d56Sopenharmony_ci    *x2 = r2;
877db96d56Sopenharmony_ci    *x1 = r1;
887db96d56Sopenharmony_ci}
897db96d56Sopenharmony_ci#else /* PPRO */
907db96d56Sopenharmony_cistatic inline void
917db96d56Sopenharmony_cippro_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, mpd_uint_t w3table[3],
927db96d56Sopenharmony_ci               mpd_uint_t umod, double *dmod, uint32_t dinvmod[3])
937db96d56Sopenharmony_ci{
947db96d56Sopenharmony_ci    mpd_uint_t r1, r2;
957db96d56Sopenharmony_ci    mpd_uint_t w;
967db96d56Sopenharmony_ci    mpd_uint_t s, tmp;
977db96d56Sopenharmony_ci
987db96d56Sopenharmony_ci
997db96d56Sopenharmony_ci    /* k = 0 -> w = 1 */
1007db96d56Sopenharmony_ci    s = *x1;
1017db96d56Sopenharmony_ci    s = addmod(s, *x2, umod);
1027db96d56Sopenharmony_ci    s = addmod(s, *x3, umod);
1037db96d56Sopenharmony_ci
1047db96d56Sopenharmony_ci    r1 = s;
1057db96d56Sopenharmony_ci
1067db96d56Sopenharmony_ci    /* k = 1 */
1077db96d56Sopenharmony_ci    s = *x1;
1087db96d56Sopenharmony_ci
1097db96d56Sopenharmony_ci    w = w3table[1];
1107db96d56Sopenharmony_ci    tmp = ppro_mulmod(*x2, w, dmod, dinvmod);
1117db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
1127db96d56Sopenharmony_ci
1137db96d56Sopenharmony_ci    w = w3table[2];
1147db96d56Sopenharmony_ci    tmp = ppro_mulmod(*x3, w, dmod, dinvmod);
1157db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
1167db96d56Sopenharmony_ci
1177db96d56Sopenharmony_ci    r2 = s;
1187db96d56Sopenharmony_ci
1197db96d56Sopenharmony_ci    /* k = 2 */
1207db96d56Sopenharmony_ci    s = *x1;
1217db96d56Sopenharmony_ci
1227db96d56Sopenharmony_ci    w = w3table[2];
1237db96d56Sopenharmony_ci    tmp = ppro_mulmod(*x2, w, dmod, dinvmod);
1247db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
1257db96d56Sopenharmony_ci
1267db96d56Sopenharmony_ci    w = w3table[1];
1277db96d56Sopenharmony_ci    tmp = ppro_mulmod(*x3, w, dmod, dinvmod);
1287db96d56Sopenharmony_ci    s = addmod(s, tmp, umod);
1297db96d56Sopenharmony_ci
1307db96d56Sopenharmony_ci    *x3 = s;
1317db96d56Sopenharmony_ci    *x2 = r2;
1327db96d56Sopenharmony_ci    *x1 = r1;
1337db96d56Sopenharmony_ci}
1347db96d56Sopenharmony_ci#endif
1357db96d56Sopenharmony_ci
1367db96d56Sopenharmony_ci
1377db96d56Sopenharmony_ci/* forward transform, sign = -1; transform length = 3 * 2**n */
1387db96d56Sopenharmony_ciint
1397db96d56Sopenharmony_cifour_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
1407db96d56Sopenharmony_ci{
1417db96d56Sopenharmony_ci    mpd_size_t R = 3; /* number of rows */
1427db96d56Sopenharmony_ci    mpd_size_t C = n / 3; /* number of columns */
1437db96d56Sopenharmony_ci    mpd_uint_t w3table[3];
1447db96d56Sopenharmony_ci    mpd_uint_t kernel, w0, w1, wstep;
1457db96d56Sopenharmony_ci    mpd_uint_t *s, *p0, *p1, *p2;
1467db96d56Sopenharmony_ci    mpd_uint_t umod;
1477db96d56Sopenharmony_ci#ifdef PPRO
1487db96d56Sopenharmony_ci    double dmod;
1497db96d56Sopenharmony_ci    uint32_t dinvmod[3];
1507db96d56Sopenharmony_ci#endif
1517db96d56Sopenharmony_ci    mpd_size_t i, k;
1527db96d56Sopenharmony_ci
1537db96d56Sopenharmony_ci
1547db96d56Sopenharmony_ci    assert(n >= 48);
1557db96d56Sopenharmony_ci    assert(n <= 3*MPD_MAXTRANSFORM_2N);
1567db96d56Sopenharmony_ci
1577db96d56Sopenharmony_ci
1587db96d56Sopenharmony_ci    /* Length R transform on the columns. */
1597db96d56Sopenharmony_ci    SETMODULUS(modnum);
1607db96d56Sopenharmony_ci    _mpd_init_w3table(w3table, -1, modnum);
1617db96d56Sopenharmony_ci    for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {
1627db96d56Sopenharmony_ci
1637db96d56Sopenharmony_ci        SIZE3_NTT(p0, p1, p2, w3table);
1647db96d56Sopenharmony_ci    }
1657db96d56Sopenharmony_ci
1667db96d56Sopenharmony_ci    /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */
1677db96d56Sopenharmony_ci    kernel = _mpd_getkernel(n, -1, modnum);
1687db96d56Sopenharmony_ci    for (i = 1; i < R; i++) {
1697db96d56Sopenharmony_ci        w0 = 1;                  /* r**(i*0): initial value for k=0 */
1707db96d56Sopenharmony_ci        w1 = POWMOD(kernel, i);  /* r**(i*1): initial value for k=1 */
1717db96d56Sopenharmony_ci        wstep = MULMOD(w1, w1);  /* r**(2*i) */
1727db96d56Sopenharmony_ci        for (k = 0; k < C-1; k += 2) {
1737db96d56Sopenharmony_ci            mpd_uint_t x0 = a[i*C+k];
1747db96d56Sopenharmony_ci            mpd_uint_t x1 = a[i*C+k+1];
1757db96d56Sopenharmony_ci            MULMOD2(&x0, w0, &x1, w1);
1767db96d56Sopenharmony_ci            MULMOD2C(&w0, &w1, wstep);  /* r**(i*(k+2)) = r**(i*k) * r**(2*i) */
1777db96d56Sopenharmony_ci            a[i*C+k] = x0;
1787db96d56Sopenharmony_ci            a[i*C+k+1] = x1;
1797db96d56Sopenharmony_ci        }
1807db96d56Sopenharmony_ci    }
1817db96d56Sopenharmony_ci
1827db96d56Sopenharmony_ci    /* Length C transform on the rows. */
1837db96d56Sopenharmony_ci    for (s = a; s < a+n; s += C) {
1847db96d56Sopenharmony_ci        if (!six_step_fnt(s, C, modnum)) {
1857db96d56Sopenharmony_ci            return 0;
1867db96d56Sopenharmony_ci        }
1877db96d56Sopenharmony_ci    }
1887db96d56Sopenharmony_ci
1897db96d56Sopenharmony_ci#if 0
1907db96d56Sopenharmony_ci    /* An unordered transform is sufficient for convolution. */
1917db96d56Sopenharmony_ci    /* Transpose the matrix. */
1927db96d56Sopenharmony_ci    #include "transpose.h"
1937db96d56Sopenharmony_ci    transpose_3xpow2(a, R, C);
1947db96d56Sopenharmony_ci#endif
1957db96d56Sopenharmony_ci
1967db96d56Sopenharmony_ci    return 1;
1977db96d56Sopenharmony_ci}
1987db96d56Sopenharmony_ci
1997db96d56Sopenharmony_ci/* backward transform, sign = 1; transform length = 3 * 2**n */
2007db96d56Sopenharmony_ciint
2017db96d56Sopenharmony_ciinv_four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
2027db96d56Sopenharmony_ci{
2037db96d56Sopenharmony_ci    mpd_size_t R = 3; /* number of rows */
2047db96d56Sopenharmony_ci    mpd_size_t C = n / 3; /* number of columns */
2057db96d56Sopenharmony_ci    mpd_uint_t w3table[3];
2067db96d56Sopenharmony_ci    mpd_uint_t kernel, w0, w1, wstep;
2077db96d56Sopenharmony_ci    mpd_uint_t *s, *p0, *p1, *p2;
2087db96d56Sopenharmony_ci    mpd_uint_t umod;
2097db96d56Sopenharmony_ci#ifdef PPRO
2107db96d56Sopenharmony_ci    double dmod;
2117db96d56Sopenharmony_ci    uint32_t dinvmod[3];
2127db96d56Sopenharmony_ci#endif
2137db96d56Sopenharmony_ci    mpd_size_t i, k;
2147db96d56Sopenharmony_ci
2157db96d56Sopenharmony_ci
2167db96d56Sopenharmony_ci    assert(n >= 48);
2177db96d56Sopenharmony_ci    assert(n <= 3*MPD_MAXTRANSFORM_2N);
2187db96d56Sopenharmony_ci
2197db96d56Sopenharmony_ci
2207db96d56Sopenharmony_ci#if 0
2217db96d56Sopenharmony_ci    /* An unordered transform is sufficient for convolution. */
2227db96d56Sopenharmony_ci    /* Transpose the matrix, producing an R*C matrix. */
2237db96d56Sopenharmony_ci    #include "transpose.h"
2247db96d56Sopenharmony_ci    transpose_3xpow2(a, C, R);
2257db96d56Sopenharmony_ci#endif
2267db96d56Sopenharmony_ci
2277db96d56Sopenharmony_ci    /* Length C transform on the rows. */
2287db96d56Sopenharmony_ci    for (s = a; s < a+n; s += C) {
2297db96d56Sopenharmony_ci        if (!inv_six_step_fnt(s, C, modnum)) {
2307db96d56Sopenharmony_ci            return 0;
2317db96d56Sopenharmony_ci        }
2327db96d56Sopenharmony_ci    }
2337db96d56Sopenharmony_ci
2347db96d56Sopenharmony_ci    /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */
2357db96d56Sopenharmony_ci    SETMODULUS(modnum);
2367db96d56Sopenharmony_ci    kernel = _mpd_getkernel(n, 1, modnum);
2377db96d56Sopenharmony_ci    for (i = 1; i < R; i++) {
2387db96d56Sopenharmony_ci        w0 = 1;
2397db96d56Sopenharmony_ci        w1 = POWMOD(kernel, i);
2407db96d56Sopenharmony_ci        wstep = MULMOD(w1, w1);
2417db96d56Sopenharmony_ci        for (k = 0; k < C; k += 2) {
2427db96d56Sopenharmony_ci            mpd_uint_t x0 = a[i*C+k];
2437db96d56Sopenharmony_ci            mpd_uint_t x1 = a[i*C+k+1];
2447db96d56Sopenharmony_ci            MULMOD2(&x0, w0, &x1, w1);
2457db96d56Sopenharmony_ci            MULMOD2C(&w0, &w1, wstep);
2467db96d56Sopenharmony_ci            a[i*C+k] = x0;
2477db96d56Sopenharmony_ci            a[i*C+k+1] = x1;
2487db96d56Sopenharmony_ci        }
2497db96d56Sopenharmony_ci    }
2507db96d56Sopenharmony_ci
2517db96d56Sopenharmony_ci    /* Length R transform on the columns. */
2527db96d56Sopenharmony_ci    _mpd_init_w3table(w3table, 1, modnum);
2537db96d56Sopenharmony_ci    for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {
2547db96d56Sopenharmony_ci
2557db96d56Sopenharmony_ci        SIZE3_NTT(p0, p1, p2, w3table);
2567db96d56Sopenharmony_ci    }
2577db96d56Sopenharmony_ci
2587db96d56Sopenharmony_ci    return 1;
2597db96d56Sopenharmony_ci}
260