17db96d56Sopenharmony_ci/* 27db96d56Sopenharmony_ci * Copyright (c) 2008-2020 Stefan Krah. All rights reserved. 37db96d56Sopenharmony_ci * 47db96d56Sopenharmony_ci * Redistribution and use in source and binary forms, with or without 57db96d56Sopenharmony_ci * modification, are permitted provided that the following conditions 67db96d56Sopenharmony_ci * are met: 77db96d56Sopenharmony_ci * 87db96d56Sopenharmony_ci * 1. Redistributions of source code must retain the above copyright 97db96d56Sopenharmony_ci * notice, this list of conditions and the following disclaimer. 107db96d56Sopenharmony_ci * 117db96d56Sopenharmony_ci * 2. Redistributions in binary form must reproduce the above copyright 127db96d56Sopenharmony_ci * notice, this list of conditions and the following disclaimer in the 137db96d56Sopenharmony_ci * documentation and/or other materials provided with the distribution. 147db96d56Sopenharmony_ci * 157db96d56Sopenharmony_ci * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND 167db96d56Sopenharmony_ci * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 177db96d56Sopenharmony_ci * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 187db96d56Sopenharmony_ci * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 197db96d56Sopenharmony_ci * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 207db96d56Sopenharmony_ci * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 217db96d56Sopenharmony_ci * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 227db96d56Sopenharmony_ci * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 237db96d56Sopenharmony_ci * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 247db96d56Sopenharmony_ci * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 257db96d56Sopenharmony_ci * SUCH DAMAGE. 267db96d56Sopenharmony_ci */ 277db96d56Sopenharmony_ci 287db96d56Sopenharmony_ci 297db96d56Sopenharmony_ci#include "mpdecimal.h" 307db96d56Sopenharmony_ci 317db96d56Sopenharmony_ci#include <assert.h> 327db96d56Sopenharmony_ci 337db96d56Sopenharmony_ci#include "constants.h" 347db96d56Sopenharmony_ci#include "fourstep.h" 357db96d56Sopenharmony_ci#include "numbertheory.h" 367db96d56Sopenharmony_ci#include "sixstep.h" 377db96d56Sopenharmony_ci#include "umodarith.h" 387db96d56Sopenharmony_ci 397db96d56Sopenharmony_ci 407db96d56Sopenharmony_ci/* Bignum: Cache efficient Matrix Fourier Transform for arrays of the 417db96d56Sopenharmony_ci form 3 * 2**n (See literature/matrix-transform.txt). */ 427db96d56Sopenharmony_ci 437db96d56Sopenharmony_ci 447db96d56Sopenharmony_ci#ifndef PPRO 457db96d56Sopenharmony_cistatic inline void 467db96d56Sopenharmony_cistd_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, 477db96d56Sopenharmony_ci mpd_uint_t w3table[3], mpd_uint_t umod) 487db96d56Sopenharmony_ci{ 497db96d56Sopenharmony_ci mpd_uint_t r1, r2; 507db96d56Sopenharmony_ci mpd_uint_t w; 517db96d56Sopenharmony_ci mpd_uint_t s, tmp; 527db96d56Sopenharmony_ci 537db96d56Sopenharmony_ci 547db96d56Sopenharmony_ci /* k = 0 -> w = 1 */ 557db96d56Sopenharmony_ci s = *x1; 567db96d56Sopenharmony_ci s = addmod(s, *x2, umod); 577db96d56Sopenharmony_ci s = addmod(s, *x3, umod); 587db96d56Sopenharmony_ci 597db96d56Sopenharmony_ci r1 = s; 607db96d56Sopenharmony_ci 617db96d56Sopenharmony_ci /* k = 1 */ 627db96d56Sopenharmony_ci s = *x1; 637db96d56Sopenharmony_ci 647db96d56Sopenharmony_ci w = w3table[1]; 657db96d56Sopenharmony_ci tmp = MULMOD(*x2, w); 667db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 677db96d56Sopenharmony_ci 687db96d56Sopenharmony_ci w = w3table[2]; 697db96d56Sopenharmony_ci tmp = MULMOD(*x3, w); 707db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 717db96d56Sopenharmony_ci 727db96d56Sopenharmony_ci r2 = s; 737db96d56Sopenharmony_ci 747db96d56Sopenharmony_ci /* k = 2 */ 757db96d56Sopenharmony_ci s = *x1; 767db96d56Sopenharmony_ci 777db96d56Sopenharmony_ci w = w3table[2]; 787db96d56Sopenharmony_ci tmp = MULMOD(*x2, w); 797db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 807db96d56Sopenharmony_ci 817db96d56Sopenharmony_ci w = w3table[1]; 827db96d56Sopenharmony_ci tmp = MULMOD(*x3, w); 837db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 847db96d56Sopenharmony_ci 857db96d56Sopenharmony_ci *x3 = s; 867db96d56Sopenharmony_ci *x2 = r2; 877db96d56Sopenharmony_ci *x1 = r1; 887db96d56Sopenharmony_ci} 897db96d56Sopenharmony_ci#else /* PPRO */ 907db96d56Sopenharmony_cistatic inline void 917db96d56Sopenharmony_cippro_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, mpd_uint_t w3table[3], 927db96d56Sopenharmony_ci mpd_uint_t umod, double *dmod, uint32_t dinvmod[3]) 937db96d56Sopenharmony_ci{ 947db96d56Sopenharmony_ci mpd_uint_t r1, r2; 957db96d56Sopenharmony_ci mpd_uint_t w; 967db96d56Sopenharmony_ci mpd_uint_t s, tmp; 977db96d56Sopenharmony_ci 987db96d56Sopenharmony_ci 997db96d56Sopenharmony_ci /* k = 0 -> w = 1 */ 1007db96d56Sopenharmony_ci s = *x1; 1017db96d56Sopenharmony_ci s = addmod(s, *x2, umod); 1027db96d56Sopenharmony_ci s = addmod(s, *x3, umod); 1037db96d56Sopenharmony_ci 1047db96d56Sopenharmony_ci r1 = s; 1057db96d56Sopenharmony_ci 1067db96d56Sopenharmony_ci /* k = 1 */ 1077db96d56Sopenharmony_ci s = *x1; 1087db96d56Sopenharmony_ci 1097db96d56Sopenharmony_ci w = w3table[1]; 1107db96d56Sopenharmony_ci tmp = ppro_mulmod(*x2, w, dmod, dinvmod); 1117db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 1127db96d56Sopenharmony_ci 1137db96d56Sopenharmony_ci w = w3table[2]; 1147db96d56Sopenharmony_ci tmp = ppro_mulmod(*x3, w, dmod, dinvmod); 1157db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 1167db96d56Sopenharmony_ci 1177db96d56Sopenharmony_ci r2 = s; 1187db96d56Sopenharmony_ci 1197db96d56Sopenharmony_ci /* k = 2 */ 1207db96d56Sopenharmony_ci s = *x1; 1217db96d56Sopenharmony_ci 1227db96d56Sopenharmony_ci w = w3table[2]; 1237db96d56Sopenharmony_ci tmp = ppro_mulmod(*x2, w, dmod, dinvmod); 1247db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 1257db96d56Sopenharmony_ci 1267db96d56Sopenharmony_ci w = w3table[1]; 1277db96d56Sopenharmony_ci tmp = ppro_mulmod(*x3, w, dmod, dinvmod); 1287db96d56Sopenharmony_ci s = addmod(s, tmp, umod); 1297db96d56Sopenharmony_ci 1307db96d56Sopenharmony_ci *x3 = s; 1317db96d56Sopenharmony_ci *x2 = r2; 1327db96d56Sopenharmony_ci *x1 = r1; 1337db96d56Sopenharmony_ci} 1347db96d56Sopenharmony_ci#endif 1357db96d56Sopenharmony_ci 1367db96d56Sopenharmony_ci 1377db96d56Sopenharmony_ci/* forward transform, sign = -1; transform length = 3 * 2**n */ 1387db96d56Sopenharmony_ciint 1397db96d56Sopenharmony_cifour_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum) 1407db96d56Sopenharmony_ci{ 1417db96d56Sopenharmony_ci mpd_size_t R = 3; /* number of rows */ 1427db96d56Sopenharmony_ci mpd_size_t C = n / 3; /* number of columns */ 1437db96d56Sopenharmony_ci mpd_uint_t w3table[3]; 1447db96d56Sopenharmony_ci mpd_uint_t kernel, w0, w1, wstep; 1457db96d56Sopenharmony_ci mpd_uint_t *s, *p0, *p1, *p2; 1467db96d56Sopenharmony_ci mpd_uint_t umod; 1477db96d56Sopenharmony_ci#ifdef PPRO 1487db96d56Sopenharmony_ci double dmod; 1497db96d56Sopenharmony_ci uint32_t dinvmod[3]; 1507db96d56Sopenharmony_ci#endif 1517db96d56Sopenharmony_ci mpd_size_t i, k; 1527db96d56Sopenharmony_ci 1537db96d56Sopenharmony_ci 1547db96d56Sopenharmony_ci assert(n >= 48); 1557db96d56Sopenharmony_ci assert(n <= 3*MPD_MAXTRANSFORM_2N); 1567db96d56Sopenharmony_ci 1577db96d56Sopenharmony_ci 1587db96d56Sopenharmony_ci /* Length R transform on the columns. */ 1597db96d56Sopenharmony_ci SETMODULUS(modnum); 1607db96d56Sopenharmony_ci _mpd_init_w3table(w3table, -1, modnum); 1617db96d56Sopenharmony_ci for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) { 1627db96d56Sopenharmony_ci 1637db96d56Sopenharmony_ci SIZE3_NTT(p0, p1, p2, w3table); 1647db96d56Sopenharmony_ci } 1657db96d56Sopenharmony_ci 1667db96d56Sopenharmony_ci /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */ 1677db96d56Sopenharmony_ci kernel = _mpd_getkernel(n, -1, modnum); 1687db96d56Sopenharmony_ci for (i = 1; i < R; i++) { 1697db96d56Sopenharmony_ci w0 = 1; /* r**(i*0): initial value for k=0 */ 1707db96d56Sopenharmony_ci w1 = POWMOD(kernel, i); /* r**(i*1): initial value for k=1 */ 1717db96d56Sopenharmony_ci wstep = MULMOD(w1, w1); /* r**(2*i) */ 1727db96d56Sopenharmony_ci for (k = 0; k < C-1; k += 2) { 1737db96d56Sopenharmony_ci mpd_uint_t x0 = a[i*C+k]; 1747db96d56Sopenharmony_ci mpd_uint_t x1 = a[i*C+k+1]; 1757db96d56Sopenharmony_ci MULMOD2(&x0, w0, &x1, w1); 1767db96d56Sopenharmony_ci MULMOD2C(&w0, &w1, wstep); /* r**(i*(k+2)) = r**(i*k) * r**(2*i) */ 1777db96d56Sopenharmony_ci a[i*C+k] = x0; 1787db96d56Sopenharmony_ci a[i*C+k+1] = x1; 1797db96d56Sopenharmony_ci } 1807db96d56Sopenharmony_ci } 1817db96d56Sopenharmony_ci 1827db96d56Sopenharmony_ci /* Length C transform on the rows. */ 1837db96d56Sopenharmony_ci for (s = a; s < a+n; s += C) { 1847db96d56Sopenharmony_ci if (!six_step_fnt(s, C, modnum)) { 1857db96d56Sopenharmony_ci return 0; 1867db96d56Sopenharmony_ci } 1877db96d56Sopenharmony_ci } 1887db96d56Sopenharmony_ci 1897db96d56Sopenharmony_ci#if 0 1907db96d56Sopenharmony_ci /* An unordered transform is sufficient for convolution. */ 1917db96d56Sopenharmony_ci /* Transpose the matrix. */ 1927db96d56Sopenharmony_ci #include "transpose.h" 1937db96d56Sopenharmony_ci transpose_3xpow2(a, R, C); 1947db96d56Sopenharmony_ci#endif 1957db96d56Sopenharmony_ci 1967db96d56Sopenharmony_ci return 1; 1977db96d56Sopenharmony_ci} 1987db96d56Sopenharmony_ci 1997db96d56Sopenharmony_ci/* backward transform, sign = 1; transform length = 3 * 2**n */ 2007db96d56Sopenharmony_ciint 2017db96d56Sopenharmony_ciinv_four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum) 2027db96d56Sopenharmony_ci{ 2037db96d56Sopenharmony_ci mpd_size_t R = 3; /* number of rows */ 2047db96d56Sopenharmony_ci mpd_size_t C = n / 3; /* number of columns */ 2057db96d56Sopenharmony_ci mpd_uint_t w3table[3]; 2067db96d56Sopenharmony_ci mpd_uint_t kernel, w0, w1, wstep; 2077db96d56Sopenharmony_ci mpd_uint_t *s, *p0, *p1, *p2; 2087db96d56Sopenharmony_ci mpd_uint_t umod; 2097db96d56Sopenharmony_ci#ifdef PPRO 2107db96d56Sopenharmony_ci double dmod; 2117db96d56Sopenharmony_ci uint32_t dinvmod[3]; 2127db96d56Sopenharmony_ci#endif 2137db96d56Sopenharmony_ci mpd_size_t i, k; 2147db96d56Sopenharmony_ci 2157db96d56Sopenharmony_ci 2167db96d56Sopenharmony_ci assert(n >= 48); 2177db96d56Sopenharmony_ci assert(n <= 3*MPD_MAXTRANSFORM_2N); 2187db96d56Sopenharmony_ci 2197db96d56Sopenharmony_ci 2207db96d56Sopenharmony_ci#if 0 2217db96d56Sopenharmony_ci /* An unordered transform is sufficient for convolution. */ 2227db96d56Sopenharmony_ci /* Transpose the matrix, producing an R*C matrix. */ 2237db96d56Sopenharmony_ci #include "transpose.h" 2247db96d56Sopenharmony_ci transpose_3xpow2(a, C, R); 2257db96d56Sopenharmony_ci#endif 2267db96d56Sopenharmony_ci 2277db96d56Sopenharmony_ci /* Length C transform on the rows. */ 2287db96d56Sopenharmony_ci for (s = a; s < a+n; s += C) { 2297db96d56Sopenharmony_ci if (!inv_six_step_fnt(s, C, modnum)) { 2307db96d56Sopenharmony_ci return 0; 2317db96d56Sopenharmony_ci } 2327db96d56Sopenharmony_ci } 2337db96d56Sopenharmony_ci 2347db96d56Sopenharmony_ci /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */ 2357db96d56Sopenharmony_ci SETMODULUS(modnum); 2367db96d56Sopenharmony_ci kernel = _mpd_getkernel(n, 1, modnum); 2377db96d56Sopenharmony_ci for (i = 1; i < R; i++) { 2387db96d56Sopenharmony_ci w0 = 1; 2397db96d56Sopenharmony_ci w1 = POWMOD(kernel, i); 2407db96d56Sopenharmony_ci wstep = MULMOD(w1, w1); 2417db96d56Sopenharmony_ci for (k = 0; k < C; k += 2) { 2427db96d56Sopenharmony_ci mpd_uint_t x0 = a[i*C+k]; 2437db96d56Sopenharmony_ci mpd_uint_t x1 = a[i*C+k+1]; 2447db96d56Sopenharmony_ci MULMOD2(&x0, w0, &x1, w1); 2457db96d56Sopenharmony_ci MULMOD2C(&w0, &w1, wstep); 2467db96d56Sopenharmony_ci a[i*C+k] = x0; 2477db96d56Sopenharmony_ci a[i*C+k+1] = x1; 2487db96d56Sopenharmony_ci } 2497db96d56Sopenharmony_ci } 2507db96d56Sopenharmony_ci 2517db96d56Sopenharmony_ci /* Length R transform on the columns. */ 2527db96d56Sopenharmony_ci _mpd_init_w3table(w3table, 1, modnum); 2537db96d56Sopenharmony_ci for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) { 2547db96d56Sopenharmony_ci 2557db96d56Sopenharmony_ci SIZE3_NTT(p0, p1, p2, w3table); 2567db96d56Sopenharmony_ci } 2577db96d56Sopenharmony_ci 2587db96d56Sopenharmony_ci return 1; 2597db96d56Sopenharmony_ci} 260