1/* 2 * Copyright (c) 2008-2020 Stefan Krah. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 29#include "mpdecimal.h" 30 31#include <assert.h> 32#include <limits.h> 33#include <stdio.h> 34#include <stdlib.h> 35#include <string.h> 36 37#include "bits.h" 38#include "constants.h" 39#include "transpose.h" 40#include "typearith.h" 41 42 43#define BUFSIZE 4096 44#define SIDE 128 45 46 47/* Bignum: The transpose functions are used for very large transforms 48 in sixstep.c and fourstep.c. */ 49 50 51/* Definition of the matrix transpose */ 52void 53std_trans(mpd_uint_t dest[], mpd_uint_t src[], mpd_size_t rows, mpd_size_t cols) 54{ 55 mpd_size_t idest, isrc; 56 mpd_size_t r, c; 57 58 for (r = 0; r < rows; r++) { 59 isrc = r * cols; 60 idest = r; 61 for (c = 0; c < cols; c++) { 62 dest[idest] = src[isrc]; 63 isrc += 1; 64 idest += rows; 65 } 66 } 67} 68 69/* 70 * Swap half-rows of 2^n * (2*2^n) matrix. 71 * FORWARD_CYCLE: even/odd permutation of the halfrows. 72 * BACKWARD_CYCLE: reverse the even/odd permutation. 73 */ 74static int 75swap_halfrows_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir) 76{ 77 mpd_uint_t buf1[BUFSIZE]; 78 mpd_uint_t buf2[BUFSIZE]; 79 mpd_uint_t *readbuf, *writebuf, *hp; 80 mpd_size_t *done, dbits; 81 mpd_size_t b = BUFSIZE, stride; 82 mpd_size_t hn, hmax; /* halfrow number */ 83 mpd_size_t m, r=0; 84 mpd_size_t offset; 85 mpd_size_t next; 86 87 88 assert(cols == mul_size_t(2, rows)); 89 90 if (dir == FORWARD_CYCLE) { 91 r = rows; 92 } 93 else if (dir == BACKWARD_CYCLE) { 94 r = 2; 95 } 96 else { 97 abort(); /* GCOV_NOT_REACHED */ 98 } 99 100 m = cols - 1; 101 hmax = rows; /* cycles start at odd halfrows */ 102 dbits = 8 * sizeof *done; 103 if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) { 104 return 0; 105 } 106 107 for (hn = 1; hn <= hmax; hn += 2) { 108 109 if (done[hn/dbits] & mpd_bits[hn%dbits]) { 110 continue; 111 } 112 113 readbuf = buf1; writebuf = buf2; 114 115 for (offset = 0; offset < cols/2; offset += b) { 116 117 stride = (offset + b < cols/2) ? b : cols/2-offset; 118 119 hp = matrix + hn*cols/2; 120 memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); 121 pointerswap(&readbuf, &writebuf); 122 123 next = mulmod_size_t(hn, r, m); 124 hp = matrix + next*cols/2; 125 126 while (next != hn) { 127 128 memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); 129 memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); 130 pointerswap(&readbuf, &writebuf); 131 132 done[next/dbits] |= mpd_bits[next%dbits]; 133 134 next = mulmod_size_t(next, r, m); 135 hp = matrix + next*cols/2; 136 137 } 138 139 memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); 140 141 done[hn/dbits] |= mpd_bits[hn%dbits]; 142 } 143 } 144 145 mpd_free(done); 146 return 1; 147} 148 149/* In-place transpose of a square matrix */ 150static inline void 151squaretrans(mpd_uint_t *buf, mpd_size_t cols) 152{ 153 mpd_uint_t tmp; 154 mpd_size_t idest, isrc; 155 mpd_size_t r, c; 156 157 for (r = 0; r < cols; r++) { 158 c = r+1; 159 isrc = r*cols + c; 160 idest = c*cols + r; 161 for (c = r+1; c < cols; c++) { 162 tmp = buf[isrc]; 163 buf[isrc] = buf[idest]; 164 buf[idest] = tmp; 165 isrc += 1; 166 idest += cols; 167 } 168 } 169} 170 171/* 172 * Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into 173 * square blocks with side length 'SIDE'. First, the blocks are transposed, 174 * then a square transposition is done on each individual block. 175 */ 176static void 177squaretrans_pow2(mpd_uint_t *matrix, mpd_size_t size) 178{ 179 mpd_uint_t buf1[SIDE*SIDE]; 180 mpd_uint_t buf2[SIDE*SIDE]; 181 mpd_uint_t *to, *from; 182 mpd_size_t b = size; 183 mpd_size_t r, c; 184 mpd_size_t i; 185 186 while (b > SIDE) b >>= 1; 187 188 for (r = 0; r < size; r += b) { 189 190 for (c = r; c < size; c += b) { 191 192 from = matrix + r*size + c; 193 to = buf1; 194 for (i = 0; i < b; i++) { 195 memcpy(to, from, b*(sizeof *to)); 196 from += size; 197 to += b; 198 } 199 squaretrans(buf1, b); 200 201 if (r == c) { 202 to = matrix + r*size + c; 203 from = buf1; 204 for (i = 0; i < b; i++) { 205 memcpy(to, from, b*(sizeof *to)); 206 from += b; 207 to += size; 208 } 209 continue; 210 } 211 else { 212 from = matrix + c*size + r; 213 to = buf2; 214 for (i = 0; i < b; i++) { 215 memcpy(to, from, b*(sizeof *to)); 216 from += size; 217 to += b; 218 } 219 squaretrans(buf2, b); 220 221 to = matrix + c*size + r; 222 from = buf1; 223 for (i = 0; i < b; i++) { 224 memcpy(to, from, b*(sizeof *to)); 225 from += b; 226 to += size; 227 } 228 229 to = matrix + r*size + c; 230 from = buf2; 231 for (i = 0; i < b; i++) { 232 memcpy(to, from, b*(sizeof *to)); 233 from += b; 234 to += size; 235 } 236 } 237 } 238 } 239 240} 241 242/* 243 * In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n) 244 * or a (2*2^n) x 2^n matrix. 245 */ 246int 247transpose_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols) 248{ 249 mpd_size_t size = mul_size_t(rows, cols); 250 251 assert(ispower2(rows)); 252 assert(ispower2(cols)); 253 254 if (cols == rows) { 255 squaretrans_pow2(matrix, rows); 256 } 257 else if (cols == mul_size_t(2, rows)) { 258 if (!swap_halfrows_pow2(matrix, rows, cols, FORWARD_CYCLE)) { 259 return 0; 260 } 261 squaretrans_pow2(matrix, rows); 262 squaretrans_pow2(matrix+(size/2), rows); 263 } 264 else if (rows == mul_size_t(2, cols)) { 265 squaretrans_pow2(matrix, cols); 266 squaretrans_pow2(matrix+(size/2), cols); 267 if (!swap_halfrows_pow2(matrix, cols, rows, BACKWARD_CYCLE)) { 268 return 0; 269 } 270 } 271 else { 272 abort(); /* GCOV_NOT_REACHED */ 273 } 274 275 return 1; 276} 277