#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
 * FFT transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"

#if HAVE_VSX

/* Compute an in-place FFT of the 1 << s->nbits complex values at z. */
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);

#define byte_2complex  (2*sizeof(FFTComplex))
#define byte_4complex  (4*sizeof(FFTComplex))
#define byte_6complex  (6*sizeof(FFTComplex))
#define byte_8complex  (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))

/* One combine pass of the split-radix FFT over interleaved (re,im) complex
 * data, applying the twiddle factors in wre[]/wim[]. */
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f vz0, vzo1, vzo2, vzo3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
    vec_f y0, y1, y2, y3;
    vec_f y4, y5, y8, y9;
    vec_f y10, y13, y14, y15;
    vec_f y16, y17, y18, y19;
    vec_f y20, y21, y22, y23;
    vec_f wr1, wi1, wr0, wi0;
    vec_f wr2, wi2, wr3, wi3;
    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);
    vzo2 = vec_ld(i2, &(out[0]));      // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
    vzo2plus1 = vec_ld(i2+16, &(out[0]));
    vzo3 = vec_ld(i3, &(out[0]));      // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
    vzo3plus1 = vec_ld(i3+16, &(out[0]));
    vz0 = vec_ld(0, &(out[0]));        // z0.r  z0.i  z1.r  z1.i
    vz0plus1 = vec_ld(16, &(out[0]));
    vzo1 = vec_ld(i1, &(out[0]));      // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
    vzo1plus1 = vec_ld(i1+16, &(out[0]));

    x0 = vec_add(vzo2, vzo3);
    x1 = vec_sub(vzo2, vzo3);
    y0 = vec_add(vzo2plus1, vzo3plus1);
    y1 = vec_sub(vzo2plus1, vzo3plus1);

    wr1 = vec_splats(wre[1]);
    wi1 = vec_splats(wim[-1]);
    wi2 = vec_splats(wim[-2]);
    wi3 = vec_splats(wim[-3]);
    wr2 = vec_splats(wre[2]);
    wr3 = vec_splats(wre[3]);

    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));

    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));

    ymulwi2 = vec_mul(y4, wi2);
    ymulwi3 = vec_mul(y5, wi3);
    x4 = vec_mul(x2, wr1);
    x5 = vec_mul(x3, wi1);
    y8 = vec_madd(y2, wr2, ymulwi2);
    y9 = vec_msub(y2, wr2, ymulwi2);
    x6 = vec_add(x4, x5);
    x7 = vec_sub(x4, x5);
    y13 = vec_madd(y3, wr3, ymulwi3);
    y14 = vec_msub(y3, wr3, ymulwi3);

    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));

    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

    x11 = vec_add(vz0, x9);
    x12 = vec_sub(vz0, x9);
    x13 = vec_add(vzo1, x10);
    x14 = vec_sub(vzo1, x10);

    y18 = vec_add(vz0plus1, y16);
    y19 = vec_sub(vz0plus1, y16);
    y20 = vec_add(vzo1plus1, y17);
    y21 = vec_sub(vzo1plus1, y17);

    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

    vec_st(x11, 0, &(out[0]));
    vec_st(y18, 16, &(out[0]));
    vec_st(x15, i1, &(out[0]));
    vec_st(y22, i1+16, &(out[0]));
    vec_st(x12, i2, &(out[0]));
    vec_st(y19, i2+16, &(out[0]));
    vec_st(x16, i3, &(out[0]));
    vec_st(y23, i3+16, &(out[0]));

    do {
        out += 8;
        wre += 4;
        wim -= 4;
        wr0 = vec_splats(wre[0]);
        wr1 = vec_splats(wre[1]);
        wi0 = vec_splats(wim[0]);
        wi1 = vec_splats(wim[-1]);

        wr2 = vec_splats(wre[2]);
        wr3 = vec_splats(wre[3]);
        wi2 = vec_splats(wim[-2]);
        wi3 = vec_splats(wim[-3]);

        vzo2 = vec_ld(i2, &(out[0]));      // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
        vzo2plus1 = vec_ld(i2+16, &(out[0]));
        vzo3 = vec_ld(i3, &(out[0]));      // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
        vzo3plus1 = vec_ld(i3+16, &(out[0]));
        vz0 = vec_ld(0, &(out[0]));        // z0.r  z0.i  z1.r  z1.i
        vz0plus1 = vec_ld(16, &(out[0]));
        vzo1 = vec_ld(i1, &(out[0]));      // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
        vzo1plus1 = vec_ld(i1+16, &(out[0]));

        x0 = vec_add(vzo2, vzo3);
        x1 = vec_sub(vzo2, vzo3);

        y0 = vec_add(vzo2plus1, vzo3plus1);
        y1 = vec_sub(vzo2plus1, vzo3plus1);

        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));

        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
        xmulwi0 = vec_mul(x4, wi0);
        xmulwi1 = vec_mul(x5, wi1);

        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));

        x8 = vec_madd(x2, wr0, xmulwi0);
        x9 = vec_msub(x2, wr0, xmulwi0);
        ymulwi2 = vec_mul(y4, wi2);
        ymulwi3 = vec_mul(y5, wi3);

        x13 = vec_madd(x3, wr1, xmulwi1);
        x14 = vec_msub(x3, wr1, xmulwi1);

        y8 = vec_madd(y2, wr2, ymulwi2);
        y9 = vec_msub(y2, wr2, ymulwi2);
        y13 = vec_madd(y3, wr3, ymulwi3);
        y14 = vec_msub(y3, wr3, ymulwi3);

        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));

        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));

        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

        x18 = vec_add(vz0, x16);
        x19 = vec_sub(vz0, x16);
        x20 = vec_add(vzo1, x17);
        x21 = vec_sub(vzo1, x17);

        y18 = vec_add(vz0plus1, y16);
        y19 = vec_sub(vz0plus1, y16);
        y20 = vec_add(vzo1plus1, y17);
        y21 = vec_sub(vzo1plus1, y17);

        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));

        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

        vec_st(x18, 0, &(out[0]));
        vec_st(y18, 16, &(out[0]));
        vec_st(x22, i1, &(out[0]));
        vec_st(y22, i1+16, &(out[0]));
        vec_st(x19, i2, &(out[0]));
        vec_st(y19, i2+16, &(out[0]));
        vec_st(x23, i3, &(out[0]));
        vec_st(y23, i3+16, &(out[0]));
    } while (n-=2);
}

/* 2-point transform: a single scalar butterfly. */
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    FFTSample r1, i1;

    r1 = z[0].re - z[1].re;
    z[0].re += z[1].re;
    z[1].re = r1;

    i1 = z[0].im - z[1].im;
    z[0].im += z[1].im;
    z[1].im = i1;
}

/* 4-point transform on interleaved (re,im) data. */
inline static void fft4_vsx_interleave(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out = (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);
    vec_st(a, 0, &(out[0]));
    vec_st(b, byte_2complex, &(out[0]));
}

/* 8-point transform on interleaved (re,im) data. */
inline static void fft8_vsx_interleave(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34;

    float* out = (float*)z;
    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));

    x4 = vec_add(x0, x1);
    x5 = vec_sub(x0, x1);
    x6 = vec_add(x2, x3);
    x7 = vec_sub(x2, x3);

    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));

    x12 = vec_add(x8, x9);
    x13 = vec_sub(x8, x9);
    x14 = vec_add(x10, x11);
    x15 = vec_sub(x10, x11);
    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i

    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
    x24 = vec_add(x22, x23);
    x25 = vec_sub(x22, x23);
    x26 = vec_mul(vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);

    x27 = vec_add(x21, x26); // z1.r  z7.r  z1.i  z3.i
    x28 = vec_sub(x21, x26); // z5.r  z3.r  z5.i  z7.i

    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r  z0.i  z1.r  z1.i
    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r  z2.i  z7.r  z3.i
    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2));  // z4.r  z4.i  z5.r  z5.i
    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3));  // z6.r  z6.i  z3.r  z7.i
    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));   // z2.r  z2.i  z3.r  z3.i
    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i

    vec_st(x29, 0, &(out[0]));
    vec_st(x33, byte_2complex, &(out[0]));
    vec_st(x31, byte_4complex, &(out[0]));
    vec_st(x34, byte_6complex, &(out[0]));
}

/* 16-point transform on interleaved (re,im) data. */
inline static void fft16_vsx_interleave(FFTComplex *z)
{
    float* out = (float*)z;
    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34, x35;
    vec_f x36, x37, x38, x39;
    vec_f x40, x41, x42, x43;
    vec_f x44, x45, x46, x47;
    vec_f x48, x49, x50, x51;
    vec_f x52, x53, x54, x55;
    vec_f x56, x57, x58, x59;
    vec_f x60, x61, x62, x63;
    vec_f x64, x65, x66, x67;
    vec_f x68, x69, x70, x71;
    vec_f x72, x73, x74, x75;
    vec_f x76, x77, x78, x79;
    vec_f x80, x81, x82, x83;
    vec_f x84, x85, x86;

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz4 = vec_ld(byte_8complex, &(out[0]));
    vz5 = vec_ld(byte_10complex, &(out[0]));
    vz6 = vec_ld(byte_12complex, &(out[0]));
    vz7 = vec_ld(byte_14complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));

    x8 = vec_add(x0, x1);
    x9 = vec_sub(x0, x1);
    x10 = vec_add(x2, x3);
    x11 = vec_sub(x2, x3);

    x12 = vec_add(x4, x5);
    x13 = vec_sub(x4, x5);
    x14 = vec_add(x6, x7);
    x15 = vec_sub(x6, x7);

    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
    x20 = vec_perm(x12, x14, vcprm(0,1,s0,s1));
    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));

    x24 = vec_add(x16, x17);
    x25 = vec_sub(x16, x17);
    x26 = vec_add(x18, x19);
    x27 = vec_sub(x18, x19);
    x28 = vec_add(x20, x21);
    x29 = vec_sub(x20, x21);
    x30 = vec_add(x22, x23);
    x31 = vec_sub(x22, x23);

    x32 = vec_add(x24, x26);
    x33 = vec_sub(x24, x26);
    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));

    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
    x37 = vec_add(x35, x36);
    x38 = vec_sub(x35, x36);
    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));

    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
    x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
    x42 = vec_add(x40, x41);
    x43 = vec_sub(x40, x41);
    x44 = vec_mul(x42, vc0);
    x45 = vec_mul(x43, vc0);

    x46 = vec_add(x34, x39); // z0.r  z0.i  z4.r  z4.i
    x47 = vec_sub(x34, x39); // z8.r  z8.i  z12.r  z12.i

    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
    x50 = vec_add(x48, x49);
    x51 = vec_sub(x48, x49);
    x52 = vec_mul(x50, vc1);
    x53 = vec_mul(x50, vc2);
    x54 = vec_mul(x51, vc1);
    x55 = vec_mul(x51, vc2);

    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
    x58 = vec_add(x56, x57);
    x59 = vec_sub(x56, x57);

    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
    x62 = vec_add(x52, x61);
    x63 = vec_sub(x52, x61);
    x64 = vec_add(x60, x53);
    x65 = vec_sub(x60, x53);
    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));

    x68 = vec_add(x58, x66); // z1.r  z1.i  z3.r  z3.i
    x69 = vec_sub(x58, x66); // z9.r  z9.i  z11.r  z11.i
    x70 = vec_add(x59, x67); // z5.r  z5.i  z15.r  z15.i
    x71 = vec_sub(x59, x67); // z13.r  z13.i  z7.r  z7.i

    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
    x73 = vec_add(x25, x72);
    x74 = vec_sub(x25, x72);
    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
    x77 = vec_add(x75, x76); // z2.r  z2.i  z6.r  z6.i
    x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i

    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
    vec_st(x79, 0, &(out[0]));
    vec_st(x80, byte_2complex, &(out[0]));
    vec_st(x81, byte_4complex, &(out[0]));
    vec_st(x82, byte_6complex, &(out[0]));
    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
    vec_st(x83, byte_8complex, &(out[0]));
    vec_st(x84, byte_10complex, &(out[0]));
    vec_st(x85, byte_12complex, &(out[0]));
    vec_st(x86, byte_14complex, &(out[0]));
}

/* 4-point transform, non-interleaved variant. */
inline static void fft4_vsx(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out = (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,s0,1,s1));
    d = vec_perm(a, b, vcprm(2,s3,3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s2,s3));

    vec_st(c, 0, &(out[0]));
    vec_st(d, byte_2complex, &(out[0]));
    return;
}

/* 8-point transform, non-interleaved variant. */
inline static void fft8_vsx(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7, vz8;

    float* out = (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);
    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz8, vc2, vz3);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz4, vz6);
    vz3 = vec_sub(vz5, vz7);

    vz0 = vec_add(vz4, vz6);
    vz1 = vec_add(vz5, vz7);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    return;
}

/* 16-point transform, non-interleaved variant. */
inline static void fft16_vsx(FFTComplex *z)
{
    float* out = (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};

    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f vz8, vz9, vz10, vz11;
    vec_f vz12, vz13;

    vz0 = vec_ld(byte_8complex, &(out[0]));
    vz1 = vec_ld(byte_10complex, &(out[0]));
    vz2 = vec_ld(byte_12complex, &(out[0]));
    vz3 = vec_ld(byte_14complex, &(out[0]));

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));

    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);
    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz12, vc2, vz3);
    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);
    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);

    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz8, vz10);
    vz3 = vec_sub(vz9, vz11);
    vz0 = vec_add(vz8, vz10);
    vz1 = vec_add(vz9, vz11);

    vz8 = vec_madd(vz4, vc3, vc0);
    vz9 = vec_madd(vz5, vc3, vc0);
    vz10 = vec_madd(vz6, vc3, vc0);
    vz11 = vec_madd(vz7, vc3, vc0);

    vz8 = vec_madd(vz5, vc4, vz8);
    vz9 = vec_madd(vz4, vc5, vz9);
    vz10 = vec_madd(vz7, vc5, vz10);
    vz11 = vec_madd(vz6, vc4, vz11);

    vz12 = vec_sub(vz10, vz8);
    vz10 = vec_add(vz10, vz8);

    vz13 = vec_sub(vz9, vz11);
    vz11 = vec_add(vz9, vz11);

    vz4 = vec_sub(vz0, vz10);
    vz0 = vec_add(vz0, vz10);

    vz7 = vec_sub(vz3, vz12);
    vz3 = vec_add(vz3, vz12);

    vz5 = vec_sub(vz1, vz11);
    vz1 = vec_add(vz1, vz11);

    vz6 = vec_sub(vz2, vz13);
    vz2 = vec_add(vz2, vz13);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    vec_st(vz4, byte_8complex, &(out[0]));
    vec_st(vz5, byte_10complex, &(out[0]));
    vec_st(vz6, byte_12complex, &(out[0]));
    vec_st(vz7, byte_14complex, &(out[0]));
    return;
}

/* Combine pass for the non-interleaved variant; the r/i comments on the loads
 * and stores below document which vectors hold real and imaginary parts. */
inline static void pass_vsx(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f v0, v1, v2, v3;
    vec_f v4, v5, v6, v7;
    vec_f v8, v9, v10, v11;
    vec_f v12, v13;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);

    v8 = vec_ld(0, &(wre[0]));
    v10 = vec_ld(0, &(wim[0]));
    v9 = vec_ld(0, &(wim[-4]));
    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

    v4 = vec_ld(i2, &(out[0]));
    v5 = vec_ld(i2+16, &(out[0]));
    v6 = vec_ld(i3, &(out[0]));
    v7 = vec_ld(i3+16, &(out[0]));
    v10 = vec_mul(v4, v8); // r2*wre
    v11 = vec_mul(v5, v8); // i2*wre
    v12 = vec_mul(v6, v8); // r3*wre
    v13 = vec_mul(v7, v8); // i3*wre

    v0 = vec_ld(0, &(out[0]));     // r0
    v3 = vec_ld(i1+16, &(out[0])); // i1
    v10 = vec_madd(v5, v9, v10);   // r2*wim
    v11 = vec_nmsub(v4, v9, v11);  // i2*wim
    v12 = vec_nmsub(v7, v9, v12);  // r3*wim
    v13 = vec_madd(v6, v9, v13);   // i3*wim

    v1 = vec_ld(16, &(out[0])); // i0
    v2 = vec_ld(i1, &(out[0])); // r1
    v8 = vec_sub(v12, v10);
    v12 = vec_add(v12, v10);
    v9 = vec_sub(v11, v13);
    v13 = vec_add(v11, v13);
    v4 = vec_sub(v0, v12);
    v0 = vec_add(v0, v12);
    v7 = vec_sub(v3, v8);
    v3 = vec_add(v3, v8);

    vec_st(v0, 0, &(out[0]));     // r0
    vec_st(v3, i1+16, &(out[0])); // i1
    vec_st(v4, i2, &(out[0]));    // r2
    vec_st(v7, i3+16, &(out[0])); // i3

    v5 = vec_sub(v1, v13);
    v1 = vec_add(v1, v13);
    v6 = vec_sub(v2, v9);
    v2 = vec_add(v2, v9);

    vec_st(v1, 16, &(out[0]));    // i0
    vec_st(v2, i1, &(out[0]));    // r1
    vec_st(v5, i2+16, &(out[0])); // i2
    vec_st(v6, i3, &(out[0]));    // r3

    do {
        out += 8;
        wre += 4;
        wim -= 4;

        v8 = vec_ld(0, &(wre[0]));
        v10 = vec_ld(0, &(wim[0]));
        v9 = vec_ld(0, &(wim[-4]));
        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

        v4 = vec_ld(i2, &(out[0]));    // r2
        v5 = vec_ld(i2+16, &(out[0])); // i2
        v6 = vec_ld(i3, &(out[0]));    // r3
        v7 = vec_ld(i3+16, &(out[0])); // i3
        v10 = vec_mul(v4, v8); // r2*wre
        v11 = vec_mul(v5, v8); // i2*wre
        v12 = vec_mul(v6, v8); // r3*wre
        v13 = vec_mul(v7, v8); // i3*wre

        v0 = vec_ld(0, &(out[0]));     // r0
        v3 = vec_ld(i1+16, &(out[0])); // i1
        v10 = vec_madd(v5, v9, v10);   // r2*wim
        v11 = vec_nmsub(v4, v9, v11);  // i2*wim
        v12 = vec_nmsub(v7, v9, v12);  // r3*wim
        v13 = vec_madd(v6, v9, v13);   // i3*wim

        v1 = vec_ld(16, &(out[0])); // i0
        v2 = vec_ld(i1, &(out[0])); // r1
        v8 = vec_sub(v12, v10);
        v12 = vec_add(v12, v10);
        v9 = vec_sub(v11, v13);
        v13 = vec_add(v11, v13);
        v4 = vec_sub(v0, v12);
        v0 = vec_add(v0, v12);
        v7 = vec_sub(v3, v8);
        v3 = vec_add(v3, v8);

        vec_st(v0, 0, &(out[0]));     // r0
        vec_st(v3, i1+16, &(out[0])); // i1
        vec_st(v4, i2, &(out[0]));    // r2
        vec_st(v7, i3+16, &(out[0])); // i3

        v5 = vec_sub(v1, v13);
        v1 = vec_add(v1, v13);
        v6 = vec_sub(v2, v9);
        v2 = vec_add(v2, v9);

        vec_st(v1, 16, &(out[0]));    // i0
        vec_st(v2, i1, &(out[0]));    // r1
        vec_st(v5, i2+16, &(out[0])); // i2
        vec_st(v6, i3, &(out[0]));    // r3
    } while (n-=2);
}

#endif

#endif /* AVCODEC_PPC_FFT_VSX_H */