1cabdff1aSopenharmony_ci#ifndef AVCODEC_PPC_FFT_VSX_H 2cabdff1aSopenharmony_ci#define AVCODEC_PPC_FFT_VSX_H 3cabdff1aSopenharmony_ci/* 4cabdff1aSopenharmony_ci * FFT transform, optimized with VSX built-in functions 5cabdff1aSopenharmony_ci * Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * This algorithm (though not any of the implementation details) is 8cabdff1aSopenharmony_ci * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S. 9cabdff1aSopenharmony_ci * 10cabdff1aSopenharmony_ci * This file is part of FFmpeg. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 13cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 14cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 15cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 18cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 19cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20cabdff1aSopenharmony_ci * Lesser General Public License for more details. 21cabdff1aSopenharmony_ci * 22cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 23cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 24cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 25cabdff1aSopenharmony_ci */ 26cabdff1aSopenharmony_ci 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_ci#include "config.h" 29cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 30cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h" 31cabdff1aSopenharmony_ci#include "libavcodec/fft.h" 32cabdff1aSopenharmony_ci#include "libavcodec/fft-internal.h" 33cabdff1aSopenharmony_ci 34cabdff1aSopenharmony_ci#if HAVE_VSX 35cabdff1aSopenharmony_ci 36cabdff1aSopenharmony_civoid ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z); 37cabdff1aSopenharmony_civoid ff_fft_calc_vsx(FFTContext *s, FFTComplex *z); 38cabdff1aSopenharmony_ci 39cabdff1aSopenharmony_ci 40cabdff1aSopenharmony_ci#define byte_2complex (2*sizeof(FFTComplex)) 41cabdff1aSopenharmony_ci#define byte_4complex (4*sizeof(FFTComplex)) 42cabdff1aSopenharmony_ci#define byte_6complex (6*sizeof(FFTComplex)) 43cabdff1aSopenharmony_ci#define byte_8complex (8*sizeof(FFTComplex)) 44cabdff1aSopenharmony_ci#define byte_10complex (10*sizeof(FFTComplex)) 45cabdff1aSopenharmony_ci#define byte_12complex (12*sizeof(FFTComplex)) 46cabdff1aSopenharmony_ci#define byte_14complex (14*sizeof(FFTComplex)) 47cabdff1aSopenharmony_ci 48cabdff1aSopenharmony_ciinline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n) 49cabdff1aSopenharmony_ci{ 50cabdff1aSopenharmony_ci int o1 = n<<1; 51cabdff1aSopenharmony_ci int o2 = n<<2; 52cabdff1aSopenharmony_ci int o3 = o1+o2; 53cabdff1aSopenharmony_ci int i1, i2, i3; 54cabdff1aSopenharmony_ci FFTSample* out = (FFTSample*)z; 55cabdff1aSopenharmony_ci const FFTSample *wim = wre+o1; 56cabdff1aSopenharmony_ci vec_f vz0, vzo1, vzo2, vzo3; 57cabdff1aSopenharmony_ci vec_f x0, x1, x2, x3; 58cabdff1aSopenharmony_ci vec_f x4, x5, x6, x7; 59cabdff1aSopenharmony_ci vec_f x8, x9, x10, x11; 60cabdff1aSopenharmony_ci vec_f x12, x13, x14, x15; 61cabdff1aSopenharmony_ci vec_f x16, x17, x18, x19; 62cabdff1aSopenharmony_ci vec_f x20, x21, x22, x23; 63cabdff1aSopenharmony_ci vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1; 64cabdff1aSopenharmony_ci vec_f y0, y1, y2, y3; 65cabdff1aSopenharmony_ci vec_f y4, y5, y8, y9; 66cabdff1aSopenharmony_ci vec_f y10, y13, y14, y15; 67cabdff1aSopenharmony_ci vec_f y16, y17, y18, y19; 68cabdff1aSopenharmony_ci vec_f y20, y21, y22, y23; 69cabdff1aSopenharmony_ci vec_f wr1, wi1, wr0, wi0; 70cabdff1aSopenharmony_ci vec_f wr2, wi2, wr3, wi3; 71cabdff1aSopenharmony_ci vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3; 72cabdff1aSopenharmony_ci 73cabdff1aSopenharmony_ci n = n-2; 74cabdff1aSopenharmony_ci i1 = o1*sizeof(FFTComplex); 75cabdff1aSopenharmony_ci i2 = o2*sizeof(FFTComplex); 76cabdff1aSopenharmony_ci i3 = o3*sizeof(FFTComplex); 77cabdff1aSopenharmony_ci vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i 78cabdff1aSopenharmony_ci vzo2plus1 = vec_ld(i2+16, &(out[0])); 79cabdff1aSopenharmony_ci vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i 80cabdff1aSopenharmony_ci vzo3plus1 = vec_ld(i3+16, &(out[0])); 81cabdff1aSopenharmony_ci vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i 82cabdff1aSopenharmony_ci vz0plus1 = vec_ld(16, &(out[0])); 83cabdff1aSopenharmony_ci vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i 84cabdff1aSopenharmony_ci vzo1plus1 = vec_ld(i1+16, &(out[0])); 85cabdff1aSopenharmony_ci 86cabdff1aSopenharmony_ci x0 = vec_add(vzo2, vzo3); 87cabdff1aSopenharmony_ci x1 = vec_sub(vzo2, vzo3); 88cabdff1aSopenharmony_ci y0 = vec_add(vzo2plus1, vzo3plus1); 89cabdff1aSopenharmony_ci y1 = vec_sub(vzo2plus1, vzo3plus1); 90cabdff1aSopenharmony_ci 91cabdff1aSopenharmony_ci wr1 = vec_splats(wre[1]); 92cabdff1aSopenharmony_ci wi1 = vec_splats(wim[-1]); 93cabdff1aSopenharmony_ci wi2 = vec_splats(wim[-2]); 94cabdff1aSopenharmony_ci wi3 = vec_splats(wim[-3]); 95cabdff1aSopenharmony_ci wr2 = vec_splats(wre[2]); 96cabdff1aSopenharmony_ci wr3 = vec_splats(wre[3]); 97cabdff1aSopenharmony_ci 98cabdff1aSopenharmony_ci x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3)); 99cabdff1aSopenharmony_ci x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2)); 100cabdff1aSopenharmony_ci 101cabdff1aSopenharmony_ci y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0)); 102cabdff1aSopenharmony_ci y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2)); 103cabdff1aSopenharmony_ci y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1)); 104cabdff1aSopenharmony_ci y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3)); 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci ymulwi2 = vec_mul(y4, wi2); 107cabdff1aSopenharmony_ci ymulwi3 = vec_mul(y5, wi3); 108cabdff1aSopenharmony_ci x4 = vec_mul(x2, wr1); 109cabdff1aSopenharmony_ci x5 = vec_mul(x3, wi1); 110cabdff1aSopenharmony_ci y8 = vec_madd(y2, wr2, ymulwi2); 111cabdff1aSopenharmony_ci y9 = vec_msub(y2, wr2, ymulwi2); 112cabdff1aSopenharmony_ci x6 = vec_add(x4, x5); 113cabdff1aSopenharmony_ci x7 = vec_sub(x4, x5); 114cabdff1aSopenharmony_ci y13 = vec_madd(y3, wr3, ymulwi3); 115cabdff1aSopenharmony_ci y14 = vec_msub(y3, wr3, ymulwi3); 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ci x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3)); 118cabdff1aSopenharmony_ci y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3)); 119cabdff1aSopenharmony_ci y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3)); 120cabdff1aSopenharmony_ci 121cabdff1aSopenharmony_ci x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2)); 122cabdff1aSopenharmony_ci x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1)); 123cabdff1aSopenharmony_ci 124cabdff1aSopenharmony_ci y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2)); 125cabdff1aSopenharmony_ci y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1)); 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci x11 = vec_add(vz0, x9); 128cabdff1aSopenharmony_ci x12 = vec_sub(vz0, x9); 129cabdff1aSopenharmony_ci x13 = vec_add(vzo1, x10); 130cabdff1aSopenharmony_ci x14 = vec_sub(vzo1, x10); 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci y18 = vec_add(vz0plus1, y16); 133cabdff1aSopenharmony_ci y19 = vec_sub(vz0plus1, y16); 134cabdff1aSopenharmony_ci y20 = vec_add(vzo1plus1, y17); 135cabdff1aSopenharmony_ci y21 = vec_sub(vzo1plus1, y17); 136cabdff1aSopenharmony_ci 137cabdff1aSopenharmony_ci x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3)); 138cabdff1aSopenharmony_ci x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3)); 139cabdff1aSopenharmony_ci y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3)); 140cabdff1aSopenharmony_ci y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3)); 141cabdff1aSopenharmony_ci 142cabdff1aSopenharmony_ci 143cabdff1aSopenharmony_ci vec_st(x11, 0, &(out[0])); 144cabdff1aSopenharmony_ci vec_st(y18, 16, &(out[0])); 145cabdff1aSopenharmony_ci vec_st(x15, i1, &(out[0])); 146cabdff1aSopenharmony_ci vec_st(y22, i1+16, &(out[0])); 147cabdff1aSopenharmony_ci vec_st(x12, i2, &(out[0])); 148cabdff1aSopenharmony_ci vec_st(y19, i2+16, &(out[0])); 149cabdff1aSopenharmony_ci vec_st(x16, i3, &(out[0])); 150cabdff1aSopenharmony_ci vec_st(y23, i3+16, &(out[0])); 151cabdff1aSopenharmony_ci 152cabdff1aSopenharmony_ci do { 153cabdff1aSopenharmony_ci out += 8; 154cabdff1aSopenharmony_ci wre += 4; 155cabdff1aSopenharmony_ci wim -= 4; 156cabdff1aSopenharmony_ci wr0 = vec_splats(wre[0]); 157cabdff1aSopenharmony_ci wr1 = vec_splats(wre[1]); 158cabdff1aSopenharmony_ci wi0 = vec_splats(wim[0]); 159cabdff1aSopenharmony_ci wi1 = vec_splats(wim[-1]); 160cabdff1aSopenharmony_ci 161cabdff1aSopenharmony_ci wr2 = vec_splats(wre[2]); 162cabdff1aSopenharmony_ci wr3 = vec_splats(wre[3]); 163cabdff1aSopenharmony_ci wi2 = vec_splats(wim[-2]); 164cabdff1aSopenharmony_ci wi3 = vec_splats(wim[-3]); 165cabdff1aSopenharmony_ci 166cabdff1aSopenharmony_ci vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i 167cabdff1aSopenharmony_ci vzo2plus1 = vec_ld(i2+16, &(out[0])); 168cabdff1aSopenharmony_ci vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i 169cabdff1aSopenharmony_ci vzo3plus1 = vec_ld(i3+16, &(out[0])); 170cabdff1aSopenharmony_ci vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i 171cabdff1aSopenharmony_ci vz0plus1 = vec_ld(16, &(out[0])); 172cabdff1aSopenharmony_ci vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i 173cabdff1aSopenharmony_ci vzo1plus1 = vec_ld(i1+16, &(out[0])); 174cabdff1aSopenharmony_ci 175cabdff1aSopenharmony_ci x0 = vec_add(vzo2, vzo3); 176cabdff1aSopenharmony_ci x1 = vec_sub(vzo2, vzo3); 177cabdff1aSopenharmony_ci 178cabdff1aSopenharmony_ci y0 = vec_add(vzo2plus1, vzo3plus1); 179cabdff1aSopenharmony_ci y1 = vec_sub(vzo2plus1, vzo3plus1); 180cabdff1aSopenharmony_ci 181cabdff1aSopenharmony_ci x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0)); 182cabdff1aSopenharmony_ci x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2)); 183cabdff1aSopenharmony_ci x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1)); 184cabdff1aSopenharmony_ci x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3)); 185cabdff1aSopenharmony_ci 186cabdff1aSopenharmony_ci y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1)); 187cabdff1aSopenharmony_ci y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3)); 188cabdff1aSopenharmony_ci xmulwi0 = vec_mul(x4, wi0); 189cabdff1aSopenharmony_ci xmulwi1 = vec_mul(x5, wi1); 190cabdff1aSopenharmony_ci 191cabdff1aSopenharmony_ci y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0)); 192cabdff1aSopenharmony_ci y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2)); 193cabdff1aSopenharmony_ci 194cabdff1aSopenharmony_ci x8 = vec_madd(x2, wr0, xmulwi0); 195cabdff1aSopenharmony_ci x9 = vec_msub(x2, wr0, xmulwi0); 196cabdff1aSopenharmony_ci ymulwi2 = vec_mul(y4, wi2); 197cabdff1aSopenharmony_ci ymulwi3 = vec_mul(y5, wi3); 198cabdff1aSopenharmony_ci 199cabdff1aSopenharmony_ci x13 = vec_madd(x3, wr1, xmulwi1); 200cabdff1aSopenharmony_ci x14 = vec_msub(x3, wr1, xmulwi1); 201cabdff1aSopenharmony_ci 202cabdff1aSopenharmony_ci y8 = vec_madd(y2, wr2, ymulwi2); 203cabdff1aSopenharmony_ci y9 = vec_msub(y2, wr2, ymulwi2); 204cabdff1aSopenharmony_ci y13 = vec_madd(y3, wr3, ymulwi3); 205cabdff1aSopenharmony_ci y14 = vec_msub(y3, wr3, ymulwi3); 206cabdff1aSopenharmony_ci 207cabdff1aSopenharmony_ci x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3)); 208cabdff1aSopenharmony_ci x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3)); 209cabdff1aSopenharmony_ci 210cabdff1aSopenharmony_ci y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3)); 211cabdff1aSopenharmony_ci y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3)); 212cabdff1aSopenharmony_ci 213cabdff1aSopenharmony_ci x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2)); 214cabdff1aSopenharmony_ci x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1)); 215cabdff1aSopenharmony_ci 216cabdff1aSopenharmony_ci y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2)); 217cabdff1aSopenharmony_ci y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1)); 218cabdff1aSopenharmony_ci 219cabdff1aSopenharmony_ci x18 = vec_add(vz0, x16); 220cabdff1aSopenharmony_ci x19 = vec_sub(vz0, x16); 221cabdff1aSopenharmony_ci x20 = vec_add(vzo1, x17); 222cabdff1aSopenharmony_ci x21 = vec_sub(vzo1, x17); 223cabdff1aSopenharmony_ci 224cabdff1aSopenharmony_ci y18 = vec_add(vz0plus1, y16); 225cabdff1aSopenharmony_ci y19 = vec_sub(vz0plus1, y16); 226cabdff1aSopenharmony_ci y20 = vec_add(vzo1plus1, y17); 227cabdff1aSopenharmony_ci y21 = vec_sub(vzo1plus1, y17); 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3)); 230cabdff1aSopenharmony_ci x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3)); 231cabdff1aSopenharmony_ci 232cabdff1aSopenharmony_ci y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3)); 233cabdff1aSopenharmony_ci y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3)); 234cabdff1aSopenharmony_ci 235cabdff1aSopenharmony_ci vec_st(x18, 0, &(out[0])); 236cabdff1aSopenharmony_ci vec_st(y18, 16, &(out[0])); 237cabdff1aSopenharmony_ci vec_st(x22, i1, &(out[0])); 238cabdff1aSopenharmony_ci vec_st(y22, i1+16, &(out[0])); 239cabdff1aSopenharmony_ci vec_st(x19, i2, &(out[0])); 240cabdff1aSopenharmony_ci vec_st(y19, i2+16, &(out[0])); 241cabdff1aSopenharmony_ci vec_st(x23, i3, &(out[0])); 242cabdff1aSopenharmony_ci vec_st(y23, i3+16, &(out[0])); 243cabdff1aSopenharmony_ci } while (n-=2); 244cabdff1aSopenharmony_ci} 245cabdff1aSopenharmony_ci 246cabdff1aSopenharmony_ciinline static void fft2_vsx_interleave(FFTComplex *z) 247cabdff1aSopenharmony_ci{ 248cabdff1aSopenharmony_ci FFTSample r1, i1; 249cabdff1aSopenharmony_ci 250cabdff1aSopenharmony_ci r1 = z[0].re - z[1].re; 251cabdff1aSopenharmony_ci z[0].re += z[1].re; 252cabdff1aSopenharmony_ci z[1].re = r1; 253cabdff1aSopenharmony_ci 254cabdff1aSopenharmony_ci i1 = z[0].im - z[1].im; 255cabdff1aSopenharmony_ci z[0].im += z[1].im; 256cabdff1aSopenharmony_ci z[1].im = i1; 257cabdff1aSopenharmony_ci } 258cabdff1aSopenharmony_ci 259cabdff1aSopenharmony_ciinline static void fft4_vsx_interleave(FFTComplex *z) 260cabdff1aSopenharmony_ci{ 261cabdff1aSopenharmony_ci vec_f a, b, c, d; 262cabdff1aSopenharmony_ci float* out= (float*)z; 263cabdff1aSopenharmony_ci a = vec_ld(0, &(out[0])); 264cabdff1aSopenharmony_ci b = vec_ld(byte_2complex, &(out[0])); 265cabdff1aSopenharmony_ci 266cabdff1aSopenharmony_ci c = vec_perm(a, b, vcprm(0,1,s2,s1)); 267cabdff1aSopenharmony_ci d = vec_perm(a, b, vcprm(2,3,s0,s3)); 268cabdff1aSopenharmony_ci a = vec_add(c, d); 269cabdff1aSopenharmony_ci b = vec_sub(c, d); 270cabdff1aSopenharmony_ci 271cabdff1aSopenharmony_ci c = vec_perm(a, b, vcprm(0,1,s0,s1)); 272cabdff1aSopenharmony_ci d = vec_perm(a, b, vcprm(2,3,s3,s2)); 273cabdff1aSopenharmony_ci 274cabdff1aSopenharmony_ci a = vec_add(c, d); 275cabdff1aSopenharmony_ci b = vec_sub(c, d); 276cabdff1aSopenharmony_ci vec_st(a, 0, &(out[0])); 277cabdff1aSopenharmony_ci vec_st(b, byte_2complex, &(out[0])); 278cabdff1aSopenharmony_ci} 279cabdff1aSopenharmony_ci 280cabdff1aSopenharmony_ciinline static void fft8_vsx_interleave(FFTComplex *z) 281cabdff1aSopenharmony_ci{ 282cabdff1aSopenharmony_ci vec_f vz0, vz1, vz2, vz3; 283cabdff1aSopenharmony_ci vec_f x0, x1, x2, x3; 284cabdff1aSopenharmony_ci vec_f x4, x5, x6, x7; 285cabdff1aSopenharmony_ci vec_f x8, x9, x10, x11; 286cabdff1aSopenharmony_ci vec_f x12, x13, x14, x15; 287cabdff1aSopenharmony_ci vec_f x16, x17, x18, x19; 288cabdff1aSopenharmony_ci vec_f x20, x21, x22, x23; 289cabdff1aSopenharmony_ci vec_f x24, x25, x26, x27; 290cabdff1aSopenharmony_ci vec_f x28, x29, x30, x31; 291cabdff1aSopenharmony_ci vec_f x32, x33, x34; 292cabdff1aSopenharmony_ci 293cabdff1aSopenharmony_ci float* out= (float*)z; 294cabdff1aSopenharmony_ci vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; 295cabdff1aSopenharmony_ci 296cabdff1aSopenharmony_ci vz0 = vec_ld(0, &(out[0])); 297cabdff1aSopenharmony_ci vz1 = vec_ld(byte_2complex, &(out[0])); 298cabdff1aSopenharmony_ci vz2 = vec_ld(byte_4complex, &(out[0])); 299cabdff1aSopenharmony_ci vz3 = vec_ld(byte_6complex, &(out[0])); 300cabdff1aSopenharmony_ci 301cabdff1aSopenharmony_ci x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); 302cabdff1aSopenharmony_ci x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); 303cabdff1aSopenharmony_ci x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1)); 304cabdff1aSopenharmony_ci x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3)); 305cabdff1aSopenharmony_ci 306cabdff1aSopenharmony_ci x4 = vec_add(x0, x1); 307cabdff1aSopenharmony_ci x5 = vec_sub(x0, x1); 308cabdff1aSopenharmony_ci x6 = vec_add(x2, x3); 309cabdff1aSopenharmony_ci x7 = vec_sub(x2, x3); 310cabdff1aSopenharmony_ci 311cabdff1aSopenharmony_ci x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1)); 312cabdff1aSopenharmony_ci x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2)); 313cabdff1aSopenharmony_ci x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1)); 314cabdff1aSopenharmony_ci x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3)); 315cabdff1aSopenharmony_ci 316cabdff1aSopenharmony_ci x12 = vec_add(x8, x9); 317cabdff1aSopenharmony_ci x13 = vec_sub(x8, x9); 318cabdff1aSopenharmony_ci x14 = vec_add(x10, x11); 319cabdff1aSopenharmony_ci x15 = vec_sub(x10, x11); 320cabdff1aSopenharmony_ci x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1)); 321cabdff1aSopenharmony_ci x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1)); 322cabdff1aSopenharmony_ci x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1)); 323cabdff1aSopenharmony_ci x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i 324cabdff1aSopenharmony_ci x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i 325cabdff1aSopenharmony_ci 326cabdff1aSopenharmony_ci x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3)); 327cabdff1aSopenharmony_ci x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3)); 328cabdff1aSopenharmony_ci x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2)); 329cabdff1aSopenharmony_ci x24 = vec_add(x22, x23); 330cabdff1aSopenharmony_ci x25 = vec_sub(x22, x23); 331cabdff1aSopenharmony_ci x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1); 332cabdff1aSopenharmony_ci 333cabdff1aSopenharmony_ci x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i 334cabdff1aSopenharmony_ci x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i 335cabdff1aSopenharmony_ci 336cabdff1aSopenharmony_ci x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i 337cabdff1aSopenharmony_ci x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i 338cabdff1aSopenharmony_ci x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i 339cabdff1aSopenharmony_ci x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i 340cabdff1aSopenharmony_ci x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i 341cabdff1aSopenharmony_ci x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i 342cabdff1aSopenharmony_ci 343cabdff1aSopenharmony_ci vec_st(x29, 0, &(out[0])); 344cabdff1aSopenharmony_ci vec_st(x33, byte_2complex, &(out[0])); 345cabdff1aSopenharmony_ci vec_st(x31, byte_4complex, &(out[0])); 346cabdff1aSopenharmony_ci vec_st(x34, byte_6complex, &(out[0])); 347cabdff1aSopenharmony_ci} 348cabdff1aSopenharmony_ci 349cabdff1aSopenharmony_ciinline static void fft16_vsx_interleave(FFTComplex *z) 350cabdff1aSopenharmony_ci{ 351cabdff1aSopenharmony_ci float* out= (float*)z; 352cabdff1aSopenharmony_ci vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; 353cabdff1aSopenharmony_ci vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]}; 354cabdff1aSopenharmony_ci vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]}; 355cabdff1aSopenharmony_ci vec_f vz0, vz1, vz2, vz3; 356cabdff1aSopenharmony_ci vec_f vz4, vz5, vz6, vz7; 357cabdff1aSopenharmony_ci vec_f x0, x1, x2, x3; 358cabdff1aSopenharmony_ci vec_f x4, x5, x6, x7; 359cabdff1aSopenharmony_ci vec_f x8, x9, x10, x11; 360cabdff1aSopenharmony_ci vec_f x12, x13, x14, x15; 361cabdff1aSopenharmony_ci vec_f x16, x17, x18, x19; 362cabdff1aSopenharmony_ci vec_f x20, x21, x22, x23; 363cabdff1aSopenharmony_ci vec_f x24, x25, x26, x27; 364cabdff1aSopenharmony_ci vec_f x28, x29, x30, x31; 365cabdff1aSopenharmony_ci vec_f x32, x33, x34, x35; 366cabdff1aSopenharmony_ci vec_f x36, x37, x38, x39; 367cabdff1aSopenharmony_ci vec_f x40, x41, x42, x43; 368cabdff1aSopenharmony_ci vec_f x44, x45, x46, x47; 369cabdff1aSopenharmony_ci vec_f x48, x49, x50, x51; 370cabdff1aSopenharmony_ci vec_f x52, x53, x54, x55; 371cabdff1aSopenharmony_ci vec_f x56, x57, x58, x59; 372cabdff1aSopenharmony_ci vec_f x60, x61, x62, x63; 373cabdff1aSopenharmony_ci vec_f x64, x65, x66, x67; 374cabdff1aSopenharmony_ci vec_f x68, x69, x70, x71; 375cabdff1aSopenharmony_ci vec_f x72, x73, x74, x75; 376cabdff1aSopenharmony_ci vec_f x76, x77, x78, x79; 377cabdff1aSopenharmony_ci vec_f x80, x81, x82, x83; 378cabdff1aSopenharmony_ci vec_f x84, x85, x86; 379cabdff1aSopenharmony_ci 380cabdff1aSopenharmony_ci vz0 = vec_ld(0, &(out[0])); 381cabdff1aSopenharmony_ci vz1 = vec_ld(byte_2complex, &(out[0])); 382cabdff1aSopenharmony_ci vz2 = vec_ld(byte_4complex, &(out[0])); 383cabdff1aSopenharmony_ci vz3 = vec_ld(byte_6complex, &(out[0])); 384cabdff1aSopenharmony_ci vz4 = vec_ld(byte_8complex, &(out[0])); 385cabdff1aSopenharmony_ci vz5 = vec_ld(byte_10complex, &(out[0])); 386cabdff1aSopenharmony_ci vz6 = vec_ld(byte_12complex, &(out[0])); 387cabdff1aSopenharmony_ci vz7 = vec_ld(byte_14complex, &(out[0])); 388cabdff1aSopenharmony_ci 389cabdff1aSopenharmony_ci x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); 390cabdff1aSopenharmony_ci x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); 391cabdff1aSopenharmony_ci x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1)); 392cabdff1aSopenharmony_ci x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3)); 393cabdff1aSopenharmony_ci 394cabdff1aSopenharmony_ci x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1)); 395cabdff1aSopenharmony_ci x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3)); 396cabdff1aSopenharmony_ci x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1)); 397cabdff1aSopenharmony_ci x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3)); 398cabdff1aSopenharmony_ci 399cabdff1aSopenharmony_ci x8 = vec_add(x0, x1); 400cabdff1aSopenharmony_ci x9 = vec_sub(x0, x1); 401cabdff1aSopenharmony_ci x10 = vec_add(x2, x3); 402cabdff1aSopenharmony_ci x11 = vec_sub(x2, x3); 403cabdff1aSopenharmony_ci 404cabdff1aSopenharmony_ci x12 = vec_add(x4, x5); 405cabdff1aSopenharmony_ci x13 = vec_sub(x4, x5); 406cabdff1aSopenharmony_ci x14 = vec_add(x6, x7); 407cabdff1aSopenharmony_ci x15 = vec_sub(x6, x7); 408cabdff1aSopenharmony_ci 409cabdff1aSopenharmony_ci x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1)); 410cabdff1aSopenharmony_ci x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2)); 411cabdff1aSopenharmony_ci x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2)); 412cabdff1aSopenharmony_ci x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3)); 413cabdff1aSopenharmony_ci x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1)); 414cabdff1aSopenharmony_ci x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3)); 415cabdff1aSopenharmony_ci x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1)); 416cabdff1aSopenharmony_ci x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2)); 417cabdff1aSopenharmony_ci 418cabdff1aSopenharmony_ci x24 = vec_add(x16, x17); 419cabdff1aSopenharmony_ci x25 = vec_sub(x16, x17); 420cabdff1aSopenharmony_ci x26 = vec_add(x18, x19); 421cabdff1aSopenharmony_ci x27 = vec_sub(x18, x19); 422cabdff1aSopenharmony_ci x28 = vec_add(x20, x21); 423cabdff1aSopenharmony_ci x29 = vec_sub(x20, x21); 424cabdff1aSopenharmony_ci x30 = vec_add(x22, x23); 425cabdff1aSopenharmony_ci x31 = vec_sub(x22, x23); 426cabdff1aSopenharmony_ci 427cabdff1aSopenharmony_ci x32 = vec_add(x24, x26); 428cabdff1aSopenharmony_ci x33 = vec_sub(x24, x26); 429cabdff1aSopenharmony_ci x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1)); 430cabdff1aSopenharmony_ci 431cabdff1aSopenharmony_ci x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2)); 432cabdff1aSopenharmony_ci x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3)); 433cabdff1aSopenharmony_ci x37 = vec_add(x35, x36); 434cabdff1aSopenharmony_ci x38 = vec_sub(x35, x36); 435cabdff1aSopenharmony_ci x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0)); 436cabdff1aSopenharmony_ci 437cabdff1aSopenharmony_ci x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3)); 438cabdff1aSopenharmony_ci x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2)); 439cabdff1aSopenharmony_ci x42 = vec_add(x40, x41); 440cabdff1aSopenharmony_ci x43 = vec_sub(x40, x41); 441cabdff1aSopenharmony_ci x44 = vec_mul(x42, vc0); 442cabdff1aSopenharmony_ci x45 = vec_mul(x43, vc0); 443cabdff1aSopenharmony_ci 444cabdff1aSopenharmony_ci x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i 445cabdff1aSopenharmony_ci x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i 446cabdff1aSopenharmony_ci 447cabdff1aSopenharmony_ci x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2)); 448cabdff1aSopenharmony_ci x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0)); 449cabdff1aSopenharmony_ci x50 = vec_add(x48, x49); 450cabdff1aSopenharmony_ci x51 = vec_sub(x48, x49); 451cabdff1aSopenharmony_ci x52 = vec_mul(x50, vc1); 452cabdff1aSopenharmony_ci x53 = vec_mul(x50, vc2); 453cabdff1aSopenharmony_ci x54 = vec_mul(x51, vc1); 454cabdff1aSopenharmony_ci x55 = vec_mul(x51, vc2); 455cabdff1aSopenharmony_ci 456cabdff1aSopenharmony_ci x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3)); 457cabdff1aSopenharmony_ci x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0)); 458cabdff1aSopenharmony_ci x58 = vec_add(x56, x57); 459cabdff1aSopenharmony_ci x59 = vec_sub(x56, x57); 460cabdff1aSopenharmony_ci 461cabdff1aSopenharmony_ci x60 = vec_perm(x54, x55, vcprm(1,0,3,2)); 462cabdff1aSopenharmony_ci x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2)); 463cabdff1aSopenharmony_ci x62 = vec_add(x52, x61); 464cabdff1aSopenharmony_ci x63 = vec_sub(x52, x61); 465cabdff1aSopenharmony_ci x64 = vec_add(x60, x53); 466cabdff1aSopenharmony_ci x65 = vec_sub(x60, x53); 467cabdff1aSopenharmony_ci x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2)); 468cabdff1aSopenharmony_ci x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2)); 469cabdff1aSopenharmony_ci 470cabdff1aSopenharmony_ci x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i 471cabdff1aSopenharmony_ci x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i 472cabdff1aSopenharmony_ci x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i 473cabdff1aSopenharmony_ci x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i 474cabdff1aSopenharmony_ci 475cabdff1aSopenharmony_ci x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3)); 476cabdff1aSopenharmony_ci x73 = vec_add(x25, x72); 477cabdff1aSopenharmony_ci x74 = vec_sub(x25, x72); 478cabdff1aSopenharmony_ci x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1)); 479cabdff1aSopenharmony_ci x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3)); 480cabdff1aSopenharmony_ci x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i 481cabdff1aSopenharmony_ci x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i 482cabdff1aSopenharmony_ci 483cabdff1aSopenharmony_ci x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i 484cabdff1aSopenharmony_ci x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i 485cabdff1aSopenharmony_ci x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i 486cabdff1aSopenharmony_ci x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i 487cabdff1aSopenharmony_ci vec_st(x79, 0, &(out[0])); 488cabdff1aSopenharmony_ci vec_st(x80, byte_2complex, &(out[0])); 489cabdff1aSopenharmony_ci vec_st(x81, byte_4complex, &(out[0])); 490cabdff1aSopenharmony_ci vec_st(x82, byte_6complex, &(out[0])); 491cabdff1aSopenharmony_ci x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i 492cabdff1aSopenharmony_ci x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i 493cabdff1aSopenharmony_ci x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i 494cabdff1aSopenharmony_ci x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i 495cabdff1aSopenharmony_ci vec_st(x83, byte_8complex, &(out[0])); 496cabdff1aSopenharmony_ci vec_st(x84, byte_10complex, &(out[0])); 497cabdff1aSopenharmony_ci vec_st(x85, byte_12complex, &(out[0])); 498cabdff1aSopenharmony_ci vec_st(x86, byte_14complex, &(out[0])); 499cabdff1aSopenharmony_ci} 500cabdff1aSopenharmony_ci 501cabdff1aSopenharmony_ciinline static void fft4_vsx(FFTComplex *z) 502cabdff1aSopenharmony_ci{ 503cabdff1aSopenharmony_ci vec_f a, b, c, d; 504cabdff1aSopenharmony_ci float* out= (float*)z; 505cabdff1aSopenharmony_ci a = vec_ld(0, &(out[0])); 506cabdff1aSopenharmony_ci b = vec_ld(byte_2complex, &(out[0])); 507cabdff1aSopenharmony_ci 508cabdff1aSopenharmony_ci c = vec_perm(a, b, vcprm(0,1,s2,s1)); 509cabdff1aSopenharmony_ci d = vec_perm(a, b, vcprm(2,3,s0,s3)); 510cabdff1aSopenharmony_ci a = vec_add(c, d); 511cabdff1aSopenharmony_ci b = vec_sub(c, d); 512cabdff1aSopenharmony_ci 513cabdff1aSopenharmony_ci c = vec_perm(a,b, vcprm(0,s0,1,s1)); 514cabdff1aSopenharmony_ci d = vec_perm(a, b, vcprm(2,s3,3,s2)); 515cabdff1aSopenharmony_ci 516cabdff1aSopenharmony_ci a = vec_add(c, d); 517cabdff1aSopenharmony_ci b = vec_sub(c, d); 518cabdff1aSopenharmony_ci 519cabdff1aSopenharmony_ci c = vec_perm(a, b, vcprm(0,1,s0,s1)); 520cabdff1aSopenharmony_ci d = vec_perm(a, b, vcprm(2,3,s2,s3)); 521cabdff1aSopenharmony_ci 522cabdff1aSopenharmony_ci vec_st(c, 0, &(out[0])); 523cabdff1aSopenharmony_ci vec_st(d, byte_2complex, &(out[0])); 524cabdff1aSopenharmony_ci return; 525cabdff1aSopenharmony_ci} 526cabdff1aSopenharmony_ci 527cabdff1aSopenharmony_ciinline static void fft8_vsx(FFTComplex *z) 528cabdff1aSopenharmony_ci{ 529cabdff1aSopenharmony_ci vec_f vz0, vz1, vz2, vz3; 530cabdff1aSopenharmony_ci vec_f vz4, vz5, vz6, vz7, vz8; 531cabdff1aSopenharmony_ci 532cabdff1aSopenharmony_ci float* out= (float*)z; 533cabdff1aSopenharmony_ci vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; 534cabdff1aSopenharmony_ci vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; 535cabdff1aSopenharmony_ci vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; 536cabdff1aSopenharmony_ci 537cabdff1aSopenharmony_ci vz0 = vec_ld(0, &(out[0])); 538cabdff1aSopenharmony_ci vz1 = vec_ld(byte_2complex, &(out[0])); 539cabdff1aSopenharmony_ci vz2 = vec_ld(byte_4complex, &(out[0])); 540cabdff1aSopenharmony_ci vz3 = vec_ld(byte_6complex, &(out[0])); 541cabdff1aSopenharmony_ci 542cabdff1aSopenharmony_ci vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); 543cabdff1aSopenharmony_ci vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3)); 544cabdff1aSopenharmony_ci vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); 545cabdff1aSopenharmony_ci vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); 546cabdff1aSopenharmony_ci 547cabdff1aSopenharmony_ci vz2 = vec_add(vz6, vz7); 548cabdff1aSopenharmony_ci vz3 = vec_sub(vz6, vz7); 549cabdff1aSopenharmony_ci vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); 550cabdff1aSopenharmony_ci 551cabdff1aSopenharmony_ci vz0 = vec_add(vz4, vz5); 552cabdff1aSopenharmony_ci vz1 = vec_sub(vz4, vz5); 553cabdff1aSopenharmony_ci 554cabdff1aSopenharmony_ci vz3 = vec_madd(vz3, vc1, vc0); 555cabdff1aSopenharmony_ci vz3 = vec_madd(vz8, vc2, vz3); 556cabdff1aSopenharmony_ci 557cabdff1aSopenharmony_ci vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); 558cabdff1aSopenharmony_ci vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); 559cabdff1aSopenharmony_ci vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0)); 560cabdff1aSopenharmony_ci vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1)); 561cabdff1aSopenharmony_ci 562cabdff1aSopenharmony_ci vz0 = vec_add(vz4, vz5); 563cabdff1aSopenharmony_ci vz1 = vec_sub(vz4, vz5); 564cabdff1aSopenharmony_ci vz2 = vec_add(vz6, vz7); 565cabdff1aSopenharmony_ci vz3 = vec_sub(vz6, vz7); 566cabdff1aSopenharmony_ci 567cabdff1aSopenharmony_ci vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); 568cabdff1aSopenharmony_ci vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); 569cabdff1aSopenharmony_ci vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3)); 570cabdff1aSopenharmony_ci vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2)); 571cabdff1aSopenharmony_ci 572cabdff1aSopenharmony_ci 573cabdff1aSopenharmony_ci vz2 = vec_sub(vz4, vz6); 574cabdff1aSopenharmony_ci vz3 = vec_sub(vz5, vz7); 575cabdff1aSopenharmony_ci 576cabdff1aSopenharmony_ci vz0 = vec_add(vz4, vz6); 577cabdff1aSopenharmony_ci vz1 = vec_add(vz5, vz7); 578cabdff1aSopenharmony_ci 579cabdff1aSopenharmony_ci vec_st(vz0, 0, &(out[0])); 580cabdff1aSopenharmony_ci vec_st(vz1, byte_2complex, &(out[0])); 581cabdff1aSopenharmony_ci vec_st(vz2, byte_4complex, &(out[0])); 582cabdff1aSopenharmony_ci vec_st(vz3, byte_6complex, &(out[0])); 583cabdff1aSopenharmony_ci return; 584cabdff1aSopenharmony_ci} 585cabdff1aSopenharmony_ci 586cabdff1aSopenharmony_ciinline static void fft16_vsx(FFTComplex *z) 587cabdff1aSopenharmony_ci{ 588cabdff1aSopenharmony_ci float* out= (float*)z; 589cabdff1aSopenharmony_ci vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; 590cabdff1aSopenharmony_ci vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; 591cabdff1aSopenharmony_ci vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; 592cabdff1aSopenharmony_ci vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343}; 593cabdff1aSopenharmony_ci vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953}; 594cabdff1aSopenharmony_ci vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953}; 595cabdff1aSopenharmony_ci 596cabdff1aSopenharmony_ci vec_f vz0, vz1, vz2, vz3; 597cabdff1aSopenharmony_ci vec_f vz4, vz5, vz6, vz7; 598cabdff1aSopenharmony_ci vec_f vz8, vz9, vz10, vz11; 599cabdff1aSopenharmony_ci vec_f vz12, vz13; 600cabdff1aSopenharmony_ci 601cabdff1aSopenharmony_ci vz0 = vec_ld(byte_8complex, &(out[0])); 602cabdff1aSopenharmony_ci vz1 = vec_ld(byte_10complex, &(out[0])); 603cabdff1aSopenharmony_ci vz2 = vec_ld(byte_12complex, &(out[0])); 604cabdff1aSopenharmony_ci vz3 = vec_ld(byte_14complex, &(out[0])); 605cabdff1aSopenharmony_ci 606cabdff1aSopenharmony_ci vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); 607cabdff1aSopenharmony_ci vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); 608cabdff1aSopenharmony_ci vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1)); 609cabdff1aSopenharmony_ci vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3)); 610cabdff1aSopenharmony_ci 611cabdff1aSopenharmony_ci vz0 = vec_add(vz4, vz5); 612cabdff1aSopenharmony_ci vz1= vec_sub(vz4, vz5); 613cabdff1aSopenharmony_ci vz2 = vec_add(vz6, vz7); 614cabdff1aSopenharmony_ci vz3 = vec_sub(vz6, vz7); 615cabdff1aSopenharmony_ci 616cabdff1aSopenharmony_ci vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); 617cabdff1aSopenharmony_ci vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); 618cabdff1aSopenharmony_ci vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); 619cabdff1aSopenharmony_ci vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2)); 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_ci vz0 = vec_add(vz4, vz5); 622cabdff1aSopenharmony_ci vz1 = vec_sub(vz4, vz5); 623cabdff1aSopenharmony_ci vz2 = vec_add(vz6, vz7); 624cabdff1aSopenharmony_ci vz3 = vec_sub(vz6, vz7); 625cabdff1aSopenharmony_ci 626cabdff1aSopenharmony_ci vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); 627cabdff1aSopenharmony_ci vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); 628cabdff1aSopenharmony_ci 629cabdff1aSopenharmony_ci vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1)); 630cabdff1aSopenharmony_ci vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3)); 631cabdff1aSopenharmony_ci 632cabdff1aSopenharmony_ci vz0 = vec_ld(0, &(out[0])); 633cabdff1aSopenharmony_ci vz1 = vec_ld(byte_2complex, &(out[0])); 634cabdff1aSopenharmony_ci vz2 = vec_ld(byte_4complex, &(out[0])); 635cabdff1aSopenharmony_ci vz3 = vec_ld(byte_6complex, &(out[0])); 636cabdff1aSopenharmony_ci vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); 637cabdff1aSopenharmony_ci vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3)); 638cabdff1aSopenharmony_ci vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); 639cabdff1aSopenharmony_ci vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); 640cabdff1aSopenharmony_ci 641cabdff1aSopenharmony_ci vz2 = vec_add(vz10, vz11); 642cabdff1aSopenharmony_ci vz3 = vec_sub(vz10, vz11); 643cabdff1aSopenharmony_ci vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); 644cabdff1aSopenharmony_ci vz0 = vec_add(vz8, vz9); 645cabdff1aSopenharmony_ci vz1 = vec_sub(vz8, vz9); 646cabdff1aSopenharmony_ci 647cabdff1aSopenharmony_ci vz3 = vec_madd(vz3, vc1, vc0); 648cabdff1aSopenharmony_ci vz3 = vec_madd(vz12, vc2, vz3); 649cabdff1aSopenharmony_ci vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); 650cabdff1aSopenharmony_ci vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); 651cabdff1aSopenharmony_ci vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0)); 652cabdff1aSopenharmony_ci vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1)); 653cabdff1aSopenharmony_ci 654cabdff1aSopenharmony_ci vz0 = vec_add(vz8, vz9); 655cabdff1aSopenharmony_ci vz1 = vec_sub(vz8, vz9); 656cabdff1aSopenharmony_ci vz2 = vec_add(vz10, vz11); 657cabdff1aSopenharmony_ci vz3 = vec_sub(vz10, vz11); 658cabdff1aSopenharmony_ci 659cabdff1aSopenharmony_ci vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); 660cabdff1aSopenharmony_ci vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); 661cabdff1aSopenharmony_ci vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3)); 662cabdff1aSopenharmony_ci vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2)); 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_ci vz2 = vec_sub(vz8, vz10); 665cabdff1aSopenharmony_ci vz3 = vec_sub(vz9, vz11); 666cabdff1aSopenharmony_ci vz0 = vec_add(vz8, vz10); 667cabdff1aSopenharmony_ci vz1 = vec_add(vz9, vz11); 668cabdff1aSopenharmony_ci 669cabdff1aSopenharmony_ci vz8 = vec_madd(vz4, vc3, vc0); 670cabdff1aSopenharmony_ci vz9 = vec_madd(vz5, vc3, vc0); 671cabdff1aSopenharmony_ci vz10 = vec_madd(vz6, vc3, vc0); 672cabdff1aSopenharmony_ci vz11 = vec_madd(vz7, vc3, vc0); 673cabdff1aSopenharmony_ci 674cabdff1aSopenharmony_ci vz8 = vec_madd(vz5, vc4, vz8); 675cabdff1aSopenharmony_ci vz9 = vec_madd(vz4, vc5, vz9); 676cabdff1aSopenharmony_ci vz10 = vec_madd(vz7, vc5, vz10); 677cabdff1aSopenharmony_ci vz11 = vec_madd(vz6, vc4, vz11); 678cabdff1aSopenharmony_ci 679cabdff1aSopenharmony_ci vz12 = vec_sub(vz10, vz8); 680cabdff1aSopenharmony_ci vz10 = vec_add(vz10, vz8); 681cabdff1aSopenharmony_ci 682cabdff1aSopenharmony_ci vz13 = vec_sub(vz9, vz11); 683cabdff1aSopenharmony_ci vz11 = vec_add(vz9, vz11); 684cabdff1aSopenharmony_ci 685cabdff1aSopenharmony_ci vz4 = vec_sub(vz0, vz10); 686cabdff1aSopenharmony_ci vz0 = vec_add(vz0, vz10); 687cabdff1aSopenharmony_ci 688cabdff1aSopenharmony_ci vz7= vec_sub(vz3, vz12); 689cabdff1aSopenharmony_ci vz3= vec_add(vz3, vz12); 690cabdff1aSopenharmony_ci 691cabdff1aSopenharmony_ci vz5 = vec_sub(vz1, vz11); 692cabdff1aSopenharmony_ci vz1 = vec_add(vz1, vz11); 693cabdff1aSopenharmony_ci 694cabdff1aSopenharmony_ci vz6 = vec_sub(vz2, vz13); 695cabdff1aSopenharmony_ci vz2 = vec_add(vz2, vz13); 696cabdff1aSopenharmony_ci 697cabdff1aSopenharmony_ci vec_st(vz0, 0, &(out[0])); 698cabdff1aSopenharmony_ci vec_st(vz1, byte_2complex, &(out[0])); 699cabdff1aSopenharmony_ci vec_st(vz2, byte_4complex, &(out[0])); 700cabdff1aSopenharmony_ci vec_st(vz3, byte_6complex, &(out[0])); 701cabdff1aSopenharmony_ci vec_st(vz4, byte_8complex, &(out[0])); 702cabdff1aSopenharmony_ci vec_st(vz5, byte_10complex, &(out[0])); 703cabdff1aSopenharmony_ci vec_st(vz6, byte_12complex, &(out[0])); 704cabdff1aSopenharmony_ci vec_st(vz7, byte_14complex, &(out[0])); 705cabdff1aSopenharmony_ci return; 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_ci} 708cabdff1aSopenharmony_ciinline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n) 709cabdff1aSopenharmony_ci{ 710cabdff1aSopenharmony_ci int o1 = n<<1; 711cabdff1aSopenharmony_ci int o2 = n<<2; 712cabdff1aSopenharmony_ci int o3 = o1+o2; 713cabdff1aSopenharmony_ci int i1, i2, i3; 714cabdff1aSopenharmony_ci FFTSample* out = (FFTSample*)z; 715cabdff1aSopenharmony_ci const FFTSample *wim = wre+o1; 716cabdff1aSopenharmony_ci vec_f v0, v1, v2, v3; 717cabdff1aSopenharmony_ci vec_f v4, v5, v6, v7; 718cabdff1aSopenharmony_ci vec_f v8, v9, v10, v11; 719cabdff1aSopenharmony_ci vec_f v12, v13; 720cabdff1aSopenharmony_ci 721cabdff1aSopenharmony_ci n = n-2; 722cabdff1aSopenharmony_ci i1 = o1*sizeof(FFTComplex); 723cabdff1aSopenharmony_ci i2 = o2*sizeof(FFTComplex); 724cabdff1aSopenharmony_ci i3 = o3*sizeof(FFTComplex); 725cabdff1aSopenharmony_ci 726cabdff1aSopenharmony_ci v8 = vec_ld(0, &(wre[0])); 727cabdff1aSopenharmony_ci v10 = vec_ld(0, &(wim[0])); 728cabdff1aSopenharmony_ci v9 = vec_ld(0, &(wim[-4])); 729cabdff1aSopenharmony_ci v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); 730cabdff1aSopenharmony_ci 731cabdff1aSopenharmony_ci v4 = vec_ld(i2, &(out[0])); 732cabdff1aSopenharmony_ci v5 = vec_ld(i2+16, &(out[0])); 733cabdff1aSopenharmony_ci v6 = vec_ld(i3, &(out[0])); 734cabdff1aSopenharmony_ci v7 = vec_ld(i3+16, &(out[0])); 735cabdff1aSopenharmony_ci v10 = vec_mul(v4, v8); // r2*wre 736cabdff1aSopenharmony_ci v11 = vec_mul(v5, v8); // i2*wre 737cabdff1aSopenharmony_ci v12 = vec_mul(v6, v8); // r3*wre 738cabdff1aSopenharmony_ci v13 = vec_mul(v7, v8); // i3*wre 739cabdff1aSopenharmony_ci 740cabdff1aSopenharmony_ci v0 = vec_ld(0, &(out[0])); // r0 741cabdff1aSopenharmony_ci v3 = vec_ld(i1+16, &(out[0])); // i1 742cabdff1aSopenharmony_ci v10 = vec_madd(v5, v9, v10); // r2*wim 743cabdff1aSopenharmony_ci v11 = vec_nmsub(v4, v9, v11); // i2*wim 744cabdff1aSopenharmony_ci v12 = vec_nmsub(v7, v9, v12); // r3*wim 745cabdff1aSopenharmony_ci v13 = vec_madd(v6, v9, v13); // i3*wim 746cabdff1aSopenharmony_ci 747cabdff1aSopenharmony_ci v1 = vec_ld(16, &(out[0])); // i0 748cabdff1aSopenharmony_ci v2 = vec_ld(i1, &(out[0])); // r1 749cabdff1aSopenharmony_ci v8 = vec_sub(v12, v10); 750cabdff1aSopenharmony_ci v12 = vec_add(v12, v10); 751cabdff1aSopenharmony_ci v9 = vec_sub(v11, v13); 752cabdff1aSopenharmony_ci v13 = vec_add(v11, v13); 753cabdff1aSopenharmony_ci v4 = vec_sub(v0, v12); 754cabdff1aSopenharmony_ci v0 = vec_add(v0, v12); 755cabdff1aSopenharmony_ci v7 = vec_sub(v3, v8); 756cabdff1aSopenharmony_ci v3 = vec_add(v3, v8); 757cabdff1aSopenharmony_ci 758cabdff1aSopenharmony_ci vec_st(v0, 0, &(out[0])); // r0 759cabdff1aSopenharmony_ci vec_st(v3, i1+16, &(out[0])); // i1 760cabdff1aSopenharmony_ci vec_st(v4, i2, &(out[0])); // r2 761cabdff1aSopenharmony_ci vec_st(v7, i3+16, &(out[0]));// i3 762cabdff1aSopenharmony_ci 763cabdff1aSopenharmony_ci v5 = vec_sub(v1, v13); 764cabdff1aSopenharmony_ci v1 = vec_add(v1, v13); 765cabdff1aSopenharmony_ci v6 = vec_sub(v2, v9); 766cabdff1aSopenharmony_ci v2 = vec_add(v2, v9); 767cabdff1aSopenharmony_ci 768cabdff1aSopenharmony_ci vec_st(v1, 16, &(out[0])); // i0 769cabdff1aSopenharmony_ci vec_st(v2, i1, &(out[0])); // r1 770cabdff1aSopenharmony_ci vec_st(v5, i2+16, &(out[0])); // i2 771cabdff1aSopenharmony_ci vec_st(v6, i3, &(out[0])); // r3 772cabdff1aSopenharmony_ci 773cabdff1aSopenharmony_ci do { 774cabdff1aSopenharmony_ci out += 8; 775cabdff1aSopenharmony_ci wre += 4; 776cabdff1aSopenharmony_ci wim -= 4; 777cabdff1aSopenharmony_ci 778cabdff1aSopenharmony_ci v8 = vec_ld(0, &(wre[0])); 779cabdff1aSopenharmony_ci v10 = vec_ld(0, &(wim[0])); 780cabdff1aSopenharmony_ci v9 = vec_ld(0, &(wim[-4])); 781cabdff1aSopenharmony_ci v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); 782cabdff1aSopenharmony_ci 783cabdff1aSopenharmony_ci v4 = vec_ld(i2, &(out[0])); // r2 784cabdff1aSopenharmony_ci v5 = vec_ld(i2+16, &(out[0])); // i2 785cabdff1aSopenharmony_ci v6 = vec_ld(i3, &(out[0])); // r3 786cabdff1aSopenharmony_ci v7 = vec_ld(i3+16, &(out[0]));// i3 787cabdff1aSopenharmony_ci v10 = vec_mul(v4, v8); // r2*wre 788cabdff1aSopenharmony_ci v11 = vec_mul(v5, v8); // i2*wre 789cabdff1aSopenharmony_ci v12 = vec_mul(v6, v8); // r3*wre 790cabdff1aSopenharmony_ci v13 = vec_mul(v7, v8); // i3*wre 791cabdff1aSopenharmony_ci 792cabdff1aSopenharmony_ci v0 = vec_ld(0, &(out[0])); // r0 793cabdff1aSopenharmony_ci v3 = vec_ld(i1+16, &(out[0])); // i1 794cabdff1aSopenharmony_ci v10 = vec_madd(v5, v9, v10); // r2*wim 795cabdff1aSopenharmony_ci v11 = vec_nmsub(v4, v9, v11); // i2*wim 796cabdff1aSopenharmony_ci v12 = vec_nmsub(v7, v9, v12); // r3*wim 797cabdff1aSopenharmony_ci v13 = vec_madd(v6, v9, v13); // i3*wim 798cabdff1aSopenharmony_ci 799cabdff1aSopenharmony_ci v1 = vec_ld(16, &(out[0])); // i0 800cabdff1aSopenharmony_ci v2 = vec_ld(i1, &(out[0])); // r1 801cabdff1aSopenharmony_ci v8 = vec_sub(v12, v10); 802cabdff1aSopenharmony_ci v12 = vec_add(v12, v10); 803cabdff1aSopenharmony_ci v9 = vec_sub(v11, v13); 804cabdff1aSopenharmony_ci v13 = vec_add(v11, v13); 805cabdff1aSopenharmony_ci v4 = vec_sub(v0, v12); 806cabdff1aSopenharmony_ci v0 = vec_add(v0, v12); 807cabdff1aSopenharmony_ci v7 = vec_sub(v3, v8); 808cabdff1aSopenharmony_ci v3 = vec_add(v3, v8); 809cabdff1aSopenharmony_ci 810cabdff1aSopenharmony_ci vec_st(v0, 0, &(out[0])); // r0 811cabdff1aSopenharmony_ci vec_st(v3, i1+16, &(out[0])); // i1 812cabdff1aSopenharmony_ci vec_st(v4, i2, &(out[0])); // r2 813cabdff1aSopenharmony_ci vec_st(v7, i3+16, &(out[0])); // i3 814cabdff1aSopenharmony_ci 815cabdff1aSopenharmony_ci v5 = vec_sub(v1, v13); 816cabdff1aSopenharmony_ci v1 = vec_add(v1, v13); 817cabdff1aSopenharmony_ci v6 = vec_sub(v2, v9); 818cabdff1aSopenharmony_ci v2 = vec_add(v2, v9); 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ci vec_st(v1, 16, &(out[0])); // i0 821cabdff1aSopenharmony_ci vec_st(v2, i1, &(out[0])); // r1 822cabdff1aSopenharmony_ci vec_st(v5, i2+16, &(out[0])); // i2 823cabdff1aSopenharmony_ci vec_st(v6, i3, &(out[0])); // r3 824cabdff1aSopenharmony_ci } while (n-=2); 825cabdff1aSopenharmony_ci} 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_ci#endif 828cabdff1aSopenharmony_ci 829cabdff1aSopenharmony_ci#endif /* AVCODEC_PPC_FFT_VSX_H */ 830