153a5a1b3Sopenharmony_ci/* Copyright (C) 2007 Hong Zhiqian */ 253a5a1b3Sopenharmony_ci/** 353a5a1b3Sopenharmony_ci @file kiss_fft_tm.h 453a5a1b3Sopenharmony_ci @author Hong Zhiqian 553a5a1b3Sopenharmony_ci @brief Various compatibility routines for Speex (TriMedia version) 653a5a1b3Sopenharmony_ci*/ 753a5a1b3Sopenharmony_ci/* 853a5a1b3Sopenharmony_ci Redistribution and use in source and binary forms, with or without 953a5a1b3Sopenharmony_ci modification, are permitted provided that the following conditions 1053a5a1b3Sopenharmony_ci are met: 1153a5a1b3Sopenharmony_ci 1253a5a1b3Sopenharmony_ci - Redistributions of source code must retain the above copyright 1353a5a1b3Sopenharmony_ci notice, this list of conditions and the following disclaimer. 1453a5a1b3Sopenharmony_ci 1553a5a1b3Sopenharmony_ci - Redistributions in binary form must reproduce the above copyright 1653a5a1b3Sopenharmony_ci notice, this list of conditions and the following disclaimer in the 1753a5a1b3Sopenharmony_ci documentation and/or other materials provided with the distribution. 1853a5a1b3Sopenharmony_ci 1953a5a1b3Sopenharmony_ci - Neither the name of the Xiph.org Foundation nor the names of its 2053a5a1b3Sopenharmony_ci contributors may be used to endorse or promote products derived from 2153a5a1b3Sopenharmony_ci this software without specific prior written permission. 2253a5a1b3Sopenharmony_ci 2353a5a1b3Sopenharmony_ci THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 2453a5a1b3Sopenharmony_ci ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 2553a5a1b3Sopenharmony_ci LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 2653a5a1b3Sopenharmony_ci A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 2753a5a1b3Sopenharmony_ci CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 2853a5a1b3Sopenharmony_ci EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 2953a5a1b3Sopenharmony_ci PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 3053a5a1b3Sopenharmony_ci PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 3153a5a1b3Sopenharmony_ci LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 3253a5a1b3Sopenharmony_ci NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 3353a5a1b3Sopenharmony_ci SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 3453a5a1b3Sopenharmony_ci*/ 3553a5a1b3Sopenharmony_ci 3653a5a1b3Sopenharmony_ci#include "_kiss_fft_guts_tm.h" 3753a5a1b3Sopenharmony_ci 3853a5a1b3Sopenharmony_ci#ifdef TM_ASM 3953a5a1b3Sopenharmony_ci 4053a5a1b3Sopenharmony_ci#include "profile_tm.h" 4153a5a1b3Sopenharmony_ci 4253a5a1b3Sopenharmony_ci#ifdef FIXED_POINT 4353a5a1b3Sopenharmony_ci 4453a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY2 4553a5a1b3Sopenharmony_cistatic void kf_bfly2( 4653a5a1b3Sopenharmony_ci kiss_fft_cpx *Fout, 4753a5a1b3Sopenharmony_ci const int fstride, 4853a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 4953a5a1b3Sopenharmony_ci int m 5053a5a1b3Sopenharmony_ci ) 5153a5a1b3Sopenharmony_ci{ 5253a5a1b3Sopenharmony_ci register int * restrict Fout2; 5353a5a1b3Sopenharmony_ci register int * restrict tw1 = (int*)st->twiddles; 5453a5a1b3Sopenharmony_ci register int i, j; 5553a5a1b3Sopenharmony_ci register int _inv = !st->inverse; 5653a5a1b3Sopenharmony_ci 5753a5a1b3Sopenharmony_ci Fout2 = (int*)Fout + m; 5853a5a1b3Sopenharmony_ci 5953a5a1b3Sopenharmony_ci for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride ) 6053a5a1b3Sopenharmony_ci { register int tw_10, ff_10, f2_10; 6153a5a1b3Sopenharmony_ci 6253a5a1b3Sopenharmony_ci ff_10 = ld32x(Fout, i); 6353a5a1b3Sopenharmony_ci f2_10 = ld32x(Fout2, i); 6453a5a1b3Sopenharmony_ci tw_10 = ld32(tw1); 6553a5a1b3Sopenharmony_ci 6653a5a1b3Sopenharmony_ci if ( _inv ) 6753a5a1b3Sopenharmony_ci { TM_SHR(f2_10, f2_10, 1); 6853a5a1b3Sopenharmony_ci TM_SHR(ff_10, ff_10, 1); 6953a5a1b3Sopenharmony_ci } 7053a5a1b3Sopenharmony_ci 7153a5a1b3Sopenharmony_ci TM_MUL(tw_10, tw_10, f2_10); 7253a5a1b3Sopenharmony_ci TM_SUB(f2_10, ff_10, tw_10); 7353a5a1b3Sopenharmony_ci TM_ADD(ff_10, ff_10, tw_10); 7453a5a1b3Sopenharmony_ci 7553a5a1b3Sopenharmony_ci st32d(j, Fout2, f2_10); 7653a5a1b3Sopenharmony_ci st32d(j, Fout, ff_10); 7753a5a1b3Sopenharmony_ci } 7853a5a1b3Sopenharmony_ci} 7953a5a1b3Sopenharmony_ci 8053a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY4 8153a5a1b3Sopenharmony_cistatic void kf_bfly4( 8253a5a1b3Sopenharmony_ci kiss_fft_cpx *Fout, 8353a5a1b3Sopenharmony_ci const int fstride, 8453a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 8553a5a1b3Sopenharmony_ci const int m 8653a5a1b3Sopenharmony_ci ) 8753a5a1b3Sopenharmony_ci{ 8853a5a1b3Sopenharmony_ci register int * restrict tw1; 8953a5a1b3Sopenharmony_ci register int * restrict tw2; 9053a5a1b3Sopenharmony_ci register int * restrict tw3; 9153a5a1b3Sopenharmony_ci register int * restrict Fout1; 9253a5a1b3Sopenharmony_ci register int * restrict Fout2; 9353a5a1b3Sopenharmony_ci register int * restrict Fout3; 9453a5a1b3Sopenharmony_ci register int i, j; 9553a5a1b3Sopenharmony_ci register int fstride2, fstride3; 9653a5a1b3Sopenharmony_ci register int _inv = !st->inverse; 9753a5a1b3Sopenharmony_ci 9853a5a1b3Sopenharmony_ci tw3 = tw2 = tw1 = (int*)st->twiddles; 9953a5a1b3Sopenharmony_ci fstride2 = fstride << 1; 10053a5a1b3Sopenharmony_ci fstride3 = fstride * 3; 10153a5a1b3Sopenharmony_ci 10253a5a1b3Sopenharmony_ci Fout1 = (int*)Fout + m; 10353a5a1b3Sopenharmony_ci Fout2 = (int*)Fout + (m << 1); 10453a5a1b3Sopenharmony_ci Fout3 = (int*)Fout + (m * 3); 10553a5a1b3Sopenharmony_ci 10653a5a1b3Sopenharmony_ci 10753a5a1b3Sopenharmony_ci for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3 ) 10853a5a1b3Sopenharmony_ci { register int sc0, sc1, sc2, sc3, sc4, sc5; 10953a5a1b3Sopenharmony_ci register int ff0; 11053a5a1b3Sopenharmony_ci 11153a5a1b3Sopenharmony_ci sc0 = ld32x(Fout1,i); 11253a5a1b3Sopenharmony_ci sc3 = ld32(tw1); 11353a5a1b3Sopenharmony_ci sc1 = ld32x(Fout2, i); 11453a5a1b3Sopenharmony_ci sc4 = ld32(tw2); 11553a5a1b3Sopenharmony_ci sc2 = ld32x(Fout3, i); 11653a5a1b3Sopenharmony_ci sc5 = ld32(tw3); 11753a5a1b3Sopenharmony_ci ff0 = ld32x(Fout,i); 11853a5a1b3Sopenharmony_ci 11953a5a1b3Sopenharmony_ci if ( _inv ) 12053a5a1b3Sopenharmony_ci { 12153a5a1b3Sopenharmony_ci TM_ADD(sc0, sc0, 0x00020002); 12253a5a1b3Sopenharmony_ci TM_ADD(sc1, sc1, 0x00020002); 12353a5a1b3Sopenharmony_ci TM_ADD(sc2, sc2, 0x00020002); 12453a5a1b3Sopenharmony_ci TM_ADD(ff0, ff0, 0x00020002); 12553a5a1b3Sopenharmony_ci TM_SHR(sc0, sc0, 2); 12653a5a1b3Sopenharmony_ci TM_SHR(sc1, sc1, 2); 12753a5a1b3Sopenharmony_ci TM_SHR(sc2, sc2, 2); 12853a5a1b3Sopenharmony_ci TM_SHR(ff0, ff0, 2); 12953a5a1b3Sopenharmony_ci } 13053a5a1b3Sopenharmony_ci 13153a5a1b3Sopenharmony_ci TM_MUL(sc0, sc0, sc3); 13253a5a1b3Sopenharmony_ci TM_MUL(sc1, sc1, sc4); 13353a5a1b3Sopenharmony_ci TM_MUL(sc2, sc2, sc5); 13453a5a1b3Sopenharmony_ci TM_SUB(sc5, ff0, sc1); 13553a5a1b3Sopenharmony_ci TM_ADD(ff0, ff0, sc1); 13653a5a1b3Sopenharmony_ci TM_ADD(sc3, sc0, sc2); 13753a5a1b3Sopenharmony_ci TM_SUB(sc4, sc0, sc2); 13853a5a1b3Sopenharmony_ci TM_SUB(sc1, ff0, sc3); 13953a5a1b3Sopenharmony_ci TM_ADD(ff0, ff0, sc3); 14053a5a1b3Sopenharmony_ci 14153a5a1b3Sopenharmony_ci st32d(j, Fout2, sc1); 14253a5a1b3Sopenharmony_ci st32d(j, Fout, ff0); 14353a5a1b3Sopenharmony_ci 14453a5a1b3Sopenharmony_ci sc5 = funshift2(sc5, sc5); 14553a5a1b3Sopenharmony_ci 14653a5a1b3Sopenharmony_ci if ( _inv ) 14753a5a1b3Sopenharmony_ci { TM_ADD(ff0, sc5, sc4); 14853a5a1b3Sopenharmony_ci TM_SUB(sc1, sc5, sc4); 14953a5a1b3Sopenharmony_ci } else 15053a5a1b3Sopenharmony_ci { TM_ADD(sc1, sc5, sc4); 15153a5a1b3Sopenharmony_ci TM_SUB(ff0, sc5, sc4); 15253a5a1b3Sopenharmony_ci } 15353a5a1b3Sopenharmony_ci 15453a5a1b3Sopenharmony_ci sc0 = funshift2(sc1, ff0); 15553a5a1b3Sopenharmony_ci sc2 = funshift2(ff0, sc1); 15653a5a1b3Sopenharmony_ci 15753a5a1b3Sopenharmony_ci st32d(j, Fout1, sc0); 15853a5a1b3Sopenharmony_ci st32d(j, Fout3, sc2); 15953a5a1b3Sopenharmony_ci } 16053a5a1b3Sopenharmony_ci} 16153a5a1b3Sopenharmony_ci 16253a5a1b3Sopenharmony_ci 16353a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY3 16453a5a1b3Sopenharmony_cistatic void kf_bfly3( 16553a5a1b3Sopenharmony_ci kiss_fft_cpx *Fout, 16653a5a1b3Sopenharmony_ci const int fstride, 16753a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 16853a5a1b3Sopenharmony_ci int m 16953a5a1b3Sopenharmony_ci ) 17053a5a1b3Sopenharmony_ci{ 17153a5a1b3Sopenharmony_ci register int * restrict tw1; 17253a5a1b3Sopenharmony_ci register int * restrict tw2; 17353a5a1b3Sopenharmony_ci register int * restrict Fout1; 17453a5a1b3Sopenharmony_ci register int * restrict Fout2; 17553a5a1b3Sopenharmony_ci register int epi; 17653a5a1b3Sopenharmony_ci register int i, j; 17753a5a1b3Sopenharmony_ci register int fstride2; 17853a5a1b3Sopenharmony_ci register int _inv = !st->inverse; 17953a5a1b3Sopenharmony_ci 18053a5a1b3Sopenharmony_ci tw1 = tw2 = (int*)st->twiddles; 18153a5a1b3Sopenharmony_ci Fout1 = (int*)Fout + m; 18253a5a1b3Sopenharmony_ci Fout2 = (int*)Fout + (m << 1); 18353a5a1b3Sopenharmony_ci epi = tw1[fstride*m]; 18453a5a1b3Sopenharmony_ci epi = pack16lsb(epi,epi); 18553a5a1b3Sopenharmony_ci fstride2 = fstride << 1; 18653a5a1b3Sopenharmony_ci 18753a5a1b3Sopenharmony_ci for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2 ) 18853a5a1b3Sopenharmony_ci { register int sc0, sc1, sc2, sc3, sc4, sc5; 18953a5a1b3Sopenharmony_ci register int ff0; 19053a5a1b3Sopenharmony_ci 19153a5a1b3Sopenharmony_ci sc1 = ld32x(Fout1,i); 19253a5a1b3Sopenharmony_ci sc2 = ld32x(Fout2,i); 19353a5a1b3Sopenharmony_ci sc3 = ld32(tw1); 19453a5a1b3Sopenharmony_ci sc4 = ld32(tw2); 19553a5a1b3Sopenharmony_ci ff0 = ld32x(Fout,i); 19653a5a1b3Sopenharmony_ci 19753a5a1b3Sopenharmony_ci if ( _inv ) 19853a5a1b3Sopenharmony_ci { 19953a5a1b3Sopenharmony_ci TM_DIV(sc1, sc1, 3); 20053a5a1b3Sopenharmony_ci TM_DIV(sc2, sc2, 3); 20153a5a1b3Sopenharmony_ci TM_DIV(ff0, ff0, 3); 20253a5a1b3Sopenharmony_ci } 20353a5a1b3Sopenharmony_ci 20453a5a1b3Sopenharmony_ci TM_MUL(sc1, sc1, sc3); 20553a5a1b3Sopenharmony_ci TM_MUL(sc2, sc2, sc4); 20653a5a1b3Sopenharmony_ci TM_ADD(sc3, sc1, sc2); 20753a5a1b3Sopenharmony_ci TM_SUB(sc0, sc1, sc2); 20853a5a1b3Sopenharmony_ci TM_SHR(sc4, sc3, 1); 20953a5a1b3Sopenharmony_ci TM_SUB(sc1, ff0, sc4); 21053a5a1b3Sopenharmony_ci 21153a5a1b3Sopenharmony_ci sc0 = dspidualmul(sc0, epi); 21253a5a1b3Sopenharmony_ci sc0 = funshift2(sc0, sc0); 21353a5a1b3Sopenharmony_ci 21453a5a1b3Sopenharmony_ci TM_ADD(ff0, ff0, sc3); 21553a5a1b3Sopenharmony_ci TM_ADD(sc4, sc1, sc0); 21653a5a1b3Sopenharmony_ci TM_SUB(sc5, sc1, sc0); 21753a5a1b3Sopenharmony_ci 21853a5a1b3Sopenharmony_ci sc1 = funshift2(sc4, sc5); 21953a5a1b3Sopenharmony_ci sc2 = funshift2(sc5, sc4); 22053a5a1b3Sopenharmony_ci sc2 = funshift2(sc2, sc2); 22153a5a1b3Sopenharmony_ci 22253a5a1b3Sopenharmony_ci st32d(j, Fout1, sc1); 22353a5a1b3Sopenharmony_ci st32d(j, Fout, ff0); 22453a5a1b3Sopenharmony_ci st32d(j, Fout2, sc2); 22553a5a1b3Sopenharmony_ci } 22653a5a1b3Sopenharmony_ci} 22753a5a1b3Sopenharmony_ci 22853a5a1b3Sopenharmony_ci 22953a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY5 23053a5a1b3Sopenharmony_cistatic void kf_bfly5( 23153a5a1b3Sopenharmony_ci kiss_fft_cpx *Fout, 23253a5a1b3Sopenharmony_ci const int fstride, 23353a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 23453a5a1b3Sopenharmony_ci int m 23553a5a1b3Sopenharmony_ci ) 23653a5a1b3Sopenharmony_ci{ 23753a5a1b3Sopenharmony_ci register int * restrict tw1; 23853a5a1b3Sopenharmony_ci register int * restrict tw2; 23953a5a1b3Sopenharmony_ci register int * restrict tw3; 24053a5a1b3Sopenharmony_ci register int * restrict tw4; 24153a5a1b3Sopenharmony_ci register int * restrict Fout1; 24253a5a1b3Sopenharmony_ci register int * restrict Fout2; 24353a5a1b3Sopenharmony_ci register int * restrict Fout3; 24453a5a1b3Sopenharmony_ci register int * restrict Fout4; 24553a5a1b3Sopenharmony_ci register int fstride2, fstride3, fstride4; 24653a5a1b3Sopenharmony_ci register int i, j; 24753a5a1b3Sopenharmony_ci register int yab_msb, yab_lsb, yba_msb, yba_lsb; 24853a5a1b3Sopenharmony_ci register int _inv = !st->inverse; 24953a5a1b3Sopenharmony_ci 25053a5a1b3Sopenharmony_ci 25153a5a1b3Sopenharmony_ci Fout1=(int*)Fout+m; 25253a5a1b3Sopenharmony_ci Fout2=(int*)Fout+(m<<1); 25353a5a1b3Sopenharmony_ci Fout3=(int*)Fout+(3 *m); 25453a5a1b3Sopenharmony_ci Fout4=(int*)Fout+(m<<2); 25553a5a1b3Sopenharmony_ci 25653a5a1b3Sopenharmony_ci tw1 = tw2 = tw3 = tw4 = (int*)st->twiddles; 25753a5a1b3Sopenharmony_ci 25853a5a1b3Sopenharmony_ci i = tw1[fstride*m]; 25953a5a1b3Sopenharmony_ci yab_lsb = tw1[fstride*(m<<1)]; 26053a5a1b3Sopenharmony_ci yab_msb = pack16msb(i, yab_lsb); 26153a5a1b3Sopenharmony_ci yab_lsb = pack16lsb(i, yab_lsb); 26253a5a1b3Sopenharmony_ci yba_msb = funshift2(-sex16(yab_msb), yab_msb); 26353a5a1b3Sopenharmony_ci yba_lsb = funshift2(yab_lsb, yab_lsb); 26453a5a1b3Sopenharmony_ci 26553a5a1b3Sopenharmony_ci fstride2 = fstride << 1; 26653a5a1b3Sopenharmony_ci fstride3 = fstride * 3; 26753a5a1b3Sopenharmony_ci fstride4 = fstride << 2; 26853a5a1b3Sopenharmony_ci 26953a5a1b3Sopenharmony_ci for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3,tw4+=fstride4 ) 27053a5a1b3Sopenharmony_ci { register int sc0, sc1, sc2, sc3, sc4, sc5, sc6; 27153a5a1b3Sopenharmony_ci register int sc7, sc8, sc9, sc10, sc11, sc12; 27253a5a1b3Sopenharmony_ci register int ff0, sc78_msb, sc78_lsb, sc90_msb, sc90_lsb; 27353a5a1b3Sopenharmony_ci 27453a5a1b3Sopenharmony_ci sc0 = ld32x(Fout,i); 27553a5a1b3Sopenharmony_ci sc1 = ld32x(Fout1,i); 27653a5a1b3Sopenharmony_ci sc2 = ld32x(Fout2,i); 27753a5a1b3Sopenharmony_ci sc3 = ld32x(Fout3,i); 27853a5a1b3Sopenharmony_ci sc4 = ld32x(Fout4,i); 27953a5a1b3Sopenharmony_ci sc5 = ld32(tw1); 28053a5a1b3Sopenharmony_ci sc6 = ld32(tw2); 28153a5a1b3Sopenharmony_ci sc7 = ld32(tw3); 28253a5a1b3Sopenharmony_ci sc8 = ld32(tw4); 28353a5a1b3Sopenharmony_ci 28453a5a1b3Sopenharmony_ci if ( _inv ) 28553a5a1b3Sopenharmony_ci { 28653a5a1b3Sopenharmony_ci TM_DIV(sc0, sc0, 5); 28753a5a1b3Sopenharmony_ci TM_DIV(sc1, sc1, 5); 28853a5a1b3Sopenharmony_ci TM_DIV(sc2, sc2, 5); 28953a5a1b3Sopenharmony_ci TM_DIV(sc3, sc3, 5); 29053a5a1b3Sopenharmony_ci TM_DIV(sc4, sc4, 5); 29153a5a1b3Sopenharmony_ci } 29253a5a1b3Sopenharmony_ci 29353a5a1b3Sopenharmony_ci ff0 = sc0; 29453a5a1b3Sopenharmony_ci 29553a5a1b3Sopenharmony_ci TM_MUL(sc1, sc1, sc5); 29653a5a1b3Sopenharmony_ci TM_MUL(sc2, sc2, sc6); 29753a5a1b3Sopenharmony_ci TM_MUL(sc3, sc3, sc7); 29853a5a1b3Sopenharmony_ci TM_MUL(sc4, sc4, sc8); 29953a5a1b3Sopenharmony_ci TM_ADD(sc7, sc1, sc4); 30053a5a1b3Sopenharmony_ci TM_SUB(sc10,sc1, sc4); 30153a5a1b3Sopenharmony_ci TM_ADD(sc8, sc2, sc3); 30253a5a1b3Sopenharmony_ci TM_SUB(sc9, sc2, sc3); 30353a5a1b3Sopenharmony_ci 30453a5a1b3Sopenharmony_ci TM_ADD(ff0, ff0, sc7); 30553a5a1b3Sopenharmony_ci TM_ADD(ff0, ff0, sc8); 30653a5a1b3Sopenharmony_ci st32d(j, Fout, ff0); 30753a5a1b3Sopenharmony_ci 30853a5a1b3Sopenharmony_ci sc78_msb = pack16msb(sc7,sc8); 30953a5a1b3Sopenharmony_ci sc78_lsb = pack16lsb(sc7,sc8); 31053a5a1b3Sopenharmony_ci sc90_msb = pack16msb(sc10,sc9); 31153a5a1b3Sopenharmony_ci sc90_lsb = pack16lsb(sc10,sc9); 31253a5a1b3Sopenharmony_ci 31353a5a1b3Sopenharmony_ci sc5 = pack16lsb( sround(ifir16(sc78_msb,yab_lsb)), sround(ifir16(sc78_lsb,yab_lsb))); 31453a5a1b3Sopenharmony_ci sc6 = pack16lsb(-sround(ifir16(sc90_lsb,yab_msb)), sround(ifir16(sc90_msb,yab_msb))); 31553a5a1b3Sopenharmony_ci 31653a5a1b3Sopenharmony_ci TM_ADD(sc5, sc5, sc0); 31753a5a1b3Sopenharmony_ci TM_SUB(sc1, sc5, sc6); 31853a5a1b3Sopenharmony_ci TM_ADD(sc4, sc5, sc6); 31953a5a1b3Sopenharmony_ci st32d(j, Fout1, sc1); 32053a5a1b3Sopenharmony_ci st32d(j, Fout4, sc4); 32153a5a1b3Sopenharmony_ci 32253a5a1b3Sopenharmony_ci sc11 = pack16lsb( sround(ifir16(sc78_msb,yba_lsb)), sround(ifir16(sc78_lsb,yba_lsb))); 32353a5a1b3Sopenharmony_ci sc12 = pack16lsb(-sround(ifir16(sc90_lsb,yba_msb)), sround(ifir16(sc90_msb,yba_msb))); 32453a5a1b3Sopenharmony_ci 32553a5a1b3Sopenharmony_ci TM_ADD(sc11, sc11, sc0); 32653a5a1b3Sopenharmony_ci TM_ADD(sc2, sc11, sc12); 32753a5a1b3Sopenharmony_ci TM_SUB(sc3, sc11, sc12); 32853a5a1b3Sopenharmony_ci st32d(j, Fout2, sc2); 32953a5a1b3Sopenharmony_ci st32d(j, Fout3, sc3); 33053a5a1b3Sopenharmony_ci 33153a5a1b3Sopenharmony_ci } 33253a5a1b3Sopenharmony_ci} 33353a5a1b3Sopenharmony_ci 33453a5a1b3Sopenharmony_ci 33553a5a1b3Sopenharmony_ci#define OVERRIDE_KF_BFLY_GENERIC 33653a5a1b3Sopenharmony_cistatic void kf_bfly_generic( 33753a5a1b3Sopenharmony_ci kiss_fft_cpx * restrict Fout, 33853a5a1b3Sopenharmony_ci const size_t fstride, 33953a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 34053a5a1b3Sopenharmony_ci int m, 34153a5a1b3Sopenharmony_ci int p 34253a5a1b3Sopenharmony_ci ) 34353a5a1b3Sopenharmony_ci{ 34453a5a1b3Sopenharmony_ci register int _inv = !st->inverse; 34553a5a1b3Sopenharmony_ci register int i, j, k, l; 34653a5a1b3Sopenharmony_ci register int * restrict twiddles = (int*)st->twiddles; 34753a5a1b3Sopenharmony_ci register int Norig = st->nfft; 34853a5a1b3Sopenharmony_ci 34953a5a1b3Sopenharmony_ci CHECKBUF(scratchbuf,nscratchbuf,p); 35053a5a1b3Sopenharmony_ci 35153a5a1b3Sopenharmony_ci for ( i=0; i<m; ++i ) 35253a5a1b3Sopenharmony_ci { register int sc10; 35353a5a1b3Sopenharmony_ci 35453a5a1b3Sopenharmony_ci for ( j=0,k=i ; j<p ; ++j,k+=m ) 35553a5a1b3Sopenharmony_ci { register int f10; 35653a5a1b3Sopenharmony_ci 35753a5a1b3Sopenharmony_ci f10 = ld32x(Fout,k); 35853a5a1b3Sopenharmony_ci 35953a5a1b3Sopenharmony_ci if ( _inv ) 36053a5a1b3Sopenharmony_ci { TM_DIV(f10, f10, p); 36153a5a1b3Sopenharmony_ci } 36253a5a1b3Sopenharmony_ci 36353a5a1b3Sopenharmony_ci st32d(j<<2, scratchbuf, f10); 36453a5a1b3Sopenharmony_ci } 36553a5a1b3Sopenharmony_ci 36653a5a1b3Sopenharmony_ci for ( j=0,k=i,sc10=ld32(scratchbuf) ; j<p ; ++j,k+=m ) 36753a5a1b3Sopenharmony_ci { 36853a5a1b3Sopenharmony_ci register int twidx = 0; 36953a5a1b3Sopenharmony_ci register int f10; 37053a5a1b3Sopenharmony_ci 37153a5a1b3Sopenharmony_ci for ( l=1,f10 = sc10 ; l<p ; ++l ) 37253a5a1b3Sopenharmony_ci { register int tw, sc; 37353a5a1b3Sopenharmony_ci 37453a5a1b3Sopenharmony_ci twidx += fstride * k; 37553a5a1b3Sopenharmony_ci if ( twidx>=Norig ) 37653a5a1b3Sopenharmony_ci { twidx -= Norig; 37753a5a1b3Sopenharmony_ci } 37853a5a1b3Sopenharmony_ci 37953a5a1b3Sopenharmony_ci sc = ld32x(scratchbuf,l); 38053a5a1b3Sopenharmony_ci tw = ld32x(twiddles,twidx); 38153a5a1b3Sopenharmony_ci 38253a5a1b3Sopenharmony_ci TM_MUL(sc, sc, tw); 38353a5a1b3Sopenharmony_ci TM_ADD(f10, f10, sc); 38453a5a1b3Sopenharmony_ci } 38553a5a1b3Sopenharmony_ci st32d(k<<2, Fout, f10); 38653a5a1b3Sopenharmony_ci } 38753a5a1b3Sopenharmony_ci } 38853a5a1b3Sopenharmony_ci} 38953a5a1b3Sopenharmony_ci 39053a5a1b3Sopenharmony_ci#else 39153a5a1b3Sopenharmony_ci 39253a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY2 39353a5a1b3Sopenharmony_cistatic void kf_bfly2( 39453a5a1b3Sopenharmony_ci kiss_fft_cpx * Fout, 39553a5a1b3Sopenharmony_ci const size_t fstride, 39653a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 39753a5a1b3Sopenharmony_ci int m 39853a5a1b3Sopenharmony_ci ) 39953a5a1b3Sopenharmony_ci{ 40053a5a1b3Sopenharmony_ci register kiss_fft_cpx * restrict Fout2; 40153a5a1b3Sopenharmony_ci register kiss_fft_cpx * restrict tw1 = st->twiddles; 40253a5a1b3Sopenharmony_ci 40353a5a1b3Sopenharmony_ci Fout2 = Fout + m; 40453a5a1b3Sopenharmony_ci 40553a5a1b3Sopenharmony_ci do 40653a5a1b3Sopenharmony_ci { 40753a5a1b3Sopenharmony_ci register kiss_fft_cpx _fout2, _fout, t; 40853a5a1b3Sopenharmony_ci 40953a5a1b3Sopenharmony_ci _fout2 = *Fout2; 41053a5a1b3Sopenharmony_ci _fout = *Fout; 41153a5a1b3Sopenharmony_ci 41253a5a1b3Sopenharmony_ci C_MUL ( t, _fout2, *tw1); 41353a5a1b3Sopenharmony_ci C_SUB (_fout2, _fout, t); 41453a5a1b3Sopenharmony_ci C_ADD (_fout, _fout, t); 41553a5a1b3Sopenharmony_ci 41653a5a1b3Sopenharmony_ci *Fout2 = _fout2; 41753a5a1b3Sopenharmony_ci *Fout = _fout; 41853a5a1b3Sopenharmony_ci 41953a5a1b3Sopenharmony_ci tw1 += fstride; 42053a5a1b3Sopenharmony_ci ++Fout2; 42153a5a1b3Sopenharmony_ci ++Fout; 42253a5a1b3Sopenharmony_ci 42353a5a1b3Sopenharmony_ci } while ( --m ); 42453a5a1b3Sopenharmony_ci} 42553a5a1b3Sopenharmony_ci 42653a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY4 42753a5a1b3Sopenharmony_cistatic void kf_bfly4( 42853a5a1b3Sopenharmony_ci kiss_fft_cpx * Fout, 42953a5a1b3Sopenharmony_ci const int fstride, 43053a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 43153a5a1b3Sopenharmony_ci int m 43253a5a1b3Sopenharmony_ci ) 43353a5a1b3Sopenharmony_ci{ 43453a5a1b3Sopenharmony_ci register kiss_fft_cpx * restrict tw1,* restrict tw2,* restrict tw3; 43553a5a1b3Sopenharmony_ci register kiss_fft_cpx * restrict Fout1, * restrict Fout2, * restrict Fout3; 43653a5a1b3Sopenharmony_ci register int _inv = !st->inverse; 43753a5a1b3Sopenharmony_ci 43853a5a1b3Sopenharmony_ci tw3 = tw2 = tw1 = st->twiddles; 43953a5a1b3Sopenharmony_ci 44053a5a1b3Sopenharmony_ci Fout1 = Fout + m; 44153a5a1b3Sopenharmony_ci Fout2 = Fout + (m << 1); 44253a5a1b3Sopenharmony_ci Fout3 = Fout + (m * 3); 44353a5a1b3Sopenharmony_ci 44453a5a1b3Sopenharmony_ci do { 44553a5a1b3Sopenharmony_ci 44653a5a1b3Sopenharmony_ci register kiss_fft_cpx _fout; 44753a5a1b3Sopenharmony_ci register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5; 44853a5a1b3Sopenharmony_ci 44953a5a1b3Sopenharmony_ci _fout = *Fout; 45053a5a1b3Sopenharmony_ci 45153a5a1b3Sopenharmony_ci C_MUL( sc0,*Fout1, *tw1); 45253a5a1b3Sopenharmony_ci C_MUL( sc1,*Fout2, *tw2); 45353a5a1b3Sopenharmony_ci C_MUL( sc2,*Fout3, *tw3); 45453a5a1b3Sopenharmony_ci C_SUB( sc5, _fout, sc1); 45553a5a1b3Sopenharmony_ci C_ADD( _fout, _fout, sc1); 45653a5a1b3Sopenharmony_ci C_ADD( sc3, sc0, sc2); 45753a5a1b3Sopenharmony_ci C_SUB( sc4, sc0, sc2); 45853a5a1b3Sopenharmony_ci C_SUB(*Fout2, _fout, sc3); 45953a5a1b3Sopenharmony_ci C_ADD( *Fout, _fout, sc3); 46053a5a1b3Sopenharmony_ci 46153a5a1b3Sopenharmony_ci tw1 += fstride; 46253a5a1b3Sopenharmony_ci tw2 += (fstride << 1); 46353a5a1b3Sopenharmony_ci tw3 += (fstride * 3); 46453a5a1b3Sopenharmony_ci 46553a5a1b3Sopenharmony_ci if ( _inv ) 46653a5a1b3Sopenharmony_ci { 46753a5a1b3Sopenharmony_ci Fout1->r = sc5.r + sc4.i; 46853a5a1b3Sopenharmony_ci Fout1->i = sc5.i - sc4.r; 46953a5a1b3Sopenharmony_ci Fout3->r = sc5.r - sc4.i; 47053a5a1b3Sopenharmony_ci Fout3->i = sc5.i + sc4.r; 47153a5a1b3Sopenharmony_ci } 47253a5a1b3Sopenharmony_ci else 47353a5a1b3Sopenharmony_ci { Fout1->r = sc5.r - sc4.i; 47453a5a1b3Sopenharmony_ci Fout1->i = sc5.i + sc4.r; 47553a5a1b3Sopenharmony_ci Fout3->r = sc5.r + sc4.i; 47653a5a1b3Sopenharmony_ci Fout3->i = sc5.i - sc4.r; 47753a5a1b3Sopenharmony_ci } 47853a5a1b3Sopenharmony_ci 47953a5a1b3Sopenharmony_ci 48053a5a1b3Sopenharmony_ci ++Fout; ++Fout1; ++Fout2; ++Fout3; 48153a5a1b3Sopenharmony_ci 48253a5a1b3Sopenharmony_ci } while(--m); 48353a5a1b3Sopenharmony_ci} 48453a5a1b3Sopenharmony_ci 48553a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY3 48653a5a1b3Sopenharmony_cistatic void kf_bfly3( 48753a5a1b3Sopenharmony_ci kiss_fft_cpx * Fout, 48853a5a1b3Sopenharmony_ci const int fstride, 48953a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 49053a5a1b3Sopenharmony_ci int m 49153a5a1b3Sopenharmony_ci ) 49253a5a1b3Sopenharmony_ci{ 49353a5a1b3Sopenharmony_ci register kiss_fft_cpx * restrict Fout1, * restrict Fout2; 49453a5a1b3Sopenharmony_ci register kiss_fft_cpx * restrict tw1,* restrict tw2; 49553a5a1b3Sopenharmony_ci register float epi; 49653a5a1b3Sopenharmony_ci 49753a5a1b3Sopenharmony_ci tw1 = tw2 = st->twiddles; 49853a5a1b3Sopenharmony_ci epi = st->twiddles[fstride*m].i; 49953a5a1b3Sopenharmony_ci Fout1 = Fout + m; 50053a5a1b3Sopenharmony_ci Fout2 = Fout + (m << 1); 50153a5a1b3Sopenharmony_ci 50253a5a1b3Sopenharmony_ci do { 50353a5a1b3Sopenharmony_ci 50453a5a1b3Sopenharmony_ci register kiss_fft_cpx _fout; 50553a5a1b3Sopenharmony_ci register kiss_fft_cpx sc0, sc1, sc2, sc3; 50653a5a1b3Sopenharmony_ci 50753a5a1b3Sopenharmony_ci _fout = *Fout; 50853a5a1b3Sopenharmony_ci 50953a5a1b3Sopenharmony_ci C_MUL( sc1, *Fout1, *tw1); 51053a5a1b3Sopenharmony_ci C_MUL( sc2, *Fout2, *tw2); 51153a5a1b3Sopenharmony_ci C_ADD( sc3, sc1, sc2); 51253a5a1b3Sopenharmony_ci C_SUB( sc0, sc1, sc2); 51353a5a1b3Sopenharmony_ci tw1 += fstride; 51453a5a1b3Sopenharmony_ci tw2 += (fstride << 1); 51553a5a1b3Sopenharmony_ci 51653a5a1b3Sopenharmony_ci sc1.r = _fout.r - HALF_OF(sc3.r); 51753a5a1b3Sopenharmony_ci sc1.i = _fout.i - HALF_OF(sc3.i); 51853a5a1b3Sopenharmony_ci 51953a5a1b3Sopenharmony_ci C_MULBYSCALAR(sc0, epi); 52053a5a1b3Sopenharmony_ci C_ADD(*Fout, _fout, sc3); 52153a5a1b3Sopenharmony_ci 52253a5a1b3Sopenharmony_ci Fout2->r = sc1.r + sc0.i; 52353a5a1b3Sopenharmony_ci Fout2->i = sc1.i - sc0.r; 52453a5a1b3Sopenharmony_ci 52553a5a1b3Sopenharmony_ci Fout1->r = sc1.i - sc0.i; 52653a5a1b3Sopenharmony_ci Fout1->i = sc1.r + sc0.r; 52753a5a1b3Sopenharmony_ci 52853a5a1b3Sopenharmony_ci ++Fout; ++Fout1; ++Fout2; 52953a5a1b3Sopenharmony_ci 53053a5a1b3Sopenharmony_ci } while(--m); 53153a5a1b3Sopenharmony_ci} 53253a5a1b3Sopenharmony_ci 53353a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY5 53453a5a1b3Sopenharmony_cistatic void kf_bfly5( 53553a5a1b3Sopenharmony_ci kiss_fft_cpx * Fout, 53653a5a1b3Sopenharmony_ci const size_t fstride, 53753a5a1b3Sopenharmony_ci const kiss_fft_cfg st, 53853a5a1b3Sopenharmony_ci int m 53953a5a1b3Sopenharmony_ci ) 54053a5a1b3Sopenharmony_ci{ 54153a5a1b3Sopenharmony_ci register kiss_fft_cpx * restrict Fout1,* restrict Fout2,* restrict Fout3,* restrict Fout4; 54253a5a1b3Sopenharmony_ci register int u; 54353a5a1b3Sopenharmony_ci register kiss_fft_cpx *tw; 54453a5a1b3Sopenharmony_ci register float yar, yai, ybr, ybi; 54553a5a1b3Sopenharmony_ci 54653a5a1b3Sopenharmony_ci Fout1=Fout+m; 54753a5a1b3Sopenharmony_ci Fout2=Fout+(m<<1); 54853a5a1b3Sopenharmony_ci Fout3=Fout+(m*3); 54953a5a1b3Sopenharmony_ci Fout4=Fout+(m<<2); 55053a5a1b3Sopenharmony_ci 55153a5a1b3Sopenharmony_ci tw = st->twiddles; 55253a5a1b3Sopenharmony_ci yar = tw[fstride*m].r; 55353a5a1b3Sopenharmony_ci yai = tw[fstride*m].i; 55453a5a1b3Sopenharmony_ci ybr = tw[fstride*2*m].r; 55553a5a1b3Sopenharmony_ci ybi = tw[fstride*2*m].i; 55653a5a1b3Sopenharmony_ci 55753a5a1b3Sopenharmony_ci for ( u=0; u<m; ++u ) 55853a5a1b3Sopenharmony_ci { 55953a5a1b3Sopenharmony_ci register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc10, sc11, sc12; 56053a5a1b3Sopenharmony_ci 56153a5a1b3Sopenharmony_ci sc0 = *Fout; 56253a5a1b3Sopenharmony_ci 56353a5a1b3Sopenharmony_ci C_MUL( sc1,*Fout1, tw[u*fstride]); 56453a5a1b3Sopenharmony_ci C_MUL( sc2,*Fout2, tw[2*u*fstride]); 56553a5a1b3Sopenharmony_ci C_MUL( sc3,*Fout3, tw[3*u*fstride]); 56653a5a1b3Sopenharmony_ci C_MUL( sc4,*Fout4, tw[4*u*fstride]); 56753a5a1b3Sopenharmony_ci 56853a5a1b3Sopenharmony_ci C_ADD( sc7, sc1, sc4); 56953a5a1b3Sopenharmony_ci C_SUB( sc10, sc1, sc4); 57053a5a1b3Sopenharmony_ci C_ADD( sc8, sc2, sc3); 57153a5a1b3Sopenharmony_ci C_SUB( sc9, sc2, sc3); 57253a5a1b3Sopenharmony_ci 57353a5a1b3Sopenharmony_ci Fout->r = sc0.r + sc7.r + sc8.r; 57453a5a1b3Sopenharmony_ci Fout->i = sc0.i + sc7.i + sc8.i; 57553a5a1b3Sopenharmony_ci 57653a5a1b3Sopenharmony_ci sc5.r = sc0.r + S_MUL(sc7.r,yar) + S_MUL(sc8.r,ybr); 57753a5a1b3Sopenharmony_ci sc5.i = sc0.i + S_MUL(sc7.i,yar) + S_MUL(sc8.i,ybr); 57853a5a1b3Sopenharmony_ci 57953a5a1b3Sopenharmony_ci sc6.r = S_MUL(sc10.i,yai) + S_MUL(sc9.i,ybi); 58053a5a1b3Sopenharmony_ci sc6.i = -S_MUL(sc10.r,yai) - S_MUL(sc9.r,ybi); 58153a5a1b3Sopenharmony_ci 58253a5a1b3Sopenharmony_ci C_SUB(*Fout1,sc5,sc6); 58353a5a1b3Sopenharmony_ci C_ADD(*Fout4,sc5,sc6); 58453a5a1b3Sopenharmony_ci 58553a5a1b3Sopenharmony_ci sc11.r = sc0.r + S_MUL(sc7.r,ybr) + S_MUL(sc8.r,yar); 58653a5a1b3Sopenharmony_ci sc11.i = sc0.i + S_MUL(sc7.i,ybr) + S_MUL(sc8.i,yar); 58753a5a1b3Sopenharmony_ci sc12.r = - S_MUL(sc10.i,ybi) + S_MUL(sc9.i,yai); 58853a5a1b3Sopenharmony_ci sc12.i = S_MUL(sc10.r,ybi) - S_MUL(sc9.r,yai); 58953a5a1b3Sopenharmony_ci C_ADD(*Fout2,sc11,sc12); 59053a5a1b3Sopenharmony_ci C_SUB(*Fout3,sc11,sc12); 59153a5a1b3Sopenharmony_ci 59253a5a1b3Sopenharmony_ci ++Fout1; ++Fout2; ++Fout3; ++Fout4; 59353a5a1b3Sopenharmony_ci } 59453a5a1b3Sopenharmony_ci} 59553a5a1b3Sopenharmony_ci 59653a5a1b3Sopenharmony_ci 59753a5a1b3Sopenharmony_ci#endif 59853a5a1b3Sopenharmony_ci 59953a5a1b3Sopenharmony_ci#endif 600