153a5a1b3Sopenharmony_ci/* Copyright (C) 2007 Hong Zhiqian */
253a5a1b3Sopenharmony_ci/**
353a5a1b3Sopenharmony_ci   @file kiss_fft_tm.h
453a5a1b3Sopenharmony_ci   @author Hong Zhiqian
553a5a1b3Sopenharmony_ci   @brief Various compatibility routines for Speex (TriMedia version)
653a5a1b3Sopenharmony_ci*/
753a5a1b3Sopenharmony_ci/*
853a5a1b3Sopenharmony_ci   Redistribution and use in source and binary forms, with or without
953a5a1b3Sopenharmony_ci   modification, are permitted provided that the following conditions
1053a5a1b3Sopenharmony_ci   are met:
1153a5a1b3Sopenharmony_ci
1253a5a1b3Sopenharmony_ci   - Redistributions of source code must retain the above copyright
1353a5a1b3Sopenharmony_ci   notice, this list of conditions and the following disclaimer.
1453a5a1b3Sopenharmony_ci
1553a5a1b3Sopenharmony_ci   - Redistributions in binary form must reproduce the above copyright
1653a5a1b3Sopenharmony_ci   notice, this list of conditions and the following disclaimer in the
1753a5a1b3Sopenharmony_ci   documentation and/or other materials provided with the distribution.
1853a5a1b3Sopenharmony_ci
1953a5a1b3Sopenharmony_ci   - Neither the name of the Xiph.org Foundation nor the names of its
2053a5a1b3Sopenharmony_ci   contributors may be used to endorse or promote products derived from
2153a5a1b3Sopenharmony_ci   this software without specific prior written permission.
2253a5a1b3Sopenharmony_ci
2353a5a1b3Sopenharmony_ci   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
2453a5a1b3Sopenharmony_ci   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
2553a5a1b3Sopenharmony_ci   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
2653a5a1b3Sopenharmony_ci   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
2753a5a1b3Sopenharmony_ci   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
2853a5a1b3Sopenharmony_ci   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
2953a5a1b3Sopenharmony_ci   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3053a5a1b3Sopenharmony_ci   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
3153a5a1b3Sopenharmony_ci   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
3253a5a1b3Sopenharmony_ci   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3353a5a1b3Sopenharmony_ci   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3453a5a1b3Sopenharmony_ci*/
3553a5a1b3Sopenharmony_ci
3653a5a1b3Sopenharmony_ci#include "_kiss_fft_guts_tm.h"
3753a5a1b3Sopenharmony_ci
3853a5a1b3Sopenharmony_ci#ifdef TM_ASM
3953a5a1b3Sopenharmony_ci
4053a5a1b3Sopenharmony_ci#include "profile_tm.h"
4153a5a1b3Sopenharmony_ci
4253a5a1b3Sopenharmony_ci#ifdef FIXED_POINT
4353a5a1b3Sopenharmony_ci
4453a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY2
4553a5a1b3Sopenharmony_cistatic void kf_bfly2(
4653a5a1b3Sopenharmony_ci		kiss_fft_cpx		*Fout,
4753a5a1b3Sopenharmony_ci        const int			fstride,
4853a5a1b3Sopenharmony_ci        const kiss_fft_cfg	st,
4953a5a1b3Sopenharmony_ci        int					m
5053a5a1b3Sopenharmony_ci        )
5153a5a1b3Sopenharmony_ci{
5253a5a1b3Sopenharmony_ci	register int * restrict Fout2;
5353a5a1b3Sopenharmony_ci    register int * restrict tw1 = (int*)st->twiddles;
5453a5a1b3Sopenharmony_ci    register int i, j;
5553a5a1b3Sopenharmony_ci	register int _inv = !st->inverse;
5653a5a1b3Sopenharmony_ci
5753a5a1b3Sopenharmony_ci	Fout2 = (int*)Fout + m;
5853a5a1b3Sopenharmony_ci
5953a5a1b3Sopenharmony_ci	for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride )
6053a5a1b3Sopenharmony_ci	{	register int tw_10, ff_10, f2_10;
6153a5a1b3Sopenharmony_ci
6253a5a1b3Sopenharmony_ci		ff_10	= ld32x(Fout, i);
6353a5a1b3Sopenharmony_ci		f2_10	= ld32x(Fout2, i);
6453a5a1b3Sopenharmony_ci		tw_10	= ld32(tw1);
6553a5a1b3Sopenharmony_ci
6653a5a1b3Sopenharmony_ci		if ( _inv )
6753a5a1b3Sopenharmony_ci		{	TM_SHR(f2_10, f2_10, 1);
6853a5a1b3Sopenharmony_ci			TM_SHR(ff_10, ff_10, 1);
6953a5a1b3Sopenharmony_ci		}
7053a5a1b3Sopenharmony_ci
7153a5a1b3Sopenharmony_ci		TM_MUL(tw_10, tw_10, f2_10);
7253a5a1b3Sopenharmony_ci		TM_SUB(f2_10, ff_10, tw_10);
7353a5a1b3Sopenharmony_ci		TM_ADD(ff_10, ff_10, tw_10);
7453a5a1b3Sopenharmony_ci
7553a5a1b3Sopenharmony_ci		st32d(j, Fout2, f2_10);
7653a5a1b3Sopenharmony_ci		st32d(j,  Fout, ff_10);
7753a5a1b3Sopenharmony_ci	}
7853a5a1b3Sopenharmony_ci}
7953a5a1b3Sopenharmony_ci
8053a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY4
8153a5a1b3Sopenharmony_cistatic void kf_bfly4(
8253a5a1b3Sopenharmony_ci        kiss_fft_cpx		*Fout,
8353a5a1b3Sopenharmony_ci        const int			fstride,
8453a5a1b3Sopenharmony_ci        const kiss_fft_cfg	st,
8553a5a1b3Sopenharmony_ci        const int			m
8653a5a1b3Sopenharmony_ci        )
8753a5a1b3Sopenharmony_ci{
8853a5a1b3Sopenharmony_ci    register int * restrict tw1;
8953a5a1b3Sopenharmony_ci	register int * restrict tw2;
9053a5a1b3Sopenharmony_ci	register int * restrict tw3;
9153a5a1b3Sopenharmony_ci	register int * restrict Fout1;
9253a5a1b3Sopenharmony_ci	register int * restrict Fout2;
9353a5a1b3Sopenharmony_ci	register int * restrict Fout3;
9453a5a1b3Sopenharmony_ci	register int i, j;
9553a5a1b3Sopenharmony_ci	register int fstride2, fstride3;
9653a5a1b3Sopenharmony_ci	register int _inv = !st->inverse;
9753a5a1b3Sopenharmony_ci
9853a5a1b3Sopenharmony_ci	tw3  = tw2 = tw1 = (int*)st->twiddles;
9953a5a1b3Sopenharmony_ci	fstride2 = fstride << 1;
10053a5a1b3Sopenharmony_ci	fstride3 = fstride * 3;
10153a5a1b3Sopenharmony_ci
10253a5a1b3Sopenharmony_ci	Fout1 = (int*)Fout + m;
10353a5a1b3Sopenharmony_ci	Fout2 = (int*)Fout + (m << 1);
10453a5a1b3Sopenharmony_ci	Fout3 = (int*)Fout + (m *  3);
10553a5a1b3Sopenharmony_ci
10653a5a1b3Sopenharmony_ci
10753a5a1b3Sopenharmony_ci	for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3 )
10853a5a1b3Sopenharmony_ci	{	register int sc0, sc1, sc2, sc3, sc4, sc5;
10953a5a1b3Sopenharmony_ci		register int ff0;
11053a5a1b3Sopenharmony_ci
11153a5a1b3Sopenharmony_ci		sc0   = ld32x(Fout1,i);
11253a5a1b3Sopenharmony_ci		sc3   = ld32(tw1);
11353a5a1b3Sopenharmony_ci		sc1   = ld32x(Fout2, i);
11453a5a1b3Sopenharmony_ci		sc4   = ld32(tw2);
11553a5a1b3Sopenharmony_ci		sc2   = ld32x(Fout3, i);
11653a5a1b3Sopenharmony_ci		sc5   = ld32(tw3);
11753a5a1b3Sopenharmony_ci		ff0   = ld32x(Fout,i);
11853a5a1b3Sopenharmony_ci
11953a5a1b3Sopenharmony_ci		if ( _inv )
12053a5a1b3Sopenharmony_ci		{
12153a5a1b3Sopenharmony_ci			TM_ADD(sc0, sc0, 0x00020002);
12253a5a1b3Sopenharmony_ci			TM_ADD(sc1, sc1, 0x00020002);
12353a5a1b3Sopenharmony_ci			TM_ADD(sc2, sc2, 0x00020002);
12453a5a1b3Sopenharmony_ci			TM_ADD(ff0, ff0, 0x00020002);
12553a5a1b3Sopenharmony_ci			TM_SHR(sc0, sc0, 2);
12653a5a1b3Sopenharmony_ci			TM_SHR(sc1, sc1, 2);
12753a5a1b3Sopenharmony_ci			TM_SHR(sc2, sc2, 2);
12853a5a1b3Sopenharmony_ci			TM_SHR(ff0, ff0, 2);
12953a5a1b3Sopenharmony_ci		}
13053a5a1b3Sopenharmony_ci
13153a5a1b3Sopenharmony_ci		TM_MUL(sc0, sc0, sc3);
13253a5a1b3Sopenharmony_ci		TM_MUL(sc1, sc1, sc4);
13353a5a1b3Sopenharmony_ci		TM_MUL(sc2, sc2, sc5);
13453a5a1b3Sopenharmony_ci		TM_SUB(sc5, ff0, sc1);
13553a5a1b3Sopenharmony_ci		TM_ADD(ff0, ff0, sc1);
13653a5a1b3Sopenharmony_ci		TM_ADD(sc3, sc0, sc2);
13753a5a1b3Sopenharmony_ci		TM_SUB(sc4, sc0, sc2);
13853a5a1b3Sopenharmony_ci		TM_SUB(sc1, ff0, sc3);
13953a5a1b3Sopenharmony_ci		TM_ADD(ff0, ff0, sc3);
14053a5a1b3Sopenharmony_ci
14153a5a1b3Sopenharmony_ci		st32d(j, Fout2, sc1);
14253a5a1b3Sopenharmony_ci		st32d(j, Fout,  ff0);
14353a5a1b3Sopenharmony_ci
14453a5a1b3Sopenharmony_ci		sc5 = funshift2(sc5, sc5);
14553a5a1b3Sopenharmony_ci
14653a5a1b3Sopenharmony_ci		if ( _inv )
14753a5a1b3Sopenharmony_ci		{	TM_ADD(ff0, sc5, sc4);
14853a5a1b3Sopenharmony_ci			TM_SUB(sc1, sc5, sc4);
14953a5a1b3Sopenharmony_ci		} else
15053a5a1b3Sopenharmony_ci		{	TM_ADD(sc1, sc5, sc4);
15153a5a1b3Sopenharmony_ci			TM_SUB(ff0, sc5, sc4);
15253a5a1b3Sopenharmony_ci		}
15353a5a1b3Sopenharmony_ci
15453a5a1b3Sopenharmony_ci		sc0 = funshift2(sc1, ff0);
15553a5a1b3Sopenharmony_ci		sc2 = funshift2(ff0, sc1);
15653a5a1b3Sopenharmony_ci
15753a5a1b3Sopenharmony_ci		st32d(j, Fout1, sc0);
15853a5a1b3Sopenharmony_ci		st32d(j, Fout3, sc2);
15953a5a1b3Sopenharmony_ci	}
16053a5a1b3Sopenharmony_ci}
16153a5a1b3Sopenharmony_ci
16253a5a1b3Sopenharmony_ci
16353a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY3
16453a5a1b3Sopenharmony_cistatic void kf_bfly3(
16553a5a1b3Sopenharmony_ci         kiss_fft_cpx	*Fout,
16653a5a1b3Sopenharmony_ci         const int		fstride,
16753a5a1b3Sopenharmony_ci         const			kiss_fft_cfg st,
16853a5a1b3Sopenharmony_ci         int			m
16953a5a1b3Sopenharmony_ci         )
17053a5a1b3Sopenharmony_ci{
17153a5a1b3Sopenharmony_ci    register int * restrict tw1;
17253a5a1b3Sopenharmony_ci	register int * restrict tw2;
17353a5a1b3Sopenharmony_ci	register int * restrict Fout1;
17453a5a1b3Sopenharmony_ci	register int * restrict Fout2;
17553a5a1b3Sopenharmony_ci    register int epi;
17653a5a1b3Sopenharmony_ci	register int i, j;
17753a5a1b3Sopenharmony_ci	register int fstride2;
17853a5a1b3Sopenharmony_ci	register int _inv = !st->inverse;
17953a5a1b3Sopenharmony_ci
18053a5a1b3Sopenharmony_ci    tw1  = tw2 = (int*)st->twiddles;
18153a5a1b3Sopenharmony_ci	Fout1 = (int*)Fout + m;
18253a5a1b3Sopenharmony_ci	Fout2 = (int*)Fout + (m << 1);
18353a5a1b3Sopenharmony_ci	epi = tw1[fstride*m];
18453a5a1b3Sopenharmony_ci    epi = pack16lsb(epi,epi);
18553a5a1b3Sopenharmony_ci	fstride2 = fstride << 1;
18653a5a1b3Sopenharmony_ci
18753a5a1b3Sopenharmony_ci	 for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2 )
18853a5a1b3Sopenharmony_ci	 {	register int sc0, sc1, sc2, sc3, sc4, sc5;
18953a5a1b3Sopenharmony_ci		register int ff0;
19053a5a1b3Sopenharmony_ci
19153a5a1b3Sopenharmony_ci		sc1 = ld32x(Fout1,i);
19253a5a1b3Sopenharmony_ci		sc2 = ld32x(Fout2,i);
19353a5a1b3Sopenharmony_ci		sc3 = ld32(tw1);
19453a5a1b3Sopenharmony_ci		sc4 = ld32(tw2);
19553a5a1b3Sopenharmony_ci		ff0 = ld32x(Fout,i);
19653a5a1b3Sopenharmony_ci
19753a5a1b3Sopenharmony_ci		if ( _inv )
19853a5a1b3Sopenharmony_ci		{
19953a5a1b3Sopenharmony_ci			TM_DIV(sc1, sc1, 3);
20053a5a1b3Sopenharmony_ci			TM_DIV(sc2, sc2, 3);
20153a5a1b3Sopenharmony_ci			TM_DIV(ff0, ff0, 3);
20253a5a1b3Sopenharmony_ci		}
20353a5a1b3Sopenharmony_ci
20453a5a1b3Sopenharmony_ci		TM_MUL(sc1, sc1,  sc3);
20553a5a1b3Sopenharmony_ci		TM_MUL(sc2, sc2,  sc4);
20653a5a1b3Sopenharmony_ci		TM_ADD(sc3, sc1,  sc2);
20753a5a1b3Sopenharmony_ci		TM_SUB(sc0, sc1,  sc2);
20853a5a1b3Sopenharmony_ci		TM_SHR(sc4, sc3,    1);
20953a5a1b3Sopenharmony_ci		TM_SUB(sc1, ff0,  sc4);
21053a5a1b3Sopenharmony_ci
21153a5a1b3Sopenharmony_ci		sc0 = dspidualmul(sc0, epi);
21253a5a1b3Sopenharmony_ci		sc0 = funshift2(sc0, sc0);
21353a5a1b3Sopenharmony_ci
21453a5a1b3Sopenharmony_ci		TM_ADD(ff0, ff0, sc3);
21553a5a1b3Sopenharmony_ci		TM_ADD(sc4, sc1, sc0);
21653a5a1b3Sopenharmony_ci		TM_SUB(sc5, sc1, sc0);
21753a5a1b3Sopenharmony_ci
21853a5a1b3Sopenharmony_ci		sc1 = funshift2(sc4, sc5);
21953a5a1b3Sopenharmony_ci		sc2 = funshift2(sc5, sc4);
22053a5a1b3Sopenharmony_ci		sc2 = funshift2(sc2, sc2);
22153a5a1b3Sopenharmony_ci
22253a5a1b3Sopenharmony_ci		st32d(j, Fout1, sc1);
22353a5a1b3Sopenharmony_ci		st32d(j, Fout,  ff0);
22453a5a1b3Sopenharmony_ci		st32d(j, Fout2, sc2);
22553a5a1b3Sopenharmony_ci	 }
22653a5a1b3Sopenharmony_ci}
22753a5a1b3Sopenharmony_ci
22853a5a1b3Sopenharmony_ci
22953a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY5
23053a5a1b3Sopenharmony_cistatic void kf_bfly5(
23153a5a1b3Sopenharmony_ci        kiss_fft_cpx		*Fout,
23253a5a1b3Sopenharmony_ci        const int			fstride,
23353a5a1b3Sopenharmony_ci        const kiss_fft_cfg	st,
23453a5a1b3Sopenharmony_ci        int m
23553a5a1b3Sopenharmony_ci        )
23653a5a1b3Sopenharmony_ci{
23753a5a1b3Sopenharmony_ci    register int * restrict tw1;
23853a5a1b3Sopenharmony_ci	register int * restrict tw2;
23953a5a1b3Sopenharmony_ci	register int * restrict tw3;
24053a5a1b3Sopenharmony_ci	register int * restrict tw4;
24153a5a1b3Sopenharmony_ci	register int * restrict Fout1;
24253a5a1b3Sopenharmony_ci	register int * restrict Fout2;
24353a5a1b3Sopenharmony_ci	register int * restrict Fout3;
24453a5a1b3Sopenharmony_ci	register int * restrict Fout4;
24553a5a1b3Sopenharmony_ci	register int fstride2, fstride3, fstride4;
24653a5a1b3Sopenharmony_ci	register int i, j;
24753a5a1b3Sopenharmony_ci	register int yab_msb, yab_lsb, yba_msb, yba_lsb;
24853a5a1b3Sopenharmony_ci	register int _inv = !st->inverse;
24953a5a1b3Sopenharmony_ci
25053a5a1b3Sopenharmony_ci
25153a5a1b3Sopenharmony_ci    Fout1=(int*)Fout+m;
25253a5a1b3Sopenharmony_ci    Fout2=(int*)Fout+(m<<1);
25353a5a1b3Sopenharmony_ci    Fout3=(int*)Fout+(3 *m);
25453a5a1b3Sopenharmony_ci    Fout4=(int*)Fout+(m<<2);
25553a5a1b3Sopenharmony_ci
25653a5a1b3Sopenharmony_ci    tw1 = tw2 = tw3 = tw4 = (int*)st->twiddles;
25753a5a1b3Sopenharmony_ci
25853a5a1b3Sopenharmony_ci	i = tw1[fstride*m];
25953a5a1b3Sopenharmony_ci    yab_lsb = tw1[fstride*(m<<1)];
26053a5a1b3Sopenharmony_ci	yab_msb = pack16msb(i, yab_lsb);
26153a5a1b3Sopenharmony_ci	yab_lsb = pack16lsb(i, yab_lsb);
26253a5a1b3Sopenharmony_ci	yba_msb = funshift2(-sex16(yab_msb), yab_msb);
26353a5a1b3Sopenharmony_ci	yba_lsb = funshift2(yab_lsb, yab_lsb);
26453a5a1b3Sopenharmony_ci
26553a5a1b3Sopenharmony_ci	fstride2 = fstride << 1;
26653a5a1b3Sopenharmony_ci	fstride3 = fstride *  3;
26753a5a1b3Sopenharmony_ci	fstride4 = fstride << 2;
26853a5a1b3Sopenharmony_ci
26953a5a1b3Sopenharmony_ci	for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3,tw4+=fstride4 )
27053a5a1b3Sopenharmony_ci	{	register int sc0, sc1, sc2, sc3, sc4, sc5, sc6;
27153a5a1b3Sopenharmony_ci		register int sc7, sc8, sc9, sc10, sc11, sc12;
27253a5a1b3Sopenharmony_ci		register int ff0, sc78_msb, sc78_lsb, sc90_msb, sc90_lsb;
27353a5a1b3Sopenharmony_ci
27453a5a1b3Sopenharmony_ci		sc0 = ld32x(Fout,i);
27553a5a1b3Sopenharmony_ci		sc1 = ld32x(Fout1,i);
27653a5a1b3Sopenharmony_ci		sc2 = ld32x(Fout2,i);
27753a5a1b3Sopenharmony_ci		sc3 = ld32x(Fout3,i);
27853a5a1b3Sopenharmony_ci		sc4 = ld32x(Fout4,i);
27953a5a1b3Sopenharmony_ci		sc5 = ld32(tw1);
28053a5a1b3Sopenharmony_ci		sc6 = ld32(tw2);
28153a5a1b3Sopenharmony_ci		sc7 = ld32(tw3);
28253a5a1b3Sopenharmony_ci		sc8 = ld32(tw4);
28353a5a1b3Sopenharmony_ci
28453a5a1b3Sopenharmony_ci		if ( _inv )
28553a5a1b3Sopenharmony_ci		{
28653a5a1b3Sopenharmony_ci			TM_DIV(sc0, sc0, 5);
28753a5a1b3Sopenharmony_ci			TM_DIV(sc1, sc1, 5);
28853a5a1b3Sopenharmony_ci			TM_DIV(sc2, sc2, 5);
28953a5a1b3Sopenharmony_ci			TM_DIV(sc3, sc3, 5);
29053a5a1b3Sopenharmony_ci			TM_DIV(sc4, sc4, 5);
29153a5a1b3Sopenharmony_ci		}
29253a5a1b3Sopenharmony_ci
29353a5a1b3Sopenharmony_ci		ff0 = sc0;
29453a5a1b3Sopenharmony_ci
29553a5a1b3Sopenharmony_ci		TM_MUL(sc1, sc1, sc5);
29653a5a1b3Sopenharmony_ci		TM_MUL(sc2, sc2, sc6);
29753a5a1b3Sopenharmony_ci		TM_MUL(sc3, sc3, sc7);
29853a5a1b3Sopenharmony_ci		TM_MUL(sc4, sc4, sc8);
29953a5a1b3Sopenharmony_ci		TM_ADD(sc7, sc1, sc4);
30053a5a1b3Sopenharmony_ci		TM_SUB(sc10,sc1, sc4);
30153a5a1b3Sopenharmony_ci		TM_ADD(sc8, sc2, sc3);
30253a5a1b3Sopenharmony_ci		TM_SUB(sc9, sc2, sc3);
30353a5a1b3Sopenharmony_ci
30453a5a1b3Sopenharmony_ci		TM_ADD(ff0, ff0, sc7);
30553a5a1b3Sopenharmony_ci		TM_ADD(ff0, ff0, sc8);
30653a5a1b3Sopenharmony_ci		st32d(j, Fout,  ff0);
30753a5a1b3Sopenharmony_ci
30853a5a1b3Sopenharmony_ci		sc78_msb = pack16msb(sc7,sc8);
30953a5a1b3Sopenharmony_ci		sc78_lsb = pack16lsb(sc7,sc8);
31053a5a1b3Sopenharmony_ci		sc90_msb = pack16msb(sc10,sc9);
31153a5a1b3Sopenharmony_ci		sc90_lsb = pack16lsb(sc10,sc9);
31253a5a1b3Sopenharmony_ci
31353a5a1b3Sopenharmony_ci		sc5 = pack16lsb( sround(ifir16(sc78_msb,yab_lsb)), sround(ifir16(sc78_lsb,yab_lsb)));
31453a5a1b3Sopenharmony_ci		sc6 = pack16lsb(-sround(ifir16(sc90_lsb,yab_msb)), sround(ifir16(sc90_msb,yab_msb)));
31553a5a1b3Sopenharmony_ci
31653a5a1b3Sopenharmony_ci		TM_ADD(sc5, sc5, sc0);
31753a5a1b3Sopenharmony_ci		TM_SUB(sc1, sc5, sc6);
31853a5a1b3Sopenharmony_ci		TM_ADD(sc4, sc5, sc6);
31953a5a1b3Sopenharmony_ci		st32d(j, Fout1, sc1);
32053a5a1b3Sopenharmony_ci		st32d(j, Fout4, sc4);
32153a5a1b3Sopenharmony_ci
32253a5a1b3Sopenharmony_ci		sc11 = pack16lsb( sround(ifir16(sc78_msb,yba_lsb)), sround(ifir16(sc78_lsb,yba_lsb)));
32353a5a1b3Sopenharmony_ci		sc12 = pack16lsb(-sround(ifir16(sc90_lsb,yba_msb)), sround(ifir16(sc90_msb,yba_msb)));
32453a5a1b3Sopenharmony_ci
32553a5a1b3Sopenharmony_ci		TM_ADD(sc11, sc11, sc0);
32653a5a1b3Sopenharmony_ci		TM_ADD(sc2, sc11, sc12);
32753a5a1b3Sopenharmony_ci		TM_SUB(sc3, sc11, sc12);
32853a5a1b3Sopenharmony_ci		st32d(j, Fout2, sc2);
32953a5a1b3Sopenharmony_ci		st32d(j, Fout3, sc3);
33053a5a1b3Sopenharmony_ci
33153a5a1b3Sopenharmony_ci	}
33253a5a1b3Sopenharmony_ci}
33353a5a1b3Sopenharmony_ci
33453a5a1b3Sopenharmony_ci
33553a5a1b3Sopenharmony_ci#define OVERRIDE_KF_BFLY_GENERIC
33653a5a1b3Sopenharmony_cistatic void kf_bfly_generic(
33753a5a1b3Sopenharmony_ci        kiss_fft_cpx * restrict Fout,
33853a5a1b3Sopenharmony_ci        const size_t fstride,
33953a5a1b3Sopenharmony_ci        const kiss_fft_cfg st,
34053a5a1b3Sopenharmony_ci        int m,
34153a5a1b3Sopenharmony_ci        int p
34253a5a1b3Sopenharmony_ci        )
34353a5a1b3Sopenharmony_ci{
34453a5a1b3Sopenharmony_ci	register int _inv = !st->inverse;
34553a5a1b3Sopenharmony_ci	register int i, j, k, l;
34653a5a1b3Sopenharmony_ci    register int * restrict twiddles = (int*)st->twiddles;
34753a5a1b3Sopenharmony_ci    register int Norig = st->nfft;
34853a5a1b3Sopenharmony_ci
34953a5a1b3Sopenharmony_ci    CHECKBUF(scratchbuf,nscratchbuf,p);
35053a5a1b3Sopenharmony_ci
35153a5a1b3Sopenharmony_ci    for ( i=0; i<m; ++i )
35253a5a1b3Sopenharmony_ci	{	register int sc10;
35353a5a1b3Sopenharmony_ci
35453a5a1b3Sopenharmony_ci        for ( j=0,k=i ; j<p ; ++j,k+=m )
35553a5a1b3Sopenharmony_ci		{	register int f10;
35653a5a1b3Sopenharmony_ci
35753a5a1b3Sopenharmony_ci			f10 = ld32x(Fout,k);
35853a5a1b3Sopenharmony_ci
35953a5a1b3Sopenharmony_ci			if ( _inv )
36053a5a1b3Sopenharmony_ci			{	TM_DIV(f10, f10, p);
36153a5a1b3Sopenharmony_ci			}
36253a5a1b3Sopenharmony_ci
36353a5a1b3Sopenharmony_ci			st32d(j<<2, scratchbuf, f10);
36453a5a1b3Sopenharmony_ci        }
36553a5a1b3Sopenharmony_ci
36653a5a1b3Sopenharmony_ci        for ( j=0,k=i,sc10=ld32(scratchbuf) ; j<p ; ++j,k+=m )
36753a5a1b3Sopenharmony_ci		{
36853a5a1b3Sopenharmony_ci            register int twidx = 0;
36953a5a1b3Sopenharmony_ci			register int f10;
37053a5a1b3Sopenharmony_ci
37153a5a1b3Sopenharmony_ci            for ( l=1,f10 = sc10 ; l<p ; ++l )
37253a5a1b3Sopenharmony_ci			{	register int tw, sc;
37353a5a1b3Sopenharmony_ci
37453a5a1b3Sopenharmony_ci                twidx += fstride * k;
37553a5a1b3Sopenharmony_ci				if ( twidx>=Norig )
37653a5a1b3Sopenharmony_ci				{	twidx -= Norig;
37753a5a1b3Sopenharmony_ci				}
37853a5a1b3Sopenharmony_ci
37953a5a1b3Sopenharmony_ci				sc = ld32x(scratchbuf,l);
38053a5a1b3Sopenharmony_ci				tw = ld32x(twiddles,twidx);
38153a5a1b3Sopenharmony_ci
38253a5a1b3Sopenharmony_ci				TM_MUL(sc, sc, tw);
38353a5a1b3Sopenharmony_ci				TM_ADD(f10, f10, sc);
38453a5a1b3Sopenharmony_ci			}
38553a5a1b3Sopenharmony_ci			st32d(k<<2, Fout, f10);
38653a5a1b3Sopenharmony_ci		}
38753a5a1b3Sopenharmony_ci	}
38853a5a1b3Sopenharmony_ci}
38953a5a1b3Sopenharmony_ci
39053a5a1b3Sopenharmony_ci#else
39153a5a1b3Sopenharmony_ci
39253a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY2
39353a5a1b3Sopenharmony_cistatic void kf_bfly2(
39453a5a1b3Sopenharmony_ci        kiss_fft_cpx * Fout,
39553a5a1b3Sopenharmony_ci        const size_t fstride,
39653a5a1b3Sopenharmony_ci        const kiss_fft_cfg st,
39753a5a1b3Sopenharmony_ci        int m
39853a5a1b3Sopenharmony_ci        )
39953a5a1b3Sopenharmony_ci{
40053a5a1b3Sopenharmony_ci    register kiss_fft_cpx * restrict Fout2;
40153a5a1b3Sopenharmony_ci    register kiss_fft_cpx * restrict tw1 = st->twiddles;
40253a5a1b3Sopenharmony_ci
40353a5a1b3Sopenharmony_ci    Fout2 = Fout + m;
40453a5a1b3Sopenharmony_ci
40553a5a1b3Sopenharmony_ci    do
40653a5a1b3Sopenharmony_ci	{
40753a5a1b3Sopenharmony_ci		register kiss_fft_cpx _fout2, _fout, t;
40853a5a1b3Sopenharmony_ci
40953a5a1b3Sopenharmony_ci		_fout2 = *Fout2;
41053a5a1b3Sopenharmony_ci		_fout  = *Fout;
41153a5a1b3Sopenharmony_ci
41253a5a1b3Sopenharmony_ci        C_MUL	(	  t,  _fout2,   *tw1);
41353a5a1b3Sopenharmony_ci        C_SUB	(_fout2,   _fout,	   t);
41453a5a1b3Sopenharmony_ci        C_ADD	(_fout,    _fout,	   t);
41553a5a1b3Sopenharmony_ci
41653a5a1b3Sopenharmony_ci		*Fout2 = _fout2;
41753a5a1b3Sopenharmony_ci		*Fout  = _fout;
41853a5a1b3Sopenharmony_ci
41953a5a1b3Sopenharmony_ci		tw1	+= fstride;
42053a5a1b3Sopenharmony_ci        ++Fout2;
42153a5a1b3Sopenharmony_ci        ++Fout;
42253a5a1b3Sopenharmony_ci
42353a5a1b3Sopenharmony_ci    } while ( --m );
42453a5a1b3Sopenharmony_ci}
42553a5a1b3Sopenharmony_ci
42653a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY4
42753a5a1b3Sopenharmony_cistatic void kf_bfly4(
42853a5a1b3Sopenharmony_ci        kiss_fft_cpx * Fout,
42953a5a1b3Sopenharmony_ci        const int fstride,
43053a5a1b3Sopenharmony_ci        const kiss_fft_cfg st,
43153a5a1b3Sopenharmony_ci        int m
43253a5a1b3Sopenharmony_ci        )
43353a5a1b3Sopenharmony_ci{
43453a5a1b3Sopenharmony_ci    register kiss_fft_cpx * restrict tw1,* restrict tw2,* restrict tw3;
43553a5a1b3Sopenharmony_ci	register kiss_fft_cpx * restrict Fout1, * restrict Fout2, * restrict Fout3;
43653a5a1b3Sopenharmony_ci	register int _inv = !st->inverse;
43753a5a1b3Sopenharmony_ci
43853a5a1b3Sopenharmony_ci    tw3 = tw2 = tw1 = st->twiddles;
43953a5a1b3Sopenharmony_ci
44053a5a1b3Sopenharmony_ci	Fout1 = Fout + m;
44153a5a1b3Sopenharmony_ci	Fout2 = Fout + (m << 1);
44253a5a1b3Sopenharmony_ci	Fout3 = Fout + (m * 3);
44353a5a1b3Sopenharmony_ci
44453a5a1b3Sopenharmony_ci	do {
44553a5a1b3Sopenharmony_ci
44653a5a1b3Sopenharmony_ci		register kiss_fft_cpx _fout;
44753a5a1b3Sopenharmony_ci		register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5;
44853a5a1b3Sopenharmony_ci
44953a5a1b3Sopenharmony_ci		_fout = *Fout;
45053a5a1b3Sopenharmony_ci
45153a5a1b3Sopenharmony_ci		C_MUL(   sc0,*Fout1, *tw1);
45253a5a1b3Sopenharmony_ci		C_MUL(   sc1,*Fout2, *tw2);
45353a5a1b3Sopenharmony_ci		C_MUL(   sc2,*Fout3, *tw3);
45453a5a1b3Sopenharmony_ci		C_SUB(   sc5, _fout,  sc1);
45553a5a1b3Sopenharmony_ci		C_ADD( _fout, _fout,  sc1);
45653a5a1b3Sopenharmony_ci		C_ADD(   sc3,   sc0,  sc2);
45753a5a1b3Sopenharmony_ci		C_SUB(   sc4,   sc0,  sc2);
45853a5a1b3Sopenharmony_ci		C_SUB(*Fout2, _fout,  sc3);
45953a5a1b3Sopenharmony_ci		C_ADD( *Fout, _fout,  sc3);
46053a5a1b3Sopenharmony_ci
46153a5a1b3Sopenharmony_ci		tw1 += fstride;
46253a5a1b3Sopenharmony_ci		tw2 += (fstride << 1);
46353a5a1b3Sopenharmony_ci		tw3 += (fstride *  3);
46453a5a1b3Sopenharmony_ci
46553a5a1b3Sopenharmony_ci		if ( _inv )
46653a5a1b3Sopenharmony_ci		{
46753a5a1b3Sopenharmony_ci			Fout1->r = sc5.r + sc4.i;
46853a5a1b3Sopenharmony_ci			Fout1->i = sc5.i - sc4.r;
46953a5a1b3Sopenharmony_ci			Fout3->r = sc5.r - sc4.i;
47053a5a1b3Sopenharmony_ci			Fout3->i = sc5.i + sc4.r;
47153a5a1b3Sopenharmony_ci		}
47253a5a1b3Sopenharmony_ci		else
47353a5a1b3Sopenharmony_ci		{	Fout1->r = sc5.r - sc4.i;
47453a5a1b3Sopenharmony_ci			Fout1->i = sc5.i + sc4.r;
47553a5a1b3Sopenharmony_ci			Fout3->r = sc5.r + sc4.i;
47653a5a1b3Sopenharmony_ci			Fout3->i = sc5.i - sc4.r;
47753a5a1b3Sopenharmony_ci		}
47853a5a1b3Sopenharmony_ci
47953a5a1b3Sopenharmony_ci
48053a5a1b3Sopenharmony_ci        ++Fout; ++Fout1; ++Fout2; ++Fout3;
48153a5a1b3Sopenharmony_ci
48253a5a1b3Sopenharmony_ci    } while(--m);
48353a5a1b3Sopenharmony_ci}
48453a5a1b3Sopenharmony_ci
48553a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY3
48653a5a1b3Sopenharmony_cistatic void kf_bfly3(
48753a5a1b3Sopenharmony_ci         kiss_fft_cpx * Fout,
48853a5a1b3Sopenharmony_ci         const int fstride,
48953a5a1b3Sopenharmony_ci         const kiss_fft_cfg st,
49053a5a1b3Sopenharmony_ci         int m
49153a5a1b3Sopenharmony_ci         )
49253a5a1b3Sopenharmony_ci{
49353a5a1b3Sopenharmony_ci	register kiss_fft_cpx * restrict Fout1, * restrict Fout2;
49453a5a1b3Sopenharmony_ci	register kiss_fft_cpx * restrict tw1,* restrict tw2;
49553a5a1b3Sopenharmony_ci	register float epi;
49653a5a1b3Sopenharmony_ci
49753a5a1b3Sopenharmony_ci    tw1 = tw2 = st->twiddles;
49853a5a1b3Sopenharmony_ci    epi = st->twiddles[fstride*m].i;
49953a5a1b3Sopenharmony_ci	Fout1 = Fout + m;
50053a5a1b3Sopenharmony_ci	Fout2 = Fout + (m << 1);
50153a5a1b3Sopenharmony_ci
50253a5a1b3Sopenharmony_ci    do {
50353a5a1b3Sopenharmony_ci
50453a5a1b3Sopenharmony_ci		register kiss_fft_cpx _fout;
50553a5a1b3Sopenharmony_ci		register kiss_fft_cpx sc0, sc1, sc2, sc3;
50653a5a1b3Sopenharmony_ci
50753a5a1b3Sopenharmony_ci		_fout = *Fout;
50853a5a1b3Sopenharmony_ci
50953a5a1b3Sopenharmony_ci        C_MUL(   sc1, *Fout1,  *tw1);
51053a5a1b3Sopenharmony_ci        C_MUL(   sc2, *Fout2,  *tw2);
51153a5a1b3Sopenharmony_ci        C_ADD(	 sc3,    sc1,   sc2);
51253a5a1b3Sopenharmony_ci        C_SUB(   sc0,    sc1,   sc2);
51353a5a1b3Sopenharmony_ci        tw1 += fstride;
51453a5a1b3Sopenharmony_ci        tw2 += (fstride << 1);
51553a5a1b3Sopenharmony_ci
51653a5a1b3Sopenharmony_ci        sc1.r = _fout.r - HALF_OF(sc3.r);
51753a5a1b3Sopenharmony_ci        sc1.i = _fout.i - HALF_OF(sc3.i);
51853a5a1b3Sopenharmony_ci
51953a5a1b3Sopenharmony_ci        C_MULBYSCALAR(sc0,  epi);
52053a5a1b3Sopenharmony_ci        C_ADD(*Fout, _fout, sc3);
52153a5a1b3Sopenharmony_ci
52253a5a1b3Sopenharmony_ci        Fout2->r = sc1.r + sc0.i;
52353a5a1b3Sopenharmony_ci        Fout2->i = sc1.i - sc0.r;
52453a5a1b3Sopenharmony_ci
52553a5a1b3Sopenharmony_ci        Fout1->r = sc1.i - sc0.i;
52653a5a1b3Sopenharmony_ci        Fout1->i = sc1.r + sc0.r;
52753a5a1b3Sopenharmony_ci
52853a5a1b3Sopenharmony_ci        ++Fout; ++Fout1; ++Fout2;
52953a5a1b3Sopenharmony_ci
53053a5a1b3Sopenharmony_ci	} while(--m);
53153a5a1b3Sopenharmony_ci}
53253a5a1b3Sopenharmony_ci
53353a5a1b3Sopenharmony_ci#define OVERRIDE_KFBFLY5
53453a5a1b3Sopenharmony_cistatic void kf_bfly5(
53553a5a1b3Sopenharmony_ci        kiss_fft_cpx * Fout,
53653a5a1b3Sopenharmony_ci        const size_t fstride,
53753a5a1b3Sopenharmony_ci        const kiss_fft_cfg st,
53853a5a1b3Sopenharmony_ci        int m
53953a5a1b3Sopenharmony_ci        )
54053a5a1b3Sopenharmony_ci{
54153a5a1b3Sopenharmony_ci    register kiss_fft_cpx * restrict Fout1,* restrict Fout2,* restrict Fout3,* restrict Fout4;
54253a5a1b3Sopenharmony_ci	register int u;
54353a5a1b3Sopenharmony_ci    register kiss_fft_cpx *tw;
54453a5a1b3Sopenharmony_ci    register float yar, yai, ybr, ybi;
54553a5a1b3Sopenharmony_ci
54653a5a1b3Sopenharmony_ci    Fout1=Fout+m;
54753a5a1b3Sopenharmony_ci    Fout2=Fout+(m<<1);
54853a5a1b3Sopenharmony_ci    Fout3=Fout+(m*3);
54953a5a1b3Sopenharmony_ci    Fout4=Fout+(m<<2);
55053a5a1b3Sopenharmony_ci
55153a5a1b3Sopenharmony_ci    tw = st->twiddles;
55253a5a1b3Sopenharmony_ci    yar = tw[fstride*m].r;
55353a5a1b3Sopenharmony_ci	yai = tw[fstride*m].i;
55453a5a1b3Sopenharmony_ci    ybr = tw[fstride*2*m].r;
55553a5a1b3Sopenharmony_ci	ybi = tw[fstride*2*m].i;
55653a5a1b3Sopenharmony_ci
55753a5a1b3Sopenharmony_ci	for ( u=0; u<m; ++u )
55853a5a1b3Sopenharmony_ci	{
55953a5a1b3Sopenharmony_ci		register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc10, sc11, sc12;
56053a5a1b3Sopenharmony_ci
56153a5a1b3Sopenharmony_ci		sc0 = *Fout;
56253a5a1b3Sopenharmony_ci
56353a5a1b3Sopenharmony_ci        C_MUL(   sc1,*Fout1,   tw[u*fstride]);
56453a5a1b3Sopenharmony_ci        C_MUL(   sc2,*Fout2, tw[2*u*fstride]);
56553a5a1b3Sopenharmony_ci        C_MUL(   sc3,*Fout3, tw[3*u*fstride]);
56653a5a1b3Sopenharmony_ci        C_MUL(   sc4,*Fout4, tw[4*u*fstride]);
56753a5a1b3Sopenharmony_ci
56853a5a1b3Sopenharmony_ci        C_ADD(   sc7,   sc1,   sc4);
56953a5a1b3Sopenharmony_ci        C_SUB(  sc10,   sc1,   sc4);
57053a5a1b3Sopenharmony_ci        C_ADD(   sc8,   sc2,   sc3);
57153a5a1b3Sopenharmony_ci        C_SUB(   sc9,   sc2,   sc3);
57253a5a1b3Sopenharmony_ci
57353a5a1b3Sopenharmony_ci        Fout->r = sc0.r + sc7.r + sc8.r;
57453a5a1b3Sopenharmony_ci        Fout->i = sc0.i + sc7.i + sc8.i;
57553a5a1b3Sopenharmony_ci
57653a5a1b3Sopenharmony_ci        sc5.r = sc0.r + S_MUL(sc7.r,yar) + S_MUL(sc8.r,ybr);
57753a5a1b3Sopenharmony_ci        sc5.i = sc0.i + S_MUL(sc7.i,yar) + S_MUL(sc8.i,ybr);
57853a5a1b3Sopenharmony_ci
57953a5a1b3Sopenharmony_ci        sc6.r =  S_MUL(sc10.i,yai) + S_MUL(sc9.i,ybi);
58053a5a1b3Sopenharmony_ci        sc6.i = -S_MUL(sc10.r,yai) - S_MUL(sc9.r,ybi);
58153a5a1b3Sopenharmony_ci
58253a5a1b3Sopenharmony_ci        C_SUB(*Fout1,sc5,sc6);
58353a5a1b3Sopenharmony_ci        C_ADD(*Fout4,sc5,sc6);
58453a5a1b3Sopenharmony_ci
58553a5a1b3Sopenharmony_ci        sc11.r = sc0.r + S_MUL(sc7.r,ybr) + S_MUL(sc8.r,yar);
58653a5a1b3Sopenharmony_ci        sc11.i = sc0.i + S_MUL(sc7.i,ybr) + S_MUL(sc8.i,yar);
58753a5a1b3Sopenharmony_ci        sc12.r = - S_MUL(sc10.i,ybi) + S_MUL(sc9.i,yai);
58853a5a1b3Sopenharmony_ci        sc12.i = S_MUL(sc10.r,ybi) - S_MUL(sc9.r,yai);
58953a5a1b3Sopenharmony_ci        C_ADD(*Fout2,sc11,sc12);
59053a5a1b3Sopenharmony_ci        C_SUB(*Fout3,sc11,sc12);
59153a5a1b3Sopenharmony_ci
59253a5a1b3Sopenharmony_ci        ++Fout1; ++Fout2; ++Fout3; ++Fout4;
59353a5a1b3Sopenharmony_ci	}
59453a5a1b3Sopenharmony_ci}
59553a5a1b3Sopenharmony_ci
59653a5a1b3Sopenharmony_ci
59753a5a1b3Sopenharmony_ci#endif
59853a5a1b3Sopenharmony_ci
59953a5a1b3Sopenharmony_ci#endif
600