1/* Copyright (C) 2007 Hong Zhiqian */
2/**
3   @file kiss_fft_tm.h
4   @author Hong Zhiqian
   @brief KISS FFT butterfly overrides for Speex (TriMedia version)
6*/
7/*
8   Redistribution and use in source and binary forms, with or without
9   modification, are permitted provided that the following conditions
10   are met:
11
12   - Redistributions of source code must retain the above copyright
13   notice, this list of conditions and the following disclaimer.
14
15   - Redistributions in binary form must reproduce the above copyright
16   notice, this list of conditions and the following disclaimer in the
17   documentation and/or other materials provided with the distribution.
18
19   - Neither the name of the Xiph.org Foundation nor the names of its
20   contributors may be used to endorse or promote products derived from
21   this software without specific prior written permission.
22
23   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
27   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34*/
35
36#include "_kiss_fft_guts_tm.h"
37
38#ifdef TM_ASM
39
40#include "profile_tm.h"
41
42#ifdef FIXED_POINT
43
44#define OVERRIDE_KFBFLY2
45static void kf_bfly2(
46		kiss_fft_cpx		*Fout,
47        const int			fstride,
48        const kiss_fft_cfg	st,
49        int					m
50        )
51{
52	register int * restrict Fout2;
53    register int * restrict tw1 = (int*)st->twiddles;
54    register int i, j;
55	register int _inv = !st->inverse;
56
57	Fout2 = (int*)Fout + m;
58
59	for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride )
60	{	register int tw_10, ff_10, f2_10;
61
62		ff_10	= ld32x(Fout, i);
63		f2_10	= ld32x(Fout2, i);
64		tw_10	= ld32(tw1);
65
66		if ( _inv )
67		{	TM_SHR(f2_10, f2_10, 1);
68			TM_SHR(ff_10, ff_10, 1);
69		}
70
71		TM_MUL(tw_10, tw_10, f2_10);
72		TM_SUB(f2_10, ff_10, tw_10);
73		TM_ADD(ff_10, ff_10, tw_10);
74
75		st32d(j, Fout2, f2_10);
76		st32d(j,  Fout, ff_10);
77	}
78}
79
80#define OVERRIDE_KFBFLY4
81static void kf_bfly4(
82        kiss_fft_cpx		*Fout,
83        const int			fstride,
84        const kiss_fft_cfg	st,
85        const int			m
86        )
87{
88    register int * restrict tw1;
89	register int * restrict tw2;
90	register int * restrict tw3;
91	register int * restrict Fout1;
92	register int * restrict Fout2;
93	register int * restrict Fout3;
94	register int i, j;
95	register int fstride2, fstride3;
96	register int _inv = !st->inverse;
97
98	tw3  = tw2 = tw1 = (int*)st->twiddles;
99	fstride2 = fstride << 1;
100	fstride3 = fstride * 3;
101
102	Fout1 = (int*)Fout + m;
103	Fout2 = (int*)Fout + (m << 1);
104	Fout3 = (int*)Fout + (m *  3);
105
106
107	for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3 )
108	{	register int sc0, sc1, sc2, sc3, sc4, sc5;
109		register int ff0;
110
111		sc0   = ld32x(Fout1,i);
112		sc3   = ld32(tw1);
113		sc1   = ld32x(Fout2, i);
114		sc4   = ld32(tw2);
115		sc2   = ld32x(Fout3, i);
116		sc5   = ld32(tw3);
117		ff0   = ld32x(Fout,i);
118
119		if ( _inv )
120		{
121			TM_ADD(sc0, sc0, 0x00020002);
122			TM_ADD(sc1, sc1, 0x00020002);
123			TM_ADD(sc2, sc2, 0x00020002);
124			TM_ADD(ff0, ff0, 0x00020002);
125			TM_SHR(sc0, sc0, 2);
126			TM_SHR(sc1, sc1, 2);
127			TM_SHR(sc2, sc2, 2);
128			TM_SHR(ff0, ff0, 2);
129		}
130
131		TM_MUL(sc0, sc0, sc3);
132		TM_MUL(sc1, sc1, sc4);
133		TM_MUL(sc2, sc2, sc5);
134		TM_SUB(sc5, ff0, sc1);
135		TM_ADD(ff0, ff0, sc1);
136		TM_ADD(sc3, sc0, sc2);
137		TM_SUB(sc4, sc0, sc2);
138		TM_SUB(sc1, ff0, sc3);
139		TM_ADD(ff0, ff0, sc3);
140
141		st32d(j, Fout2, sc1);
142		st32d(j, Fout,  ff0);
143
144		sc5 = funshift2(sc5, sc5);
145
146		if ( _inv )
147		{	TM_ADD(ff0, sc5, sc4);
148			TM_SUB(sc1, sc5, sc4);
149		} else
150		{	TM_ADD(sc1, sc5, sc4);
151			TM_SUB(ff0, sc5, sc4);
152		}
153
154		sc0 = funshift2(sc1, ff0);
155		sc2 = funshift2(ff0, sc1);
156
157		st32d(j, Fout1, sc0);
158		st32d(j, Fout3, sc2);
159	}
160}
161
162
163#define OVERRIDE_KFBFLY3
164static void kf_bfly3(
165         kiss_fft_cpx	*Fout,
166         const int		fstride,
167         const			kiss_fft_cfg st,
168         int			m
169         )
170{
171    register int * restrict tw1;
172	register int * restrict tw2;
173	register int * restrict Fout1;
174	register int * restrict Fout2;
175    register int epi;
176	register int i, j;
177	register int fstride2;
178	register int _inv = !st->inverse;
179
180    tw1  = tw2 = (int*)st->twiddles;
181	Fout1 = (int*)Fout + m;
182	Fout2 = (int*)Fout + (m << 1);
183	epi = tw1[fstride*m];
184    epi = pack16lsb(epi,epi);
185	fstride2 = fstride << 1;
186
187	 for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2 )
188	 {	register int sc0, sc1, sc2, sc3, sc4, sc5;
189		register int ff0;
190
191		sc1 = ld32x(Fout1,i);
192		sc2 = ld32x(Fout2,i);
193		sc3 = ld32(tw1);
194		sc4 = ld32(tw2);
195		ff0 = ld32x(Fout,i);
196
197		if ( _inv )
198		{
199			TM_DIV(sc1, sc1, 3);
200			TM_DIV(sc2, sc2, 3);
201			TM_DIV(ff0, ff0, 3);
202		}
203
204		TM_MUL(sc1, sc1,  sc3);
205		TM_MUL(sc2, sc2,  sc4);
206		TM_ADD(sc3, sc1,  sc2);
207		TM_SUB(sc0, sc1,  sc2);
208		TM_SHR(sc4, sc3,    1);
209		TM_SUB(sc1, ff0,  sc4);
210
211		sc0 = dspidualmul(sc0, epi);
212		sc0 = funshift2(sc0, sc0);
213
214		TM_ADD(ff0, ff0, sc3);
215		TM_ADD(sc4, sc1, sc0);
216		TM_SUB(sc5, sc1, sc0);
217
218		sc1 = funshift2(sc4, sc5);
219		sc2 = funshift2(sc5, sc4);
220		sc2 = funshift2(sc2, sc2);
221
222		st32d(j, Fout1, sc1);
223		st32d(j, Fout,  ff0);
224		st32d(j, Fout2, sc2);
225	 }
226}
227
228
229#define OVERRIDE_KFBFLY5
230static void kf_bfly5(
231        kiss_fft_cpx		*Fout,
232        const int			fstride,
233        const kiss_fft_cfg	st,
234        int m
235        )
236{
237    register int * restrict tw1;
238	register int * restrict tw2;
239	register int * restrict tw3;
240	register int * restrict tw4;
241	register int * restrict Fout1;
242	register int * restrict Fout2;
243	register int * restrict Fout3;
244	register int * restrict Fout4;
245	register int fstride2, fstride3, fstride4;
246	register int i, j;
247	register int yab_msb, yab_lsb, yba_msb, yba_lsb;
248	register int _inv = !st->inverse;
249
250
251    Fout1=(int*)Fout+m;
252    Fout2=(int*)Fout+(m<<1);
253    Fout3=(int*)Fout+(3 *m);
254    Fout4=(int*)Fout+(m<<2);
255
256    tw1 = tw2 = tw3 = tw4 = (int*)st->twiddles;
257
258	i = tw1[fstride*m];
259    yab_lsb = tw1[fstride*(m<<1)];
260	yab_msb = pack16msb(i, yab_lsb);
261	yab_lsb = pack16lsb(i, yab_lsb);
262	yba_msb = funshift2(-sex16(yab_msb), yab_msb);
263	yba_lsb = funshift2(yab_lsb, yab_lsb);
264
265	fstride2 = fstride << 1;
266	fstride3 = fstride *  3;
267	fstride4 = fstride << 2;
268
269	for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3,tw4+=fstride4 )
270	{	register int sc0, sc1, sc2, sc3, sc4, sc5, sc6;
271		register int sc7, sc8, sc9, sc10, sc11, sc12;
272		register int ff0, sc78_msb, sc78_lsb, sc90_msb, sc90_lsb;
273
274		sc0 = ld32x(Fout,i);
275		sc1 = ld32x(Fout1,i);
276		sc2 = ld32x(Fout2,i);
277		sc3 = ld32x(Fout3,i);
278		sc4 = ld32x(Fout4,i);
279		sc5 = ld32(tw1);
280		sc6 = ld32(tw2);
281		sc7 = ld32(tw3);
282		sc8 = ld32(tw4);
283
284		if ( _inv )
285		{
286			TM_DIV(sc0, sc0, 5);
287			TM_DIV(sc1, sc1, 5);
288			TM_DIV(sc2, sc2, 5);
289			TM_DIV(sc3, sc3, 5);
290			TM_DIV(sc4, sc4, 5);
291		}
292
293		ff0 = sc0;
294
295		TM_MUL(sc1, sc1, sc5);
296		TM_MUL(sc2, sc2, sc6);
297		TM_MUL(sc3, sc3, sc7);
298		TM_MUL(sc4, sc4, sc8);
299		TM_ADD(sc7, sc1, sc4);
300		TM_SUB(sc10,sc1, sc4);
301		TM_ADD(sc8, sc2, sc3);
302		TM_SUB(sc9, sc2, sc3);
303
304		TM_ADD(ff0, ff0, sc7);
305		TM_ADD(ff0, ff0, sc8);
306		st32d(j, Fout,  ff0);
307
308		sc78_msb = pack16msb(sc7,sc8);
309		sc78_lsb = pack16lsb(sc7,sc8);
310		sc90_msb = pack16msb(sc10,sc9);
311		sc90_lsb = pack16lsb(sc10,sc9);
312
313		sc5 = pack16lsb( sround(ifir16(sc78_msb,yab_lsb)), sround(ifir16(sc78_lsb,yab_lsb)));
314		sc6 = pack16lsb(-sround(ifir16(sc90_lsb,yab_msb)), sround(ifir16(sc90_msb,yab_msb)));
315
316		TM_ADD(sc5, sc5, sc0);
317		TM_SUB(sc1, sc5, sc6);
318		TM_ADD(sc4, sc5, sc6);
319		st32d(j, Fout1, sc1);
320		st32d(j, Fout4, sc4);
321
322		sc11 = pack16lsb( sround(ifir16(sc78_msb,yba_lsb)), sround(ifir16(sc78_lsb,yba_lsb)));
323		sc12 = pack16lsb(-sround(ifir16(sc90_lsb,yba_msb)), sround(ifir16(sc90_msb,yba_msb)));
324
325		TM_ADD(sc11, sc11, sc0);
326		TM_ADD(sc2, sc11, sc12);
327		TM_SUB(sc3, sc11, sc12);
328		st32d(j, Fout2, sc2);
329		st32d(j, Fout3, sc3);
330
331	}
332}
333
334
335#define OVERRIDE_KF_BFLY_GENERIC
336static void kf_bfly_generic(
337        kiss_fft_cpx * restrict Fout,
338        const size_t fstride,
339        const kiss_fft_cfg st,
340        int m,
341        int p
342        )
343{
344	register int _inv = !st->inverse;
345	register int i, j, k, l;
346    register int * restrict twiddles = (int*)st->twiddles;
347    register int Norig = st->nfft;
348
349    CHECKBUF(scratchbuf,nscratchbuf,p);
350
351    for ( i=0; i<m; ++i )
352	{	register int sc10;
353
354        for ( j=0,k=i ; j<p ; ++j,k+=m )
355		{	register int f10;
356
357			f10 = ld32x(Fout,k);
358
359			if ( _inv )
360			{	TM_DIV(f10, f10, p);
361			}
362
363			st32d(j<<2, scratchbuf, f10);
364        }
365
366        for ( j=0,k=i,sc10=ld32(scratchbuf) ; j<p ; ++j,k+=m )
367		{
368            register int twidx = 0;
369			register int f10;
370
371            for ( l=1,f10 = sc10 ; l<p ; ++l )
372			{	register int tw, sc;
373
374                twidx += fstride * k;
375				if ( twidx>=Norig )
376				{	twidx -= Norig;
377				}
378
379				sc = ld32x(scratchbuf,l);
380				tw = ld32x(twiddles,twidx);
381
382				TM_MUL(sc, sc, tw);
383				TM_ADD(f10, f10, sc);
384			}
385			st32d(k<<2, Fout, f10);
386		}
387	}
388}
389
390#else
391
392#define OVERRIDE_KFBFLY2
393static void kf_bfly2(
394        kiss_fft_cpx * Fout,
395        const size_t fstride,
396        const kiss_fft_cfg st,
397        int m
398        )
399{
400    register kiss_fft_cpx * restrict Fout2;
401    register kiss_fft_cpx * restrict tw1 = st->twiddles;
402
403    Fout2 = Fout + m;
404
405    do
406	{
407		register kiss_fft_cpx _fout2, _fout, t;
408
409		_fout2 = *Fout2;
410		_fout  = *Fout;
411
412        C_MUL	(	  t,  _fout2,   *tw1);
413        C_SUB	(_fout2,   _fout,	   t);
414        C_ADD	(_fout,    _fout,	   t);
415
416		*Fout2 = _fout2;
417		*Fout  = _fout;
418
419		tw1	+= fstride;
420        ++Fout2;
421        ++Fout;
422
423    } while ( --m );
424}
425
426#define OVERRIDE_KFBFLY4
427static void kf_bfly4(
428        kiss_fft_cpx * Fout,
429        const int fstride,
430        const kiss_fft_cfg st,
431        int m
432        )
433{
434    register kiss_fft_cpx * restrict tw1,* restrict tw2,* restrict tw3;
435	register kiss_fft_cpx * restrict Fout1, * restrict Fout2, * restrict Fout3;
436	register int _inv = !st->inverse;
437
438    tw3 = tw2 = tw1 = st->twiddles;
439
440	Fout1 = Fout + m;
441	Fout2 = Fout + (m << 1);
442	Fout3 = Fout + (m * 3);
443
444	do {
445
446		register kiss_fft_cpx _fout;
447		register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5;
448
449		_fout = *Fout;
450
451		C_MUL(   sc0,*Fout1, *tw1);
452		C_MUL(   sc1,*Fout2, *tw2);
453		C_MUL(   sc2,*Fout3, *tw3);
454		C_SUB(   sc5, _fout,  sc1);
455		C_ADD( _fout, _fout,  sc1);
456		C_ADD(   sc3,   sc0,  sc2);
457		C_SUB(   sc4,   sc0,  sc2);
458		C_SUB(*Fout2, _fout,  sc3);
459		C_ADD( *Fout, _fout,  sc3);
460
461		tw1 += fstride;
462		tw2 += (fstride << 1);
463		tw3 += (fstride *  3);
464
465		if ( _inv )
466		{
467			Fout1->r = sc5.r + sc4.i;
468			Fout1->i = sc5.i - sc4.r;
469			Fout3->r = sc5.r - sc4.i;
470			Fout3->i = sc5.i + sc4.r;
471		}
472		else
473		{	Fout1->r = sc5.r - sc4.i;
474			Fout1->i = sc5.i + sc4.r;
475			Fout3->r = sc5.r + sc4.i;
476			Fout3->i = sc5.i - sc4.r;
477		}
478
479
480        ++Fout; ++Fout1; ++Fout2; ++Fout3;
481
482    } while(--m);
483}
484
485#define OVERRIDE_KFBFLY3
486static void kf_bfly3(
487         kiss_fft_cpx * Fout,
488         const int fstride,
489         const kiss_fft_cfg st,
490         int m
491         )
492{
493	register kiss_fft_cpx * restrict Fout1, * restrict Fout2;
494	register kiss_fft_cpx * restrict tw1,* restrict tw2;
495	register float epi;
496
497    tw1 = tw2 = st->twiddles;
498    epi = st->twiddles[fstride*m].i;
499	Fout1 = Fout + m;
500	Fout2 = Fout + (m << 1);
501
502    do {
503
504		register kiss_fft_cpx _fout;
505		register kiss_fft_cpx sc0, sc1, sc2, sc3;
506
507		_fout = *Fout;
508
509        C_MUL(   sc1, *Fout1,  *tw1);
510        C_MUL(   sc2, *Fout2,  *tw2);
511        C_ADD(	 sc3,    sc1,   sc2);
512        C_SUB(   sc0,    sc1,   sc2);
513        tw1 += fstride;
514        tw2 += (fstride << 1);
515
516        sc1.r = _fout.r - HALF_OF(sc3.r);
517        sc1.i = _fout.i - HALF_OF(sc3.i);
518
519        C_MULBYSCALAR(sc0,  epi);
520        C_ADD(*Fout, _fout, sc3);
521
522        Fout2->r = sc1.r + sc0.i;
523        Fout2->i = sc1.i - sc0.r;
524
525        Fout1->r = sc1.i - sc0.i;
526        Fout1->i = sc1.r + sc0.r;
527
528        ++Fout; ++Fout1; ++Fout2;
529
530	} while(--m);
531}
532
533#define OVERRIDE_KFBFLY5
534static void kf_bfly5(
535        kiss_fft_cpx * Fout,
536        const size_t fstride,
537        const kiss_fft_cfg st,
538        int m
539        )
540{
541    register kiss_fft_cpx * restrict Fout1,* restrict Fout2,* restrict Fout3,* restrict Fout4;
542	register int u;
543    register kiss_fft_cpx *tw;
544    register float yar, yai, ybr, ybi;
545
546    Fout1=Fout+m;
547    Fout2=Fout+(m<<1);
548    Fout3=Fout+(m*3);
549    Fout4=Fout+(m<<2);
550
551    tw = st->twiddles;
552    yar = tw[fstride*m].r;
553	yai = tw[fstride*m].i;
554    ybr = tw[fstride*2*m].r;
555	ybi = tw[fstride*2*m].i;
556
557	for ( u=0; u<m; ++u )
558	{
559		register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc10, sc11, sc12;
560
561		sc0 = *Fout;
562
563        C_MUL(   sc1,*Fout1,   tw[u*fstride]);
564        C_MUL(   sc2,*Fout2, tw[2*u*fstride]);
565        C_MUL(   sc3,*Fout3, tw[3*u*fstride]);
566        C_MUL(   sc4,*Fout4, tw[4*u*fstride]);
567
568        C_ADD(   sc7,   sc1,   sc4);
569        C_SUB(  sc10,   sc1,   sc4);
570        C_ADD(   sc8,   sc2,   sc3);
571        C_SUB(   sc9,   sc2,   sc3);
572
573        Fout->r = sc0.r + sc7.r + sc8.r;
574        Fout->i = sc0.i + sc7.i + sc8.i;
575
576        sc5.r = sc0.r + S_MUL(sc7.r,yar) + S_MUL(sc8.r,ybr);
577        sc5.i = sc0.i + S_MUL(sc7.i,yar) + S_MUL(sc8.i,ybr);
578
579        sc6.r =  S_MUL(sc10.i,yai) + S_MUL(sc9.i,ybi);
580        sc6.i = -S_MUL(sc10.r,yai) - S_MUL(sc9.r,ybi);
581
582        C_SUB(*Fout1,sc5,sc6);
583        C_ADD(*Fout4,sc5,sc6);
584
585        sc11.r = sc0.r + S_MUL(sc7.r,ybr) + S_MUL(sc8.r,yar);
586        sc11.i = sc0.i + S_MUL(sc7.i,ybr) + S_MUL(sc8.i,yar);
587        sc12.r = - S_MUL(sc10.i,ybi) + S_MUL(sc9.i,yai);
588        sc12.i = S_MUL(sc10.r,ybi) - S_MUL(sc9.r,yai);
589        C_ADD(*Fout2,sc11,sc12);
590        C_SUB(*Fout3,sc11,sc12);
591
592        ++Fout1; ++Fout2; ++Fout3; ++Fout4;
593	}
594}
595
596
597#endif
598
599#endif
600