1/* Copyright (C) 2007 Hong Zhiqian */ 2/** 3 @file kiss_fft_tm.h 4 @author Hong Zhiqian 5 @brief Various compatibility routines for Speex (TriMedia version) 6*/ 7/* 8 Redistribution and use in source and binary forms, with or without 9 modification, are permitted provided that the following conditions 10 are met: 11 12 - Redistributions of source code must retain the above copyright 13 notice, this list of conditions and the following disclaimer. 14 15 - Redistributions in binary form must reproduce the above copyright 16 notice, this list of conditions and the following disclaimer in the 17 documentation and/or other materials provided with the distribution. 18 19 - Neither the name of the Xiph.org Foundation nor the names of its 20 contributors may be used to endorse or promote products derived from 21 this software without specific prior written permission. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 27 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 28 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 29 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 31 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 32 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34*/ 35 36#include "_kiss_fft_guts_tm.h" 37 38#ifdef TM_ASM 39 40#include "profile_tm.h" 41 42#ifdef FIXED_POINT 43 44#define OVERRIDE_KFBFLY2 45static void kf_bfly2( 46 kiss_fft_cpx *Fout, 47 const int fstride, 48 const kiss_fft_cfg st, 49 int m 50 ) 51{ 52 register int * restrict Fout2; 53 register int * restrict tw1 = (int*)st->twiddles; 54 register int i, j; 55 register int _inv = !st->inverse; 56 57 Fout2 = (int*)Fout + m; 58 59 for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride ) 60 { register int tw_10, ff_10, f2_10; 61 62 ff_10 = ld32x(Fout, i); 63 f2_10 = ld32x(Fout2, i); 64 tw_10 = ld32(tw1); 65 66 if ( _inv ) 67 { TM_SHR(f2_10, f2_10, 1); 68 TM_SHR(ff_10, ff_10, 1); 69 } 70 71 TM_MUL(tw_10, tw_10, f2_10); 72 TM_SUB(f2_10, ff_10, tw_10); 73 TM_ADD(ff_10, ff_10, tw_10); 74 75 st32d(j, Fout2, f2_10); 76 st32d(j, Fout, ff_10); 77 } 78} 79 80#define OVERRIDE_KFBFLY4 81static void kf_bfly4( 82 kiss_fft_cpx *Fout, 83 const int fstride, 84 const kiss_fft_cfg st, 85 const int m 86 ) 87{ 88 register int * restrict tw1; 89 register int * restrict tw2; 90 register int * restrict tw3; 91 register int * restrict Fout1; 92 register int * restrict Fout2; 93 register int * restrict Fout3; 94 register int i, j; 95 register int fstride2, fstride3; 96 register int _inv = !st->inverse; 97 98 tw3 = tw2 = tw1 = (int*)st->twiddles; 99 fstride2 = fstride << 1; 100 fstride3 = fstride * 3; 101 102 Fout1 = (int*)Fout + m; 103 Fout2 = (int*)Fout + (m << 1); 104 Fout3 = (int*)Fout + (m * 3); 105 106 107 for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3 ) 108 { register int sc0, sc1, sc2, sc3, sc4, sc5; 109 register int ff0; 110 111 sc0 = ld32x(Fout1,i); 112 sc3 = ld32(tw1); 113 sc1 = ld32x(Fout2, i); 114 sc4 = ld32(tw2); 115 sc2 = ld32x(Fout3, i); 116 sc5 = ld32(tw3); 117 ff0 = ld32x(Fout,i); 118 119 if ( _inv ) 120 { 121 TM_ADD(sc0, sc0, 0x00020002); 122 TM_ADD(sc1, sc1, 0x00020002); 123 TM_ADD(sc2, sc2, 0x00020002); 124 TM_ADD(ff0, ff0, 0x00020002); 125 TM_SHR(sc0, sc0, 2); 126 TM_SHR(sc1, sc1, 2); 127 TM_SHR(sc2, sc2, 2); 128 TM_SHR(ff0, ff0, 2); 129 } 130 131 TM_MUL(sc0, sc0, sc3); 132 TM_MUL(sc1, sc1, sc4); 133 TM_MUL(sc2, sc2, sc5); 134 TM_SUB(sc5, ff0, sc1); 135 TM_ADD(ff0, ff0, sc1); 136 TM_ADD(sc3, sc0, sc2); 137 TM_SUB(sc4, sc0, sc2); 138 TM_SUB(sc1, ff0, sc3); 139 TM_ADD(ff0, ff0, sc3); 140 141 st32d(j, Fout2, sc1); 142 st32d(j, Fout, ff0); 143 144 sc5 = funshift2(sc5, sc5); 145 146 if ( _inv ) 147 { TM_ADD(ff0, sc5, sc4); 148 TM_SUB(sc1, sc5, sc4); 149 } else 150 { TM_ADD(sc1, sc5, sc4); 151 TM_SUB(ff0, sc5, sc4); 152 } 153 154 sc0 = funshift2(sc1, ff0); 155 sc2 = funshift2(ff0, sc1); 156 157 st32d(j, Fout1, sc0); 158 st32d(j, Fout3, sc2); 159 } 160} 161 162 163#define OVERRIDE_KFBFLY3 164static void kf_bfly3( 165 kiss_fft_cpx *Fout, 166 const int fstride, 167 const kiss_fft_cfg st, 168 int m 169 ) 170{ 171 register int * restrict tw1; 172 register int * restrict tw2; 173 register int * restrict Fout1; 174 register int * restrict Fout2; 175 register int epi; 176 register int i, j; 177 register int fstride2; 178 register int _inv = !st->inverse; 179 180 tw1 = tw2 = (int*)st->twiddles; 181 Fout1 = (int*)Fout + m; 182 Fout2 = (int*)Fout + (m << 1); 183 epi = tw1[fstride*m]; 184 epi = pack16lsb(epi,epi); 185 fstride2 = fstride << 1; 186 187 for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2 ) 188 { register int sc0, sc1, sc2, sc3, sc4, sc5; 189 register int ff0; 190 191 sc1 = ld32x(Fout1,i); 192 sc2 = ld32x(Fout2,i); 193 sc3 = ld32(tw1); 194 sc4 = ld32(tw2); 195 ff0 = ld32x(Fout,i); 196 197 if ( _inv ) 198 { 199 TM_DIV(sc1, sc1, 3); 200 TM_DIV(sc2, sc2, 3); 201 TM_DIV(ff0, ff0, 3); 202 } 203 204 TM_MUL(sc1, sc1, sc3); 205 TM_MUL(sc2, sc2, sc4); 206 TM_ADD(sc3, sc1, sc2); 207 TM_SUB(sc0, sc1, sc2); 208 TM_SHR(sc4, sc3, 1); 209 TM_SUB(sc1, ff0, sc4); 210 211 sc0 = dspidualmul(sc0, epi); 212 sc0 = funshift2(sc0, sc0); 213 214 TM_ADD(ff0, ff0, sc3); 215 TM_ADD(sc4, sc1, sc0); 216 TM_SUB(sc5, sc1, sc0); 217 218 sc1 = funshift2(sc4, sc5); 219 sc2 = funshift2(sc5, sc4); 220 sc2 = funshift2(sc2, sc2); 221 222 st32d(j, Fout1, sc1); 223 st32d(j, Fout, ff0); 224 st32d(j, Fout2, sc2); 225 } 226} 227 228 229#define OVERRIDE_KFBFLY5 230static void kf_bfly5( 231 kiss_fft_cpx *Fout, 232 const int fstride, 233 const kiss_fft_cfg st, 234 int m 235 ) 236{ 237 register int * restrict tw1; 238 register int * restrict tw2; 239 register int * restrict tw3; 240 register int * restrict tw4; 241 register int * restrict Fout1; 242 register int * restrict Fout2; 243 register int * restrict Fout3; 244 register int * restrict Fout4; 245 register int fstride2, fstride3, fstride4; 246 register int i, j; 247 register int yab_msb, yab_lsb, yba_msb, yba_lsb; 248 register int _inv = !st->inverse; 249 250 251 Fout1=(int*)Fout+m; 252 Fout2=(int*)Fout+(m<<1); 253 Fout3=(int*)Fout+(3 *m); 254 Fout4=(int*)Fout+(m<<2); 255 256 tw1 = tw2 = tw3 = tw4 = (int*)st->twiddles; 257 258 i = tw1[fstride*m]; 259 yab_lsb = tw1[fstride*(m<<1)]; 260 yab_msb = pack16msb(i, yab_lsb); 261 yab_lsb = pack16lsb(i, yab_lsb); 262 yba_msb = funshift2(-sex16(yab_msb), yab_msb); 263 yba_lsb = funshift2(yab_lsb, yab_lsb); 264 265 fstride2 = fstride << 1; 266 fstride3 = fstride * 3; 267 fstride4 = fstride << 2; 268 269 for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3,tw4+=fstride4 ) 270 { register int sc0, sc1, sc2, sc3, sc4, sc5, sc6; 271 register int sc7, sc8, sc9, sc10, sc11, sc12; 272 register int ff0, sc78_msb, sc78_lsb, sc90_msb, sc90_lsb; 273 274 sc0 = ld32x(Fout,i); 275 sc1 = ld32x(Fout1,i); 276 sc2 = ld32x(Fout2,i); 277 sc3 = ld32x(Fout3,i); 278 sc4 = ld32x(Fout4,i); 279 sc5 = ld32(tw1); 280 sc6 = ld32(tw2); 281 sc7 = ld32(tw3); 282 sc8 = ld32(tw4); 283 284 if ( _inv ) 285 { 286 TM_DIV(sc0, sc0, 5); 287 TM_DIV(sc1, sc1, 5); 288 TM_DIV(sc2, sc2, 5); 289 TM_DIV(sc3, sc3, 5); 290 TM_DIV(sc4, sc4, 5); 291 } 292 293 ff0 = sc0; 294 295 TM_MUL(sc1, sc1, sc5); 296 TM_MUL(sc2, sc2, sc6); 297 TM_MUL(sc3, sc3, sc7); 298 TM_MUL(sc4, sc4, sc8); 299 TM_ADD(sc7, sc1, sc4); 300 TM_SUB(sc10,sc1, sc4); 301 TM_ADD(sc8, sc2, sc3); 302 TM_SUB(sc9, sc2, sc3); 303 304 TM_ADD(ff0, ff0, sc7); 305 TM_ADD(ff0, ff0, sc8); 306 st32d(j, Fout, ff0); 307 308 sc78_msb = pack16msb(sc7,sc8); 309 sc78_lsb = pack16lsb(sc7,sc8); 310 sc90_msb = pack16msb(sc10,sc9); 311 sc90_lsb = pack16lsb(sc10,sc9); 312 313 sc5 = pack16lsb( sround(ifir16(sc78_msb,yab_lsb)), sround(ifir16(sc78_lsb,yab_lsb))); 314 sc6 = pack16lsb(-sround(ifir16(sc90_lsb,yab_msb)), sround(ifir16(sc90_msb,yab_msb))); 315 316 TM_ADD(sc5, sc5, sc0); 317 TM_SUB(sc1, sc5, sc6); 318 TM_ADD(sc4, sc5, sc6); 319 st32d(j, Fout1, sc1); 320 st32d(j, Fout4, sc4); 321 322 sc11 = pack16lsb( sround(ifir16(sc78_msb,yba_lsb)), sround(ifir16(sc78_lsb,yba_lsb))); 323 sc12 = pack16lsb(-sround(ifir16(sc90_lsb,yba_msb)), sround(ifir16(sc90_msb,yba_msb))); 324 325 TM_ADD(sc11, sc11, sc0); 326 TM_ADD(sc2, sc11, sc12); 327 TM_SUB(sc3, sc11, sc12); 328 st32d(j, Fout2, sc2); 329 st32d(j, Fout3, sc3); 330 331 } 332} 333 334 335#define OVERRIDE_KF_BFLY_GENERIC 336static void kf_bfly_generic( 337 kiss_fft_cpx * restrict Fout, 338 const size_t fstride, 339 const kiss_fft_cfg st, 340 int m, 341 int p 342 ) 343{ 344 register int _inv = !st->inverse; 345 register int i, j, k, l; 346 register int * restrict twiddles = (int*)st->twiddles; 347 register int Norig = st->nfft; 348 349 CHECKBUF(scratchbuf,nscratchbuf,p); 350 351 for ( i=0; i<m; ++i ) 352 { register int sc10; 353 354 for ( j=0,k=i ; j<p ; ++j,k+=m ) 355 { register int f10; 356 357 f10 = ld32x(Fout,k); 358 359 if ( _inv ) 360 { TM_DIV(f10, f10, p); 361 } 362 363 st32d(j<<2, scratchbuf, f10); 364 } 365 366 for ( j=0,k=i,sc10=ld32(scratchbuf) ; j<p ; ++j,k+=m ) 367 { 368 register int twidx = 0; 369 register int f10; 370 371 for ( l=1,f10 = sc10 ; l<p ; ++l ) 372 { register int tw, sc; 373 374 twidx += fstride * k; 375 if ( twidx>=Norig ) 376 { twidx -= Norig; 377 } 378 379 sc = ld32x(scratchbuf,l); 380 tw = ld32x(twiddles,twidx); 381 382 TM_MUL(sc, sc, tw); 383 TM_ADD(f10, f10, sc); 384 } 385 st32d(k<<2, Fout, f10); 386 } 387 } 388} 389 390#else 391 392#define OVERRIDE_KFBFLY2 393static void kf_bfly2( 394 kiss_fft_cpx * Fout, 395 const size_t fstride, 396 const kiss_fft_cfg st, 397 int m 398 ) 399{ 400 register kiss_fft_cpx * restrict Fout2; 401 register kiss_fft_cpx * restrict tw1 = st->twiddles; 402 403 Fout2 = Fout + m; 404 405 do 406 { 407 register kiss_fft_cpx _fout2, _fout, t; 408 409 _fout2 = *Fout2; 410 _fout = *Fout; 411 412 C_MUL ( t, _fout2, *tw1); 413 C_SUB (_fout2, _fout, t); 414 C_ADD (_fout, _fout, t); 415 416 *Fout2 = _fout2; 417 *Fout = _fout; 418 419 tw1 += fstride; 420 ++Fout2; 421 ++Fout; 422 423 } while ( --m ); 424} 425 426#define OVERRIDE_KFBFLY4 427static void kf_bfly4( 428 kiss_fft_cpx * Fout, 429 const int fstride, 430 const kiss_fft_cfg st, 431 int m 432 ) 433{ 434 register kiss_fft_cpx * restrict tw1,* restrict tw2,* restrict tw3; 435 register kiss_fft_cpx * restrict Fout1, * restrict Fout2, * restrict Fout3; 436 register int _inv = !st->inverse; 437 438 tw3 = tw2 = tw1 = st->twiddles; 439 440 Fout1 = Fout + m; 441 Fout2 = Fout + (m << 1); 442 Fout3 = Fout + (m * 3); 443 444 do { 445 446 register kiss_fft_cpx _fout; 447 register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5; 448 449 _fout = *Fout; 450 451 C_MUL( sc0,*Fout1, *tw1); 452 C_MUL( sc1,*Fout2, *tw2); 453 C_MUL( sc2,*Fout3, *tw3); 454 C_SUB( sc5, _fout, sc1); 455 C_ADD( _fout, _fout, sc1); 456 C_ADD( sc3, sc0, sc2); 457 C_SUB( sc4, sc0, sc2); 458 C_SUB(*Fout2, _fout, sc3); 459 C_ADD( *Fout, _fout, sc3); 460 461 tw1 += fstride; 462 tw2 += (fstride << 1); 463 tw3 += (fstride * 3); 464 465 if ( _inv ) 466 { 467 Fout1->r = sc5.r + sc4.i; 468 Fout1->i = sc5.i - sc4.r; 469 Fout3->r = sc5.r - sc4.i; 470 Fout3->i = sc5.i + sc4.r; 471 } 472 else 473 { Fout1->r = sc5.r - sc4.i; 474 Fout1->i = sc5.i + sc4.r; 475 Fout3->r = sc5.r + sc4.i; 476 Fout3->i = sc5.i - sc4.r; 477 } 478 479 480 ++Fout; ++Fout1; ++Fout2; ++Fout3; 481 482 } while(--m); 483} 484 485#define OVERRIDE_KFBFLY3 486static void kf_bfly3( 487 kiss_fft_cpx * Fout, 488 const int fstride, 489 const kiss_fft_cfg st, 490 int m 491 ) 492{ 493 register kiss_fft_cpx * restrict Fout1, * restrict Fout2; 494 register kiss_fft_cpx * restrict tw1,* restrict tw2; 495 register float epi; 496 497 tw1 = tw2 = st->twiddles; 498 epi = st->twiddles[fstride*m].i; 499 Fout1 = Fout + m; 500 Fout2 = Fout + (m << 1); 501 502 do { 503 504 register kiss_fft_cpx _fout; 505 register kiss_fft_cpx sc0, sc1, sc2, sc3; 506 507 _fout = *Fout; 508 509 C_MUL( sc1, *Fout1, *tw1); 510 C_MUL( sc2, *Fout2, *tw2); 511 C_ADD( sc3, sc1, sc2); 512 C_SUB( sc0, sc1, sc2); 513 tw1 += fstride; 514 tw2 += (fstride << 1); 515 516 sc1.r = _fout.r - HALF_OF(sc3.r); 517 sc1.i = _fout.i - HALF_OF(sc3.i); 518 519 C_MULBYSCALAR(sc0, epi); 520 C_ADD(*Fout, _fout, sc3); 521 522 Fout2->r = sc1.r + sc0.i; 523 Fout2->i = sc1.i - sc0.r; 524 525 Fout1->r = sc1.i - sc0.i; 526 Fout1->i = sc1.r + sc0.r; 527 528 ++Fout; ++Fout1; ++Fout2; 529 530 } while(--m); 531} 532 533#define OVERRIDE_KFBFLY5 534static void kf_bfly5( 535 kiss_fft_cpx * Fout, 536 const size_t fstride, 537 const kiss_fft_cfg st, 538 int m 539 ) 540{ 541 register kiss_fft_cpx * restrict Fout1,* restrict Fout2,* restrict Fout3,* restrict Fout4; 542 register int u; 543 register kiss_fft_cpx *tw; 544 register float yar, yai, ybr, ybi; 545 546 Fout1=Fout+m; 547 Fout2=Fout+(m<<1); 548 Fout3=Fout+(m*3); 549 Fout4=Fout+(m<<2); 550 551 tw = st->twiddles; 552 yar = tw[fstride*m].r; 553 yai = tw[fstride*m].i; 554 ybr = tw[fstride*2*m].r; 555 ybi = tw[fstride*2*m].i; 556 557 for ( u=0; u<m; ++u ) 558 { 559 register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc10, sc11, sc12; 560 561 sc0 = *Fout; 562 563 C_MUL( sc1,*Fout1, tw[u*fstride]); 564 C_MUL( sc2,*Fout2, tw[2*u*fstride]); 565 C_MUL( sc3,*Fout3, tw[3*u*fstride]); 566 C_MUL( sc4,*Fout4, tw[4*u*fstride]); 567 568 C_ADD( sc7, sc1, sc4); 569 C_SUB( sc10, sc1, sc4); 570 C_ADD( sc8, sc2, sc3); 571 C_SUB( sc9, sc2, sc3); 572 573 Fout->r = sc0.r + sc7.r + sc8.r; 574 Fout->i = sc0.i + sc7.i + sc8.i; 575 576 sc5.r = sc0.r + S_MUL(sc7.r,yar) + S_MUL(sc8.r,ybr); 577 sc5.i = sc0.i + S_MUL(sc7.i,yar) + S_MUL(sc8.i,ybr); 578 579 sc6.r = S_MUL(sc10.i,yai) + S_MUL(sc9.i,ybi); 580 sc6.i = -S_MUL(sc10.r,yai) - S_MUL(sc9.r,ybi); 581 582 C_SUB(*Fout1,sc5,sc6); 583 C_ADD(*Fout4,sc5,sc6); 584 585 sc11.r = sc0.r + S_MUL(sc7.r,ybr) + S_MUL(sc8.r,yar); 586 sc11.i = sc0.i + S_MUL(sc7.i,ybr) + S_MUL(sc8.i,yar); 587 sc12.r = - S_MUL(sc10.i,ybi) + S_MUL(sc9.i,yai); 588 sc12.i = S_MUL(sc10.r,ybi) - S_MUL(sc9.r,yai); 589 C_ADD(*Fout2,sc11,sc12); 590 C_SUB(*Fout3,sc11,sc12); 591 592 ++Fout1; ++Fout2; ++Fout3; ++Fout4; 593 } 594} 595 596 597#endif 598 599#endif 600