;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM/yasm, built on top of the x86inc/x86util macro layer
; (cglobal, INIT_XMM, mN register aliases, SWAP, mova/movu, 3-operand
; emulation of SSE instructions).

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Lane masks for andps: a 0 dword clears the lane, ~0 keeps it.
ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0,  0, ~0
ps_mask3: dd 0,  0,  0, ~0
ps_mask4: dd 0, ~0,  0,  0

; Per-lane multipliers used by imdct36_float while populating tmp[].
; Each constant appears twice because two adjacent lanes are processed
; with the same coefficient.
ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097

; Sign-bit masks for xorps: 0x80000000 flips the sign of that lane.
ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000   ; negate lanes 2 and 3
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000   ; negate lanes 1 and 3

; Five 16-byte coefficient rows, addressed by BUTTERF/BUTTERF2 with byte
; offsets 0, 16, 32, 48 and 64 (non-SSE3 code path).
ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
               dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same rows with the odd-lane (1 and 3) coefficients negated; used by the
; SSE3 path, where addsubps subtracts in even lanes and adds in odd lanes.
ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
               dd 1.0, -0.70710678118654752439,  0.0,  0.0

; Constants for four_imdct36_float. Every value is replicated across all four
; lanes, so entry k is one 16-byte row at [costabs + 16*k] (k = 0..17).
costabs:  times 4 dd  0.98480773
          times 4 dd  0.93969262
          times 4 dd  0.86602539
          times 4 dd -0.76604444
          times 4 dd -0.64278764
          times 4 dd  0.50000000
          times 4 dd -0.50000000
          times 4 dd -0.34202015
          times 4 dd -0.17364818
          times 4 dd  0.50190992
          times 4 dd  0.51763808
          times 4 dd  0.55168896
          times 4 dd  0.61038726
          times 4 dd  0.70710677
          times 4 dd  0.87172341
          times 4 dd  1.18310082
          times 4 dd  1.93185163
          times 4 dd  5.73685646

; Used only to scale output byte offsets in imdct36_float
; (consecutive outputs are 4*SBLIMIT bytes apart).
%define SBLIMIT 32
SECTION .text

; PSHUFD dst, src, imm8
; dst = dwords of src selected by imm8. Uses pshufd when SSE2 is available and
; AVX is not; otherwise uses shufps with src as both inputs.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %1, %2, %3
%else
    shufps %1, %2, %2, %3
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
; AVX: one 3-operand shufps. Otherwise the two halves are moved separately
; (movlhps writes the high half of %1, movhlps the low half).
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps %1, %2, %3, 0x4e
%else
    movlhps %1, %3
    movhlps %1, %2
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
; SSSE3 and later: a single palignr. Otherwise built from BUILDINVHIGHLOW
; followed by a shufps against %3.
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr  %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%endif
%endmacro

; INVERTHL dst, src
; dst = src with its low and high 64-bit halves exchanged.
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD  %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

; BUTTERF data, scratch, table_byte_offset
; With %1 = {a,b,c,d} on entry:
;   1. %1 = {a+c, b+d, a-c, b-d}   (half swap, negate lanes 2/3, add)
;   2. %1 is multiplied by the coefficient row at the given offset
;   3. %1 = {p0+p1, p0-p1, p2+p3, p2-p3}, where p is the product of step 2
;      expressed with the ps_cosh (non-negated) coefficients.
; The SSE3 path gets the same result from addsubps plus the ps_cosh_sse3
; table, whose odd-lane coefficients are pre-negated.
; %2 is clobbered.
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; BUTTERF2 data, scratch, table_byte_offset
; Like steps 2 and 3 of BUTTERF, but without the initial half butterfly and
; with shuffle 0xe1, which exchanges only lanes 0 and 1 (lanes 2 and 3 of the
; shuffled copy stay in place). %2 is clobbered.
%macro BUTTERF2 3
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xe1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xe1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; STORE vec, scratch, base, stride
; Scatters the four floats of %1 to [%3], [%3 + %4], [%3 + 2*%4] and
; [%3 + 3*%4] (lane i goes to base + i*stride, stride in bytes).
; In the non-SSE4 path %2 is clobbered and %1 is left with adjacent lanes
; swapped. The SSE4 path is not instantiated anywhere in this file.
%macro STORE 4
%if cpuflag(sse4)
    movss     [%3       ], %1
    extractps dword [%3 +   %4], %1, 1
    extractps dword [%3 + 2*%4], %1, 2
    extractps dword [%3 + 3*%4], %1, 3
%else
    movhlps %2, %1
    movss   [%3       ], %1
    movss   [%3 + 2*%4], %2
    shufps  %1, %1, 0xb1
    movss   [%3 +   %4], %1
    movhlps %2, %1
    movss   [%3 + 3*%4], %2
%endif
%endmacro

; LOAD dst, scratch, base, stride
; Gathers the first float found at base, base + stride, base + 2*stride and
; base + 3*stride into lanes 0..3 of %1. Note that 8 bytes are read at each
; of the four addresses (movlps/movhps); the second float of each pair is
; discarded by the final shufps. %2 is clobbered.
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88
%endmacro

; LOADA64 dst, addr
; Loads 16 bytes from addr: one unaligned load on AVX, two 64-bit loads
; (low half, then high half) otherwise.
%macro LOADA64 2
%if cpuflag(avx)
   movu     %1, [%2]
%else
   movlps   %1, [%2]
   movhps   %1, [%2 + 8]
%endif
%endmacro

; DEFINE_IMDCT
; Emits imdct36_float for the instruction set selected by the preceding
; INIT_XMM. cglobal declares 4 arguments (out, buf, in, win), 4 general
; purpose registers and 9 vector registers.
;   in  : 18 floats are read (bytes 0..71). The values are only modified in
;         registers; nothing is written back to in.
;   buf : previous overlap, read at a 16-byte stride, added to the windowed
;         result and then overwritten with the next overlap.
;   out : results are stored at a stride of 4*SBLIMIT bytes.
;   win : window coefficients; [win .. win + 18 floats) is used for out and
;         [win + 20 floats ..) for the overlap written to buf.
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, m6, [ps_mask]
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4, m4              ; upper lanes of m4 must be zero: only in[16..17] exist
    movlps  m4, [inq+64]
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps  m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps  m3, m3, m7
    addps  m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5    ; zero out high values
    subps   m6, m6, m4

    subps  m5, m0, m3

    ; x86-64 has a ninth register: park m0 - m3 in m8 so it can be reused
    ; below. On x86-32 the difference is recomputed instead.
%if ARCH_X86_64
    SWAP   m5, m8
%endif

    mulps  m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps  m5, m8, [ps_val2]
%else
    mulps  m5, m5, [ps_val2]
%endif
    addps  m7, m7, m5

    mulps  m5, m6, [ps_val1]
    subps  m7, m7, m5

%if ARCH_X86_64
    SWAP   m5, m8
%else
    subps  m5, m0, m3
%endif

    subps  m5, m5, m6
    addps  m5, m5, m2

    shufps m6, m4, m3, 0xe4
    subps  m6, m6, m2
    mulps  m6, m6, [ps_val3]

    addps  m4, m4, m1
    mulps  m4, m4, [ps_val4]

    shufps m1, m1, m0, 0xe4
    addps  m1, m1, m2
    mulps  m1, m1, [ps_val5]

    mulps  m3, m3, [ps_val6]
    mulps  m0, m0, [ps_val7]
    addps  m0, m0, m3

    xorps  m2, m1, [ps_p1p1m1m1]
    subps  m2, m2, m4
    addps  m2, m2, m0

    addps  m3, m4, m0
    subps  m3, m3, m6
    xorps  m3, m3, [ps_p1p1m1m1]

    shufps m0, m0, m4, 0xe4
    subps  m0, m0, m1
    addps  m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}

    ; Third argument is the byte offset of the coefficient row in
    ; ps_cosh / ps_cosh_sse3; second argument is a scratch register.
    BUTTERF  m0, m1, 0
    BUTTERF  m7, m2, 16
    BUTTERF  m3, m6, 32
    BUTTERF  m4, m1, 48
    BUTTERF2 m5, m1, 64

    ; permutation:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP      m4, m3
    ; permutation done

    ; Windowing and overlap-add: out = value * win + buf, one float every
    ; 4*SBLIMIT bytes in out and every 16 bytes in buf.

    ; last two outputs (byte offsets 64*SBLIMIT and 68*SBLIMIT)
    PSHUFD  m6, m2, 0xb1
    movss   m4, [bufq + 4*68]
    movss   m7, [bufq + 4*64]
    unpcklps  m7, m7, m4
    mulps   m6, m6, [winq + 16*4]
    addps   m6, m6, m7
    movss   [outq + 64*SBLIMIT], m6
    shufps  m6, m6, m6, 0xb1
    movss   [outq + 68*SBLIMIT], m6

    mulps   m6, m3, [winq + 4*4]
    LOAD    m4, m7, bufq + 4*16, 16
    addps   m6, m6, m4
    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps  m4, m0, m3, 0xb5
    mulps   m4, m4, [winq + 8*4]
    LOAD    m7, m6, bufq + 4*32, 16
    addps   m4, m4, m7
    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps  m3, m3, m2, 0xb1
    mulps   m3, m3, [winq + 12*4]
    LOAD    m7, m6, bufq + 4*48, 16
    addps   m3, m3, m7
    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps   m2, m2, [winq]
    LOAD    m6, m7, bufq, 16
    addps   m2, m2, m6
    STORE   m2, m7, outq, 4*SBLIMIT

    ; Next overlap: windowed with the coefficients starting at win + 20*4
    ; and written to buf at a 16-byte stride (no addition here).
    mulps    m4, m1, [winq + 20*4]
    STORE    m4, m7, bufq, 16

    mulps    m3, m5, [winq + 24*4]
    STORE    m3, m7, bufq + 4*16, 16

    shufps   m0, m0, m5, 0xb0
    mulps    m0, m0, [winq + 28*4]
    STORE    m0, m7, bufq + 4*32, 16

    shufps   m5, m5, m1, 0xb1
    mulps    m5, m5, [winq + 32*4]
    STORE    m5, m7, bufq + 4*48, 16

    shufps   m1, m1, m1, 0xb1
    mulps    m1, m1, [winq + 36*4]
    movss    [bufq + 4*64], m1
    shufps   m1, m1, 0xb1
    movss    [bufq + 4*68], m1
    RET
%endmacro

; Instantiate imdct36_float once per instruction set; the cpuflag() tests in
; the macros above select the matching code path for each instantiation.
INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

; Register spilling helpers for four_imdct36_float, which needs 16 vector
; registers' worth of live values.
;  - x86-64: slots 8..15 are the real registers m8..m15, so SPILL/UNSPILL are
;    plain SWAPs and SPILLED(x) names the register directly.
;  - x86-32: slots 8..15 are 16-byte cells in the tmp buffer at byte offsets
;    128..255 (the function's explicit temporaries occupy bytes 0..127).
;    movaps is used, so tmp is assumed to be 16-byte aligned.
%if ARCH_X86_64
%define SPILL   SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%endif

; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float for the instruction set selected by the preceding
; INIT_XMM. cglobal declares 5 arguments (out, buf, in, win, tmp), 5 general
; purpose registers and 16 vector registers.
; Four inputs located 72 bytes (18 floats) apart are processed at once, one
; per vector lane:
;   in  : 4 * 18 floats are read.
;   tmp : scratch memory; bytes 0..127 hold explicit temporaries, and on
;         x86-32 bytes 128..255 additionally hold the SPILL slots.
;   buf : previous overlap is read and the next overlap is written, 16 bytes
;         (one float per lane) per coefficient index.
;   out : 18 rows of 16 bytes, stored 128 bytes apart.
;   win : window rows at byte offset 16*k for out, and 16*(k + 20) for the
;         overlap written to buf.
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; Elements 16 and 17 of the four inputs: 8 bytes are loaded per input,
    ; then de-interleaved into m0 (first float of each) and m5 (second float).
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    ; Elements 12..15. The 72-byte input stride is not a multiple of 16, so
    ; inputs 1 and 3 use unaligned loads; inputs 0 and 2 (offsets 0 and 144)
    ; use aligned loads, which assumes in is 16-byte aligned.
    mova     m1, [inq+48]
    movu     m6, [inq+48 +   72]
    mova     m7, [inq+48 + 2*72]
    movu     m3, [inq+48 + 3*72]

    ; 4x4 transpose (x86util macro); the last argument is presumably a
    ; scratch register number.
    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; Elements 8..11 of the four inputs.
    mova     m4, [inq+32]
    movu     m5, [inq+32 +   72]
    mova     m2, [inq+32 + 2*72]
    movu     m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; Elements 4..7 of the four inputs.
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; Elements 0..3 of the four inputs.
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    ; From here on the code is a fixed sequence of adds, subtracts and
    ; multiplies by costabs rows. Spill slots and tmp cells are reused for
    ; different values as it proceeds, so instruction order is significant.
    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]        ; fetch the old value before the cell is reused
    mova     [tmpq+4*28], m5
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]

    ; Output stage. Each group below combines two intermediate values,
    ; scales them by a pair of costabs rows (9 & 17, 10 & 16, 11 & 15,
    ; 12 & 14, and finally 13 alone), and produces output rows j and 17-j:
    ;   out[128*j] = value * win[16*j] + buf[16*j]
    ;   buf[16*j]  = value * win[16*(j + 20)]
    ; (all offsets in bytes).

    ; rows 9 and 8
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]
    subps    m1, m4, m2
    addps    m4, m2
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+1152], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+1024], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4

    ; rows 17 and 0
    addps    m2, m6, m5
    subps    m6, m5
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+2176], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2

    ; rows 10 and 7
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+1280], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+896], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6

    ; rows 16 and 1
    addps    m1, m2, m0
    subps    m2, m0
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+2048], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+128], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1

    ; rows 11 and 6
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+1408], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+768], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2

    ; rows 15 and 2
    addps    m0, m5, m3
    subps    m5, m3
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+1920], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+256], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0

    ; rows 12 and 5
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+1536], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+640], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5

    ; rows 14 and 3
    addps    m6, m2, m0
    subps    m2, m0
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+1792], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+384], m2
    mulps    m0, m6, [winq+4*136]
    mova     [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6

    ; rows 13 and 4
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+1664], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+512], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro

; Instantiate four_imdct36_float for SSE and, when the assembler supports it,
; for AVX.
INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif