;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
%if mmsize == 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%else
    punpcklwd    m0, m1
    punpcklwd    m2, m3
    SBUTTERFLY   dq, 0, 2, 4
    MOVHL        m1, m0
    MOVHL        m3, m2
%endif
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

%macro IDCT8_1D 2
    psraw        m0, m1, 1
    SWAP 0, 1
    psraw        m4, m5, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    psraw        m7, m1, 2
    SWAP 7,1
    psraw        m3, m4, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    psraw        m5, m6, 1
    SWAP 5,6
    psraw        m4, m2, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

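; IDCT8_1D_FULL: load coefficient rows 1-3 and 5-7 of the 8x8 block at %1 into
; m1-m3/m5-m7; rows 0 and 4 are passed to IDCT8_1D as memory operands and only
; loaded there, once the registers holding rows 2 and 5 have been consumed.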
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP 0, 8
    SWAP 4, 9
    IDCT8_1D     m8, m9
    SWAP 6, 8
    SWAP 7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP 0, 8
    SWAP 1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    movsxdifnidn r2, r2d
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1   [%2     ], m2
    %1   [%2+%3  ], m3
    %1   [%2+%3*2], m4
    %1   [%2+%4  ], m5
%endmacro

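; The DC-only paths below build on DC_ADD_MMXEXT_INIT/OP: the DC is rounded
; with (dc + 32) >> 6 and split into a saturated positive byte vector (m0) and
; a saturated negated vector (m1), so DC_ADD_MMXEXT_OP can apply it to four
; rows at a time with paddusb/psubusb and clamp correctly at 0 and 255.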
INIT_MMX mmxext
%if ARCH_X86_64
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov dword  [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov dword  [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov word   [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif

    mov          r5, 16  ; i
    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t

    call         h264_idct_add8_mmx_plane
    add          r5, 4
    call         h264_idct_add8_mmx_plane

%if ARCH_X86_64
    add       dst2q, gprsize ; dest[1]
%else
    add        r0mp, gprsize
%endif

    add          r5, 4   ; set to 32
    add          r2, 256 ; set to i * 16 * sizeof(dctcoef)

    call         h264_idct_add8_mmx_plane
    add          r5, 4
    call         h264_idct_add8_mmx_plane

    RET ; TODO: check rep ret after a function call

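; h264_idct_dc_add8_mmxext handles the DC of two horizontally adjacent 4x4
; blocks ([r2] and [r2+32]) in one pass: both DCs are rounded with
; (dc + 32) >> 6, expanded to the byte pattern d d d d D D D D (plus its
; negated counterpart), and applied to 4 rows of 8 pixels with saturating
; byte arithmetic via DC_ADD_MMXEXT_OP.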
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movsxdifnidn r3, r3d
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movsxdifnidn r3, r3d
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D     w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw        m0, [pw_32]
    IDCT4_1D     w,0,1,2,3,4,5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
REP_RET

%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
REP_RET

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

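; The add8 entry point below takes uint8_t **dest (one pointer per chroma
; plane): r7 (or r0m on x86_32) holds the dest array and is advanced by
; gprsize to step from dest[0] to dest[1]. Each add8_sse2_cycle runs the full
; 8x4 iDCT when the nnzc entry is set and falls back to
; h264_idct_dc_add8_mmxext when only the DC coefficients are nonzero.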
; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
REP_RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT 1-3
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
%else
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endif
%endmacro

%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    DEQUANT      %1
    STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
    STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT      m0, m1, %1
    STORE_WORDS  m0, 0, 1, 4, 5
    STORE_WORDS  m1, 2, 3, 6, 7

    DEQUANT      m2, m3, %1
    STORE_WORDS  m2, 8, 9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
    RET
%endmacro

INIT_MMX sse2
IDCT_DC_DEQUANT 7

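; The xmm versions of h264_idct_add_8 below (IDCT_XMM) use a local STORE_DIFFx2
; that writes 4 pixels per row with movd; with a new enough NASM, the
; definition pulled in from x86util.asm is removed first so it can be
; redefined here.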
%ifdef __NASM_VER__
%if __NASM_MAJOR__ >= 2 && __NASM_MINOR__ >= 4
%unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
%endif
%endif
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movd         %3, [%7]
    movd         %4, [%7+%8]
    psraw        %1, %6
    psraw        %2, %6
    punpcklbw    %3, %5
    punpcklbw    %4, %5
    paddw        %3, %1
    paddw        %4, %2
    packuswb     %3, %5
    packuswb     %4, %5
    movd       [%7], %3
    movd    [%7+%8], %4
%endmacro

%macro DC_ADD_INIT 1
    add         %1d, 32
    sar         %1d, 6
    movd         m0, %1d
    pshuflw      m0, m0, 0
    lea          %1, [3*stride_q]
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro IDCT_XMM 1

INIT_XMM %1

cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD    dst_q, block_q, stride_q
RET

cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    movsx       r3d, word [block_q]
    mov dword [block_q], 0
    DC_ADD_INIT r3
    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
RET

%endmacro

IDCT_XMM sse2
IDCT_XMM avx