;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

%macro mv0_pixels_mc8 0
    lea          r4, [r2*2]
.next4rows:
    movq         mm0, [r1]
    movq         mm1, [r1+r2]
    add          r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq         [r0], mm0
    movq         [r0+r2], mm1
    add          r0, r4
    movq         mm0, [r1]
    movq         mm1, [r1+r2]
    add          r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq         [r0], mm0
    movq         [r0+r2], mm1
    add          r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov          r7, r5
    and          r7, 6          ; &~1 for mx/my=[0,7]
    lea          r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov          r0, r5
    and          r0, 6          ; &~1 for mx/my=[0,7]
    lea          r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov          r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov          r6, r2         ; dxy = x ? 1 : stride
    jne .both_non_zero
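    ; The 1-D path below computes, per output byte (a scalar sketch of the
    ; intended arithmetic, for orientation only):
    ;   dst[i] = (A*src[i] + B*src[i+dxy] + r) >> 3
    ; where B = mx|my (exactly one of them is non-zero here), A = 8 - B,
    ; dxy selects the horizontal (1) or vertical (stride) neighbour, and r is
    ; the per-codec rounding constant (pw_4 for H.264, pw_3 for VC-1,
    ; table-driven for RV40).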
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea          r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov          r5, r0m
%endif
%endif

    movd         m5, r4d
    movq         m4, [pw_8]
    movq         m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd    m5, m5
    punpckldq    m5, m5          ; mm5 = B = x
    pxor         m7, m7
    psubw        m4, m5          ; mm4 = A = 8-x

.next1drow:
    movq         m0, [r1]        ; mm0 = src[0..7]
    movq         m2, [r1+r6]     ; mm2 = src[1..8]

    movq         m1, m0
    movq         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    pmullw       m0, m4          ; [mm0,mm1] = A * src[0..7]
    pmullw       m1, m4
    pmullw       m2, m5          ; [mm2,mm3] = B * src[1..8]
    pmullw       m3, m5

    paddw        m0, m6
    paddw        m1, m6
    paddw        m0, m2
    paddw        m1, m3
    psrlw        m0, 3
    psrlw        m1, 3
    packuswb     m0, m1
    CHROMAMC_AVG m0, [dest_reg]
    movq         [dest_reg], m0  ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add          dest_reg, r2
    add          r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd         m4, r4d         ; x
    movd         m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea          r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov          r5, r0m
%endif
%endif
    mov          r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16         ; AA and DD

    punpcklwd    m4, m4
    punpcklwd    m6, m6
    punpckldq    m4, m4          ; mm4 = x words
    punpckldq    m6, m6          ; mm6 = y words
    movq         m5, m4
    pmullw       m4, m6          ; mm4 = x * y
    psllw        m5, 3
    psllw        m6, 3
    movq         m7, m5
    paddw        m7, m6
    movq         [rsp+8], m4     ; DD = x * y
    psubw        m5, m4          ; mm5 = B = 8x - xy
    psubw        m6, m4          ; mm6 = C = 8y - xy
    paddw        m4, [pw_64]
    psubw        m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor         m7, m7
    movq         [rsp], m4

    movq         m0, [r1]        ; mm0 = src[0..7]
    movq         m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add          r1, r2

    movq         m2, m0
    movq         m3, m1
    punpckhbw    m0, m7
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    pmullw       m0, [rsp]
    pmullw       m2, [rsp]
    pmullw       m1, m5
    pmullw       m3, m5
    paddw        m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw        m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq         m0, [r1]
    movq         m1, m0
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    pmullw       m0, m6
    pmullw       m1, m6
    paddw        m2, m0
    paddw        m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq         m1, [r1+1]
    movq         m0, m1
    movq         m4, m1
    punpcklbw    m0, m7
    punpckhbw    m4, m7
    pmullw       m0, [rsp+8]
    pmullw       m4, [rsp+8]
    paddw        m2, m0
    paddw        m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq         m0, [r1]

    paddw        m2, [rnd_2d_%2+rnd_bias*8]
    paddw        m3, [rnd_2d_%2+rnd_bias*8]
    psrlw        m2, 6
    psrlw        m3, 6
    packuswb     m2, m3
    CHROMAMC_AVG m2, [dest_reg]
    movq         [dest_reg], m2  ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add          dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6         ; restore stack pointer
    RET
%endmacro
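; chroma_mc4_mmx_func applies the same bilinear filter to 4-pixel-wide rows.
; As a scalar sketch of the weights involved (for orientation only, not code
; taken from the C reference):
;   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + rnd) >> 6
; The loop below keeps the horizontally filtered previous row in a register,
; so each source row is unpacked and multiplied only once while contributing
; to two output rows.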
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor          m7, m7
    movd          m2, r4d        ; x
    movd          m3, r5d        ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6          ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar           r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    movd          m0, [r1]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd          [r0], m1
    add           r0, r2

    movd          m6, [r1]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd          [r0], m1
    add           r0, r2
    sub           r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov           r6d, r4d
    shl           r4d, 16
    sub           r4d, r6d
    add           r4d, 8
    imul          r5d, r4d       ; x*y<<16 | y*(8-x)
    shl           r4d, 3
    sub           r4d, r5d       ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5         ; mm5 = {A,B,A,B}
    punpckldq     m6, m6         ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94   ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5         ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94   ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0         ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd          r5d, m1
    mov           [r0], r5w
    add           r0, r2
    sub           r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264
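; The SSSE3 versions below rely on pmaddubsw: bytes of two horizontally
; adjacent source pixels are interleaved with punpcklbw and multiplied by a
; packed pair of byte weights, so a single instruction produces the filtered
; words.  A rough scalar view of the 2-D case (a sketch of how the weights
; packed into m7/m6 are meant to combine, for orientation only):
;   tmp[i]  = (8-y) * ((8-x)*cur[i]  + x*cur[i+1])
;           +   y   * ((8-x)*next[i] + x*next[i+1])
;   dst[i]  = (tmp[i] + rnd) >> 6    ; rnd = 32 for H.264, 28 for VC-1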
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov           r6d, r5d
    or            r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test          r5d, r5d
    je .my_is_zero
    test          r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov           r6d, r4d
    shl           r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8          ; x*255+8 = x<<8 | (8-x)
    sub           r6d, r5d
    imul          r6, r4         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul          r4d, r5d       ;  y   *(x*255+8) =  y   *x<<8 |  y   *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6

.next2rows:
    movq          m1, [r1+r2*1]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq          [r0], m1
    movhps        [r0+r2], m1
    sub           r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov           r5d, r4d
    shl           r4d, 8
    add           r4, 8
    sub           r4, r5         ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1]
    movq          m1, [r1+1]
    movq          m2, [r1+r2]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq          [r0], m0
    movhps        [r0+r2], m0
    sub           r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov           r4d, r5d
    shl           r5d, 8
    add           r5, 8
    sub           r5, r4         ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1]
    movq          m1, [r1+r2]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq          [r0], m0
    movhps        [r0+r2], m0
    sub           r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov           r6, r4
    shl           r4d, 8
    sub           r4d, r6d
    mov           r6, 8
    add           r4d, 8         ; x*255+8 = x<<8 | (8-x)
    sub           r6d, r5d
    imul          r6d, r4d       ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul          r4d, r5d       ;  y   *(x*255+8) =  y   *x<<8 |  y   *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1]
    movd          m3, [r1+r2*2]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0]
    CHROMAMC_AVG  m3, [r0+r2]
    movd          [r0], m1
    movd          [r0+r2], m3
    sub           r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro
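; Instantiation: CHROMAMC_AVG (and CHROMAMC_AVG4 in the MMX code) expand to
; nothing for the put variants and to PAVGB-based averaging for the avg
; variants, while rnd_1d_%2/rnd_2d_%2 resolve to the per-codec rounding
; constants defined above, so each macro body is emitted once per
; codec/operation pair below.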
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264