;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Fiona Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; pshufb mask used by the SSSE3 TM predictor: broadcasts source byte 3 into
; the low byte of every word lane (0x80 lanes are zeroed by pshufb)
tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
; pmaddubsw coefficients for the plane predictors' H gradient:
; left half multiplies the 8 pixels left of the top-right corner by -8..-1,
; right half multiplies the 8 pixels right of it by 1..8
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_8

;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
;
; Copies the 16-byte row above the block into all 16 rows.
; r0 = src, r1 = stride; r2 is the loop counter (4 iterations x 4 rows).
;-----------------------------------------------------------------------------

INIT_XMM sse
cglobal pred16x16_vertical_8, 2,3
    sub    r0, r1               ; point at the row above the block
    mov    r2, 4
    movaps xmm0, [r0]           ; the 16 top neighbour pixels
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea    r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea    r0, [r0+r1*2]
    dec    r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
;
; Fills each row with its left neighbour pixel, two rows per iteration.
; mmxext path splats via punpcklbw+SPLATW and stores 8+8 bytes; the ssse3
; path splats with a single pshufb against pb_3 and stores 16 bytes at once.
;-----------------------------------------------------------------------------

%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
    mov    r2, 8
%if cpuflag(ssse3)
    mova   m2, [pb_3]           ; pshufb mask: broadcast byte 3 of the load
%endif
.loop:
    movd   m0, [r0+r1*0-4]      ; byte 3 of this dword is src[-1] of row 0
    movd   m1, [r0+r1*1-4]

%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
    SPLATW m0, m0, 3
    SPLATW m1, m1, 3
    mova [r0+r1*0+8], m0        ; mmx regs are 8 bytes: store the high half too
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea    r0, [r0+r1*2]
    dec    r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
;
; DC prediction: dc = (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5,
; then the block is filled with that byte. The top sum uses psadbw on mmx
; registers; the left column is summed with scalar movzx/add.
; r4 keeps the original src pointer for the store loop.
;-----------------------------------------------------------------------------

%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov    r4, r0
    sub    r0, r1
    pxor   mm0, mm0
    pxor   mm1, mm1
    psadbw mm0, [r0+0]          ; sum of top[0..7]
    psadbw mm1, [r0+8]          ; sum of top[8..15]
    dec    r0                   ; r0 now points at src[-1] of the row above
    movzx  r5d, byte [r0+r1*1]
    paddw  mm0, mm1
    movd   r6d, mm0
    lea    r0, [r0+r1*2]
%rep 7
    movzx  r2d, byte [r0+r1*0]
    movzx  r3d, byte [r0+r1*1]
    add    r5d, r2d
    add    r6d, r3d
    lea    r0, [r0+r1*2]
%endrep
    movzx  r2d, byte [r0+r1*0]  ; 16th left pixel
    add    r5d, r6d
    lea    r2d, [r2+r5+16]      ; + rounding bias
    shr    r2d, 5
%if cpuflag(ssse3)
    pxor   m1, m1               ; SPLATB_REG uses pshufb against a zero reg
%endif
    SPLATB_REG m0, r2, m1       ; broadcast dc byte into the full xmm reg

    mov    r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea    r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea    r4, [r4+r1*2]
    dec    r3d
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;
; VP8 TrueMotion: dst[y][x] = clip(top[x] + left[y] - topleft).
; The top row minus nothing is kept as words in xmm0/xmm1; each row adds
; the broadcast (left[y] - topleft) delta and packs back to bytes.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
    sub    r0, r1
    pxor   xmm2, xmm2
    movdqa xmm0, [r0]           ; top row
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2        ; top[0..7] as words
    punpckhbw xmm1, xmm2        ; top[8..15] as words
    movzx  r4d, byte [r0-1]     ; topleft pixel
    mov    r5d, 8
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d             ; left[y]   - topleft
    sub    r3d, r4d             ; left[y+1] - topleft
    movd   xmm2, r2d
    movd   xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2       ; broadcast delta to all 8 words
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw  xmm2, xmm0
    paddw  xmm3, xmm1
    paddw  xmm4, xmm0
    paddw  xmm5, xmm1
    packuswb xmm2, xmm3         ; packuswb also does the 0..255 clip
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea    r0, [r0+r1*2]
    dec    r5d
    jg .loop
    REP_RET

%if HAVE_AVX2_EXTERNAL
; AVX2 variant: the whole 16-pixel top row lives as words in one ymm register
; and four rows are produced per iteration.
INIT_YMM avx2
cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
    sub    dstq, strideq
    pmovzxbw m0, [dstq]         ; top row as 16 words
    vpbroadcastb xm1, [r0-1]    ; topleft
    pmovzxbw m1, xm1
    psubw  m0, m1               ; top[x] - topleft, precomputed once
    mov    iterationd, 4
    lea    stride3q, [strideq*3]
.loop:
    vpbroadcastb xm1, [dstq+strideq*1-1]
    vpbroadcastb xm2, [dstq+strideq*2-1]
    vpbroadcastb xm3, [dstq+stride3q-1]
    vpbroadcastb xm4, [dstq+strideq*4-1]
    pmovzxbw m1, xm1
    pmovzxbw m2, xm2
    pmovzxbw m3, xm3
    pmovzxbw m4, xm4
    paddw  m1, m0               ; left[y] + (top[x] - topleft)
    paddw  m2, m0
    paddw  m3, m0
    paddw  m4, m0
    vpackuswb m1, m1, m2
    vpackuswb m3, m3, m4
    vpermq m1, m1, q3120        ; undo packuswb's per-lane interleave
    vpermq m3, m3, q3120
    movdqa [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m1, 1
    movdqa [dstq+stride3q*1], xm3
    vextracti128 [dstq+strideq*4], m3, 1
    lea    dstq, [dstq+strideq*4]
    dec    iterationd
    jg .loop
    REP_RET
%endif

;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
;
; Plane prediction shared by H.264, RV40 and SVQ3 (%1 selects the rounding
; variant). Computes the H (horizontal) gradient with SIMD and the V
; (vertical) gradient with scalar byte loads; on x86-32, r0 is reused as a
; scratch register (e_reg) and restored from the stack argument afterwards.
;-----------------------------------------------------------------------------

%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov    r2, r1               ; +stride
    neg    r1                   ; -stride

    movh   m0, [r0+r1  -1]
%if cpuflag(ssse3)
    movhps m0, [r0+r1  +8]
    pmaddubsw m0, [plane_shuf]  ; H coefficients
%else ; sse2
    pxor   m2, m2
    movh   m1, [r0+r1  +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw  m0, m1
%endif
    movhlps m1, m0              ; horizontal add of the 8 word products
    paddw  m0, m1
    PSHUFLW m1, m0, 0xE
    paddw  m0, m1
    PSHUFLW m1, m0, 0x1
    paddw  m0, m1               ; sum of H coefficients

    lea    r4, [r0+r2*8-1]
    lea    r3, [r0+r2*4-1]
    add    r4, r2

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif

    ; V gradient: weighted differences of mirrored left-column pairs,
    ; accumulated in r5 (weights 1,2,3,...,8 built from lea chains)
    movzx  e_reg, byte [r3+r2*2 ]
    movzx  r5, byte [r4+r1      ]
    sub    r5, e_reg

    movzx  e_reg, byte [r3+r2   ]
    movzx  r6, byte [r4         ]
    sub    r6, e_reg
    lea    r5, [r5+r6*2]

    movzx  e_reg, byte [r3+r1   ]
    movzx  r6, byte [r4+r2*2    ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]

    movzx  e_reg, byte [r3      ]
%if ARCH_X86_64
    movzx  r7, byte [r4+r2      ]
    sub    r7, e_reg
%else
    movzx  r6, byte [r4+r2      ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]
    sub    r5, r6
%endif

    lea    e_reg, [r3+r1*4]
    lea    r3, [r4+r2*4]

    movzx  r4, byte [e_reg+r2   ]
    movzx  r6, byte [r3         ]
    sub    r6, r4
%if ARCH_X86_64
    lea    r6, [r7+r6*2]
    lea    r5, [r5+r6*2]
    add    r5, r6
%else
    lea    r5, [r5+r6*4]
    lea    r5, [r5+r6*2]
%endif

    movzx  r4, byte [e_reg      ]
%if ARCH_X86_64
    movzx  r7, byte [r3   +r2   ]
    sub    r7, r4
    sub    r5, r7
%else
    movzx  r6, byte [r3   +r2   ]
    sub    r6, r4
    lea    r5, [r5+r6*8]
    sub    r5, r6
%endif

    movzx  r4, byte [e_reg+r1   ]
    movzx  r6, byte [r3   +r2*2 ]
    sub    r6, r4
%if ARCH_X86_64
    add    r6, r7
%endif
    lea    r5, [r5+r6*8]

    movzx  r4, byte [e_reg+r2*2]
    movzx  r6, byte [r3   +r1  ]
    sub    r6, r4
    lea    r5, [r5+r6*4]
    add    r5, r6               ; sum of V coefficients

%if ARCH_X86_64 == 0
    mov    r0, r0m              ; r0 was clobbered as e_reg: reload src
%endif

    ; codec-specific scaling of the V gradient
%ifidn %1, h264
    lea    r5, [r5*5+32]
    sar    r5, 6
%elifidn %1, rv40
    lea    r5, [r5*5]
    sar    r5, 6
%elifidn %1, svq3
    test   r5, r5
    lea    r6, [r5+3]
    cmovs  r5, r6               ; round-toward-zero signed divide by 4
    sar    r5, 2                ; V/4
    lea    r5, [r5*5]           ; 5*(V/4)
    test   r5, r5
    lea    r6, [r5+15]
    cmovs  r5, r6
    sar    r5, 4                ; (5*(V/4))/16
%endif

    movzx  r4, byte [r0+r1  +15]
    movzx  r3, byte [r3+r2*2   ]
    lea    r3, [r3+r4+1]
    shl    r3, 4                ; 16*(top-right + bottom-left + 1)

    movd   r1d, m0
    movsx  r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
    lea    r1d, [r1d*5+32]
%else ; rv40
    lea    r1d, [r1d*5]
%endif
    sar    r1d, 6
%else ; svq3
    test   r1d, r1d
    lea    r4d, [r1d+3]
    cmovs  r1d, r4d
    sar    r1d, 2               ; H/4
    lea    r1d, [r1d*5]         ; 5*(H/4)
    test   r1d, r1d
    lea    r4d, [r1d+15]
    cmovs  r1d, r4d
    sar    r1d, 4               ; (5*(H/4))/16
%endif
    movd   m0, r1d

    add    r1d, r5d
    add    r3d, r1d
    shl    r1d, 3
    sub    r3d, r1d             ; a

    movd   m1, r5d
    movd   m3, r3d
    SPLATW m0, m0, 0            ; H
    SPLATW m1, m1, 0            ; V
    SPLATW m3, m3, 0            ; a
%ifidn %1, svq3
    SWAP 0, 1                   ; svq3 transposes the plane: swap H and V
%endif
    mova   m2, m0
    pmullw m0, [pw_0to7]        ; 0*H, 1*H, ..., 7*H  (words)
    psllw  m2, 3
    paddw  m0, m3               ; a + {0,1,2,3,4,5,6,7}*H
    paddw  m2, m0               ; a + {8,9,10,11,12,13,14,15}*H

    mov    r4, 8
.loop:                          ; two rows per iteration; m1 = V row step
    mova   m3, m0               ; b[0..7]
    mova   m4, m2               ; b[8..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova  [r0], m3
    paddw  m0, m1
    paddw  m2, m1

    mova   m3, m0               ; b[0..7]
    mova   m4, m2               ; b[8..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova  [r0+r2], m3
    paddw  m0, m1
    paddw  m2, m1

    lea    r0, [r0+r2*2]
    dec    r4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
;
; 8x8 plane prediction (chroma). Same structure as the 16x16 version:
; SIMD H gradient, scalar V gradient, then a per-row linear fill.
;-----------------------------------------------------------------------------

%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov    r2, r1               ; +stride
    neg    r1                   ; -stride

    movd   m0, [r0+r1  -1]
%if cpuflag(ssse3)
    movhps m0, [r0+r1  +4]      ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%else ; sse2
    pxor   m2, m2
    movd   m1, [r0+r1  +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%endif
    movhlps m1, m0
    paddw  m0, m1

%if notcpuflag(ssse3)
    PSHUFLW m1, m0, 0xE
    paddw  m0, m1
%endif ; !ssse3

    PSHUFLW m1, m0, 0x1
    paddw  m0, m1               ; sum of H coefficients

    lea    r4, [r0+r2*4-1]
    lea    r3, [r0     -1]
    add    r4, r2

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif

    ; V gradient from mirrored left-column pairs, accumulated in r5
    movzx  e_reg, byte [r3+r2*2 ]
    movzx  r5, byte [r4+r1      ]
    sub    r5, e_reg

    movzx  e_reg, byte [r3      ]
%if ARCH_X86_64
    movzx  r7, byte [r4+r2      ]
    sub    r7, e_reg
    sub    r5, r7
%else
    movzx  r6, byte [r4+r2      ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]
    sub    r5, r6
%endif

    movzx  e_reg, byte [r3+r1   ]
    movzx  r6, byte [r4+r2*2    ]
    sub    r6, e_reg
%if ARCH_X86_64
    add    r6, r7
%endif
    lea    r5, [r5+r6*4]

    movzx  e_reg, byte [r3+r2   ]
    movzx  r6, byte [r4         ]
    sub    r6, e_reg
    lea    r6, [r5+r6*2]

    lea    r5, [r6*9+16]        ; V = (17*sum + 16) >> 5, via 9x + 8x
    lea    r5, [r5+r6*8]
    sar    r5, 5

%if ARCH_X86_64 == 0
    mov    r0, r0m              ; r0 was clobbered as e_reg: reload src
%endif

    movzx  r3, byte [r4+r2*2  ]
    movzx  r4, byte [r0+r1  +7]
    lea    r3, [r3+r4+1]
    shl    r3, 4                ; 16*(top-right + bottom-left + 1)
    movd   r1d, m0
    movsx  r1d, r1w
    imul   r1d, 17              ; H = (17*sum + 16) >> 5
    add    r1d, 16
    sar    r1d, 5
    movd   m0, r1d
    add    r1d, r5d
    sub    r3d, r1d
    add    r1d, r1d
    sub    r3d, r1d             ; a

    movd   m1, r5d
    movd   m3, r3d
    SPLATW m0, m0, 0            ; H
    SPLATW m1, m1, 0            ; V
    SPLATW m3, m3, 0            ; a
    pmullw m0, [pw_0to7]        ; 0*H, 1*H, ..., 7*H  (words)
    paddw  m0, m3               ; a + {0,1,2,3,4,5,6,7}*H

    mov    r4, 4
ALIGN 16
.loop:                          ; two 8-pixel rows per iteration
    mova   m3, m0               ; b[0..7]
    paddw  m0, m1
    psraw  m3, 5
    mova   m4, m0               ; V+b[0..7]
    paddw  m0, m1
    psraw  m4, 5
    packuswb m3, m4
    movh  [r0], m3
    movhps [r0+r2], m3

    lea    r0, [r0+r2*2]
    dec    r4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE

;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
;
; Copies the 8-byte row above the block into all 8 rows.
;-----------------------------------------------------------------------------

INIT_MMX mmx
cglobal pred8x8_vertical_8, 2,2
    sub    r0, r1
    movq   mm0, [r0]            ; the 8 top neighbour pixels
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
;
; Fills each row with its left neighbour pixel; SPLATB_LOAD broadcasts one
; byte (using the pb_3 pshufb mask on ssse3).
;-----------------------------------------------------------------------------

%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
    mov    r2, 4
%if cpuflag(ssse3)
    mova   m2, [pb_3]
%endif
.loop:
    SPLATB_LOAD m0, r0+r1*0-1, m2
    SPLATB_LOAD m1, r0+r1*1-1, m2
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea    r0, [r0+r1*2]
    dec    r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
612;----------------------------------------------------------------------------- 613INIT_MMX mmxext 614cglobal pred8x8_top_dc_8, 2,5 615 sub r0, r1 616 movq mm0, [r0] 617 pxor mm1, mm1 618 pxor mm2, mm2 619 lea r2, [r0+r1*2] 620 punpckhbw mm1, mm0 621 punpcklbw mm0, mm2 622 psadbw mm1, mm2 ; s1 623 lea r3, [r2+r1*2] 624 psadbw mm0, mm2 ; s0 625 psrlw mm1, 1 626 psrlw mm0, 1 627 pavgw mm1, mm2 628 lea r4, [r3+r1*2] 629 pavgw mm0, mm2 630 pshufw mm1, mm1, 0 631 pshufw mm0, mm0, 0 ; dc0 (w) 632 packuswb mm0, mm1 ; dc0,dc1 (b) 633 movq [r0+r1*1], mm0 634 movq [r0+r1*2], mm0 635 lea r0, [r3+r1*2] 636 movq [r2+r1*1], mm0 637 movq [r2+r1*2], mm0 638 movq [r3+r1*1], mm0 639 movq [r3+r1*2], mm0 640 movq [r0+r1*1], mm0 641 movq [r0+r1*2], mm0 642 RET 643 644;----------------------------------------------------------------------------- 645; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) 646;----------------------------------------------------------------------------- 647 648INIT_MMX mmxext 649cglobal pred8x8_dc_8, 2,5 650 sub r0, r1 651 pxor m7, m7 652 movd m0, [r0+0] 653 movd m1, [r0+4] 654 psadbw m0, m7 ; s0 655 mov r4, r0 656 psadbw m1, m7 ; s1 657 658 movzx r2d, byte [r0+r1*1-1] 659 movzx r3d, byte [r0+r1*2-1] 660 lea r0, [r0+r1*2] 661 add r2d, r3d 662 movzx r3d, byte [r0+r1*1-1] 663 add r2d, r3d 664 movzx r3d, byte [r0+r1*2-1] 665 add r2d, r3d 666 lea r0, [r0+r1*2] 667 movd m2, r2d ; s2 668 movzx r2d, byte [r0+r1*1-1] 669 movzx r3d, byte [r0+r1*2-1] 670 lea r0, [r0+r1*2] 671 add r2d, r3d 672 movzx r3d, byte [r0+r1*1-1] 673 add r2d, r3d 674 movzx r3d, byte [r0+r1*2-1] 675 add r2d, r3d 676 movd m3, r2d ; s3 677 678 punpcklwd m0, m1 679 mov r0, r4 680 punpcklwd m2, m3 681 punpckldq m0, m2 ; s0, s1, s2, s3 682 pshufw m3, m0, 11110110b ; s2, s1, s3, s3 683 lea r2, [r0+r1*2] 684 pshufw m0, m0, 01110100b ; s0, s1, s3, s1 685 paddw m0, m3 686 lea r3, [r2+r1*2] 687 psrlw m0, 2 688 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 689 lea r4, [r3+r1*2] 690 packuswb m0, m0 691 
punpcklbw m0, m0 692 movq m1, m0 693 punpcklbw m0, m0 694 punpckhbw m1, m1 695 movq [r0+r1*1], m0 696 movq [r0+r1*2], m0 697 movq [r2+r1*1], m0 698 movq [r2+r1*2], m0 699 movq [r3+r1*1], m1 700 movq [r3+r1*2], m1 701 movq [r4+r1*1], m1 702 movq [r4+r1*2], m1 703 RET 704 705;----------------------------------------------------------------------------- 706; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride) 707;----------------------------------------------------------------------------- 708 709INIT_MMX mmxext 710cglobal pred8x8_dc_rv40_8, 2,7 711 mov r4, r0 712 sub r0, r1 713 pxor mm0, mm0 714 psadbw mm0, [r0] 715 dec r0 716 movzx r5d, byte [r0+r1*1] 717 movd r6d, mm0 718 lea r0, [r0+r1*2] 719%rep 3 720 movzx r2d, byte [r0+r1*0] 721 movzx r3d, byte [r0+r1*1] 722 add r5d, r2d 723 add r6d, r3d 724 lea r0, [r0+r1*2] 725%endrep 726 movzx r2d, byte [r0+r1*0] 727 add r5d, r6d 728 lea r2d, [r2+r5+8] 729 shr r2d, 4 730 movd mm0, r2d 731 punpcklbw mm0, mm0 732 pshufw mm0, mm0, 0 733 mov r3d, 4 734.loop: 735 movq [r4+r1*0], mm0 736 movq [r4+r1*1], mm0 737 lea r4, [r4+r1*2] 738 dec r3d 739 jg .loop 740 REP_RET 741 742;----------------------------------------------------------------------------- 743; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride) 744;----------------------------------------------------------------------------- 745 746INIT_XMM sse2 747cglobal pred8x8_tm_vp8_8, 2,6,4 748 sub r0, r1 749 pxor xmm1, xmm1 750 movq xmm0, [r0] 751 punpcklbw xmm0, xmm1 752 movzx r4d, byte [r0-1] 753 mov r5d, 4 754.loop: 755 movzx r2d, byte [r0+r1*1-1] 756 movzx r3d, byte [r0+r1*2-1] 757 sub r2d, r4d 758 sub r3d, r4d 759 movd xmm2, r2d 760 movd xmm3, r3d 761 pshuflw xmm2, xmm2, 0 762 pshuflw xmm3, xmm3, 0 763 punpcklqdq xmm2, xmm2 764 punpcklqdq xmm3, xmm3 765 paddw xmm2, xmm0 766 paddw xmm3, xmm0 767 packuswb xmm2, xmm3 768 movq [r0+r1*1], xmm2 769 movhps [r0+r1*2], xmm2 770 lea r0, [r0+r1*2] 771 dec r5d 772 jg .loop 773 REP_RET 774 775INIT_XMM ssse3 776cglobal 
pred8x8_tm_vp8_8, 2,3,6 777 sub r0, r1 778 movdqa xmm4, [tm_shuf] 779 pxor xmm1, xmm1 780 movq xmm0, [r0] 781 punpcklbw xmm0, xmm1 782 movd xmm5, [r0-4] 783 pshufb xmm5, xmm4 784 mov r2d, 4 785.loop: 786 movd xmm2, [r0+r1*1-4] 787 movd xmm3, [r0+r1*2-4] 788 pshufb xmm2, xmm4 789 pshufb xmm3, xmm4 790 psubw xmm2, xmm5 791 psubw xmm3, xmm5 792 paddw xmm2, xmm0 793 paddw xmm3, xmm0 794 packuswb xmm2, xmm3 795 movq [r0+r1*1], xmm2 796 movhps [r0+r1*2], xmm2 797 lea r0, [r0+r1*2] 798 dec r2d 799 jg .loop 800 REP_RET 801 802; dest, left, right, src, tmp 803; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 804%macro PRED4x4_LOWPASS 5 805 mova %5, %2 806 pavgb %2, %3 807 pxor %3, %5 808 mova %1, %4 809 pand %3, [pb_1] 810 psubusb %2, %3 811 pavgb %1, %2 812%endmacro 813 814;----------------------------------------------------------------------------- 815; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, 816; ptrdiff_t stride) 817;----------------------------------------------------------------------------- 818%macro PRED8x8L_TOP_DC 0 819cglobal pred8x8l_top_dc_8, 4,4 820 sub r0, r3 821 pxor mm7, mm7 822 movq mm0, [r0-8] 823 movq mm3, [r0] 824 movq mm1, [r0+8] 825 movq mm2, mm3 826 movq mm4, mm3 827 PALIGNR mm2, mm0, 7, mm0 828 PALIGNR mm1, mm4, 1, mm4 829 test r1d, r1d ; top_left 830 jz .fix_lt_2 831 test r2d, r2d ; top_right 832 jz .fix_tr_1 833 jmp .body 834.fix_lt_2: 835 movq mm5, mm3 836 pxor mm5, mm2 837 psllq mm5, 56 838 psrlq mm5, 56 839 pxor mm2, mm5 840 test r2d, r2d ; top_right 841 jnz .body 842.fix_tr_1: 843 movq mm5, mm3 844 pxor mm5, mm1 845 psrlq mm5, 56 846 psllq mm5, 56 847 pxor mm1, mm5 848.body: 849 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 850 psadbw mm7, mm0 851 paddw mm7, [pw_4] 852 psrlw mm7, 3 853 pshufw mm7, mm7, 0 854 packuswb mm7, mm7 855%rep 3 856 movq [r0+r3*1], mm7 857 movq [r0+r3*2], mm7 858 lea r0, [r0+r3*2] 859%endrep 860 movq [r0+r3*1], mm7 861 movq [r0+r3*2], mm7 862 RET 863%endmacro 864 865INIT_MMX mmxext 
PRED8x8L_TOP_DC
INIT_MMX ssse3
PRED8x8L_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
;                       ptrdiff_t stride)
;
; Luma 8x8 DC: both the top row and the left column are low-pass filtered
; (the left column is gathered into mm3 via punpckhbw/punpckhwd/punpckhdq),
; then dc = (sum_top + sum_left + 8) >> 4. The .fix_* paths patch edge bytes
; when the top-left/top-right neighbours are unavailable.
;-----------------------------------------------------------------------------

%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_8, 4,5
    sub    r0, r3
    lea    r4, [r0+r3*2]
    ; transpose the 8 left-column pixels into mm3
    movq   mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq   mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov    r4, r0
    punpckhwd mm1, mm0
    lea    r0, [r0+r3*4]
    movq   mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea    r0, [r0+r3*2]
    movq   mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea    r0, [r0+r3*2]
    movq   mm0, [r0+r3*0-8]
    movq   mm1, [r4]
    mov    r0, r4
    movq   mm4, mm3
    movq   mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test   r1d, r1d
    jnz .do_left
.fix_lt_1:                      ; no top-left: patch the left-column edge
    movq   mm5, mm3
    pxor   mm5, mm4
    psrlq  mm5, 56
    psllq  mm5, 48
    pxor   mm1, mm5
    jmp .do_left
.fix_lt_2:                      ; no top-left: patch top-row byte 0
    movq   mm5, mm3
    pxor   mm5, mm2
    psllq  mm5, 56
    psrlq  mm5, 56
    pxor   mm2, mm5
    test   r2d, r2d
    jnz .body
.fix_tr_1:                      ; no top-right: patch top-row byte 7
    movq   mm5, mm3
    pxor   mm5, mm1
    psrlq  mm5, 56
    psllq  mm5, 56
    pxor   mm1, mm5
    jmp .body
.do_left:
    movq   mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq   mm4, mm0
    movq   mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq  mm1, 56
    PALIGNR mm7, mm1, 7, mm3    ; mm7 = filtered left column
    movq   mm0, [r0-8]
    movq   mm3, [r0]
    movq   mm1, [r0+8]
    movq   mm2, mm3
    movq   mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test   r1d, r1d
    jz .fix_lt_2
    test   r2d, r2d
    jz .fix_tr_1
.body:
    lea    r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5  ; mm6 = filtered top row
    pxor   mm0, mm0
    pxor   mm1, mm1
    lea    r2, [r1+r3*2]
    psadbw mm0, mm7             ; sum of filtered left column
    psadbw mm1, mm6             ; sum of filtered top row
    paddw  mm0, [pw_8]
    paddw  mm0, mm1
    lea    r4, [r2+r3*2]
    psrlw  mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0           ; broadcast dc byte
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_DC
INIT_MMX ssse3
PRED8x8L_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
;                               int has_topright, ptrdiff_t stride)
;
; Luma 8x8 horizontal: the left column is gathered, low-pass filtered, then
; each filtered pixel is broadcast over its row with pshufw. When top-left is
; available (r1d != 0), the cmovnz makes the first punpckhbw read the row
; above instead of duplicating row 0.
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_8, 4,4
    sub    r0, r3
    lea    r2, [r0+r3*2]
    movq   mm0, [r0+r3*1-8]
    test   r1d, r1d
    lea    r1, [r0+r3]
    cmovnz r1, r0               ; r1 = row above if top-left exists
    punpckhbw mm0, [r1+r3*0-8]
    movq   mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov    r2, r0
    punpckhwd mm1, mm0
    lea    r0, [r0+r3*4]
    movq   mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea    r0, [r0+r3*2]
    movq   mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1          ; mm3 = the 8 left-column pixels
    lea    r0, [r0+r3*2]
    movq   mm0, [r0+r3*0-8]
    movq   mm1, [r1+r3*0-8]
    mov    r0, r2
    movq   mm4, mm3
    movq   mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    movq   mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq   mm4, mm0
    movq   mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq  mm1, 56
    PALIGNR mm7, mm1, 7, mm3    ; mm7 = filtered left column
    movq   mm3, mm7
    lea    r1, [r0+r3*2]
    movq   mm7, mm3
    punpckhbw mm3, mm3          ; duplicate bytes so pshufw can broadcast them
    punpcklbw mm7, mm7
    pshufw mm0, mm3, 0xff       ; row 0 pixel broadcast
    pshufw mm1, mm3, 0xaa
    lea    r2, [r1+r3*2]
    pshufw mm2, mm3, 0x55
    pshufw mm3, mm3, 0x00
    pshufw mm4, mm7, 0xff
    pshufw mm5, mm7, 0xaa
    pshufw mm6, mm7, 0x55
    pshufw mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea    r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
PRED8x8L_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
;                             ptrdiff_t stride)
;
; Luma 8x8 vertical: the top row is low-pass filtered (with the usual edge
; fixes for missing top-left/top-right) and copied into all 8 rows.
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_8, 4,4
    sub    r0, r3
    movq   mm0, [r0-8]
    movq   mm3, [r0]
    movq   mm1, [r0+8]
    movq   mm2, mm3
    movq   mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test   r1d, r1d             ; top_left
    jz .fix_lt_2
    test   r2d, r2d             ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq   mm5, mm3
    pxor   mm5, mm2
    psllq  mm5, 56
    psrlq  mm5, 56
    pxor   mm2, mm5
    test   r2d, r2d             ; top_right
    jnz .body
.fix_tr_1:
    movq   mm5, mm3
    pxor   mm5, mm1
    psrlq  mm5, 56
    psllq  mm5, 56
    pxor   mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
;                              int has_topright, ptrdiff_t stride)
;
; Down-left diagonal: the filtered top + top-right rows are assembled into
; one xmm register, low-pass filtered again along the diagonal, and each
; output row is a 1-byte right shift (psrldq) of the previous one.
; If top-right is missing, .fix_tr_2 replicates top[7] instead.
; Mixed MMX/SSE2: the INIT_XMM cpuname mid-macro switches the LOWPASS
; expansion to xmm registers.
;-----------------------------------------------------------------------------

%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8, 4,4
    sub    r0, r3
    movq   mm0, [r0-8]
    movq   mm3, [r0]
    movq   mm1, [r0+8]
    movq   mm2, mm3
    movq   mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test   r1d, r1d             ; top_left
    jz .fix_lt_2
    test   r2d, r2d             ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq   mm5, mm3
    pxor   mm5, mm2
    psllq  mm5, 56
    psrlq  mm5, 56
    pxor   mm2, mm5
    test   r2d, r2d             ; top_right
    jnz .do_top
.fix_tr_1:
    movq   mm5, mm3
    pxor   mm5, mm1
    psrlq  mm5, 56
    psllq  mm5, 56
    pxor   mm1, mm5
    jmp .do_top
.fix_tr_2:                      ; no top-right: replicate top[7]
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm3, mm4
    test   r2d, r2d             ; top_right
    jz .fix_tr_2
    movq   mm0, [r0+8]
    movq   mm5, mm0
    movq   mm2, mm0
    movq   mm4, mm0
    psrlq  mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm4, mm1
    psrlq  mm1, 56
    movq2dq xmm5, mm1
    lea    r1, [r0+r3*2]
    pslldq xmm4, 8
    por    xmm3, xmm4           ; 16 filtered top+topright pixels
    movdqa xmm2, xmm3
    psrldq xmm2, 1
    pslldq xmm5, 15
    por    xmm2, xmm5
    lea    r2, [r1+r3*2]
    movdqa xmm1, xmm3
    pslldq xmm1, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq xmm0, 1
    lea    r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft,
;                               int has_topright, ptrdiff_t stride)
;
; Down-right diagonal: filtered left column + top-left + top row are packed
; into xmm registers, low-pass filtered along the diagonal, and rows are
; emitted bottom-up with 2-byte shifts. Mixed MMX/SSE2 as in down-left.
;-----------------------------------------------------------------------------

%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
    sub    r0, r3
    lea    r4, [r0+r3*2]
    ; transpose the 8 left-column pixels into mm3
    movq   mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq   mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov    r4, r0
    punpckhwd mm1, mm0
    lea    r0, [r0+r3*4]
    movq   mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea    r0, [r0+r3*2]
    movq   mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea    r0, [r0+r3*2]
    movq   mm0, [r0+r3*0-8]
    movq   mm1, [r4]
    mov    r0, r4
    movq   mm4, mm3
    movq   mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test   r1d, r1d
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq   mm5, mm3
    pxor   mm5, mm4
    psrlq  mm5, 56
    psllq  mm5, 48
    pxor   mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq   mm5, mm3
    pxor   mm5, mm2
    psllq  mm5, 56
    psrlq  mm5, 56
    pxor   mm2, mm5
    test   r2d, r2d
    jnz .do_top
.fix_tr_1:
    movq   mm5, mm3
    pxor   mm5, mm1
    psrlq  mm5, 56
    psllq  mm5, 56
    pxor   mm1, mm5
    jmp .do_top
.do_left:
    movq   mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq   mm4, mm0
    movq   mm7, mm2
    movq2dq xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq  mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq2dq xmm1, mm7           ; filtered left column
    movq   mm0, [r0-8]
    movq   mm3, [r0]
    movq   mm1, [r0+8]
    movq   mm2, mm3
    movq   mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test   r1d, r1d
    jz .fix_lt_2
    test   r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4           ; filtered top row
    lea    r1, [r0+r3*2]
    movdqa xmm0, xmm3
    pslldq xmm4, 8
    por    xmm3, xmm4
    lea    r2, [r1+r3*2]
    pslldq xmm4, 1
    por    xmm1, xmm4
    psrldq xmm0, 7              ; isolate the top-left corner byte
    pslldq xmm0, 15
    psrldq xmm0, 7
    por    xmm1, xmm0
    lea    r0, [r2+r3*2]
    movdqa xmm2, xmm3
    psrldq xmm2, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    movq [r0+r3*2], xmm0        ; rows stored bottom-up, shifting 2 each pair
    movq [r0+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
;
; Vertical-right diagonal. Even rows come from a pavgb of adjacent filtered
; pixels, odd rows from the 3-tap low-pass; pw_ff00/pandn interleave the two
; result streams. Mixed MMX/SSE2 as in the other pred8x8l diagonals.
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM 7
    sub    r0, r3
    lea    r4, [r0+r3*2]
    ; transpose the 8 left-column pixels into mm3
    movq   mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq   mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov    r4, r0
    punpckhwd mm1, mm0
    lea    r0, [r0+r3*4]
    movq   mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea    r0, [r0+r3*2]
    movq   mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea    r0, [r0+r3*2]
    movq   mm0, [r0+r3*0-8]
    movq   mm1, [r4]
    mov    r0, r4
    movq   mm4, mm3
    movq   mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test   r1d, r1d
    jnz .do_left
.fix_lt_1:
    movq   mm5, mm3
    pxor   mm5, mm4
    psrlq  mm5, 56
    psllq  mm5, 48
    pxor   mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq   mm5, mm3
    pxor   mm5, mm2
    psllq  mm5, 56
    psrlq  mm5, 56
    pxor   mm2, mm5
    test   r2d, r2d
    jnz .do_top
.fix_tr_1:
    movq   mm5, mm3
    pxor   mm5, mm1
    psrlq  mm5, 56
    psllq  mm5, 56
    pxor   mm1, mm5
    jmp .do_top
.do_left:
    movq   mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq xmm0, mm2           ; filtered left column
    movq   mm0, [r0-8]
    movq   mm3, [r0]
    movq   mm1, [r0+8]
    movq   mm2, mm3
    movq   mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test   r1d, r1d
    jz .fix_lt_2
    test   r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea    r1, [r0+r3*2]
    movq2dq xmm4, mm6           ; filtered top row
    pslldq xmm4, 8
    por    xmm0, xmm4           ; left column | top row
    movdqa xmm6, [pw_ff00]
    movdqa xmm1, xmm0
    lea    r2, [r1+r3*2]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pslldq xmm0, 1
    pslldq xmm1, 2
    pavgb  xmm2, xmm0           ; even-row 2-tap averages
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn  xmm6, xmm4
    movdqa xmm5, xmm4
    psrlw  xmm4, 8
    packuswb xmm6, xmm4
    movhlps xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq xmm5, 4
    movss  xmm5, xmm6
    psrldq xmm2, 4
    movss  xmm2, xmm4
    lea    r0, [r2+r3*2]
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r0+r3*2], xmm5
    movq [r0+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r2+r3*2], xmm5
    movq [r2+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r1+r3*2], xmm5
    movq [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
PRED8x8L_VERTICAL_RIGHT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
;                                  int has_topright, ptrdiff_t stride)
;
; Vertical-left diagonal: filtered top + top-right assembled into one xmm
; register; even rows are a pavgb of neighbours, odd rows the 3-tap low-pass,
; each pair shifted one byte further. .fix_tr_2 replicates top[7] when the
; top-right block is unavailable.
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL_LEFT 0
cglobal pred8x8l_vertical_left_8, 4,4
    sub    r0, r3
    movq   mm0, [r0-8]
    movq   mm3, [r0]
    movq   mm1, [r0+8]
    movq   mm2, mm3
    movq   mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test   r1d, r1d
    jz .fix_lt_2
    test   r2d, r2d
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq   mm5, mm3
    pxor   mm5, mm2
    psllq  mm5, 56
    psrlq  mm5, 56
    pxor   mm2, mm5
    test   r2d, r2d
    jnz .do_top
.fix_tr_1:
    movq   mm5, mm3
    pxor   mm5, mm1
    psrlq  mm5, 56
    psllq  mm5, 56
    pxor   mm1, mm5
    jmp .do_top
.fix_tr_2:                      ; no top-right: replicate top[7]
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    test   r2d, r2d
    jz .fix_tr_2
    movq   mm0, [r0+8]
    movq   mm5, mm0
    movq   mm2, mm0
    movq   mm4, mm0
    psrlq  mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm3, mm1
    lea    r1, [r0+r3*2]
    pslldq xmm3, 8
    por    xmm4, xmm3           ; 16 filtered top+topright pixels
    movdqa xmm2, xmm4
    movdqa xmm1, xmm4
    movdqa xmm3, xmm4
    psrldq xmm2, 1
    pslldq xmm1, 1
    pavgb  xmm3, xmm2           ; even-row 2-tap averages
    lea    r2, [r1+r3*2]
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea    r0, [r2+r3*2]
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
;                                  int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

; NOTE(review): this definition continues beyond the visible chunk boundary;
; only the fragment below is in view.
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
    sub    r0, r3
    lea
r2, [r0+r3*2] 1530 movq mm0, [r0+r3*1-8] 1531 test r1d, r1d 1532 lea r1, [r0+r3] 1533 cmovnz r1, r0 1534 punpckhbw mm0, [r1+r3*0-8] 1535 movq mm1, [r2+r3*1-8] 1536 punpckhbw mm1, [r0+r3*2-8] 1537 mov r2, r0 1538 punpckhwd mm1, mm0 1539 lea r0, [r0+r3*4] 1540 movq mm2, [r0+r3*1-8] 1541 punpckhbw mm2, [r0+r3*0-8] 1542 lea r0, [r0+r3*2] 1543 movq mm3, [r0+r3*1-8] 1544 punpckhbw mm3, [r0+r3*0-8] 1545 punpckhwd mm3, mm2 1546 punpckhdq mm3, mm1 1547 lea r0, [r0+r3*2] 1548 movq mm0, [r0+r3*0-8] 1549 movq mm1, [r1+r3*0-8] 1550 mov r0, r2 1551 movq mm4, mm3 1552 movq mm2, mm3 1553 PALIGNR mm4, mm0, 7, mm0 1554 PALIGNR mm1, mm2, 1, mm2 1555 movq mm0, mm4 1556 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1557 movq mm4, mm0 1558 movq mm7, mm2 1559 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1560 psllq mm1, 56 1561 PALIGNR mm7, mm1, 7, mm3 1562 lea r1, [r0+r3*2] 1563 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 1564 psllq mm7, 56 ; l7 .. .. .. .. .. .. .. 1565 movq mm2, mm0 1566 psllw mm0, 8 1567 psrlw mm2, 8 1568 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 1569 movq mm3, mm2 1570 movq mm4, mm2 1571 movq mm5, mm2 1572 psrlq mm2, 8 1573 psrlq mm3, 16 1574 lea r2, [r1+r3*2] 1575 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1 1576 punpckhbw mm7, mm7 1577 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2 1578 pavgb mm4, mm2 1579 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 1580 movq mm5, mm4 1581 punpcklbw mm4, mm1 ; p4 p3 p2 p1 1582 punpckhbw mm5, mm1 ; p8 p7 p6 p5 1583 movq mm6, mm5 1584 movq mm7, mm5 1585 movq mm0, mm5 1586 PALIGNR mm5, mm4, 2, mm1 1587 pshufw mm1, mm6, 11111001b 1588 PALIGNR mm6, mm4, 4, mm2 1589 pshufw mm2, mm7, 11111110b 1590 PALIGNR mm7, mm4, 6, mm3 1591 pshufw mm3, mm0, 11111111b 1592 movq [r0+r3*1], mm4 1593 movq [r0+r3*2], mm5 1594 lea r0, [r2+r3*2] 1595 movq [r1+r3*1], mm6 1596 movq [r1+r3*2], mm7 1597 movq [r2+r3*1], mm0 1598 movq [r2+r3*2], mm1 1599 movq [r0+r3*1], mm2 1600 movq [r0+r3*2], mm3 1601 RET 1602%endmacro 1603 1604INIT_MMX mmxext 1605PRED8x8L_HORIZONTAL_UP 1606INIT_MMX 
ssse3 1607PRED8x8L_HORIZONTAL_UP 1608 1609;----------------------------------------------------------------------------- 1610; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, 1611; int has_topright, ptrdiff_t stride) 1612;----------------------------------------------------------------------------- 1613 1614%macro PRED8x8L_HORIZONTAL_DOWN 0 1615cglobal pred8x8l_horizontal_down_8, 4,5 1616 sub r0, r3 1617 lea r4, [r0+r3*2] 1618 movq mm0, [r0+r3*1-8] 1619 punpckhbw mm0, [r0+r3*0-8] 1620 movq mm1, [r4+r3*1-8] 1621 punpckhbw mm1, [r0+r3*2-8] 1622 mov r4, r0 1623 punpckhwd mm1, mm0 1624 lea r0, [r0+r3*4] 1625 movq mm2, [r0+r3*1-8] 1626 punpckhbw mm2, [r0+r3*0-8] 1627 lea r0, [r0+r3*2] 1628 movq mm3, [r0+r3*1-8] 1629 punpckhbw mm3, [r0+r3*0-8] 1630 punpckhwd mm3, mm2 1631 punpckhdq mm3, mm1 1632 lea r0, [r0+r3*2] 1633 movq mm0, [r0+r3*0-8] 1634 movq mm1, [r4] 1635 mov r0, r4 1636 movq mm4, mm3 1637 movq mm2, mm3 1638 PALIGNR mm4, mm0, 7, mm0 1639 PALIGNR mm1, mm2, 1, mm2 1640 test r1d, r1d 1641 jnz .do_left 1642.fix_lt_1: 1643 movq mm5, mm3 1644 pxor mm5, mm4 1645 psrlq mm5, 56 1646 psllq mm5, 48 1647 pxor mm1, mm5 1648 jmp .do_left 1649.fix_lt_2: 1650 movq mm5, mm3 1651 pxor mm5, mm2 1652 psllq mm5, 56 1653 psrlq mm5, 56 1654 pxor mm2, mm5 1655 test r2d, r2d 1656 jnz .do_top 1657.fix_tr_1: 1658 movq mm5, mm3 1659 pxor mm5, mm1 1660 psrlq mm5, 56 1661 psllq mm5, 56 1662 pxor mm1, mm5 1663 jmp .do_top 1664.fix_tr_2: 1665 punpckhbw mm3, mm3 1666 pshufw mm1, mm3, 0xFF 1667 jmp .do_topright 1668.do_left: 1669 movq mm0, mm4 1670 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1671 movq2dq xmm0, mm2 1672 pslldq xmm0, 8 1673 movq mm4, mm0 1674 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1675 movq2dq xmm2, mm1 1676 pslldq xmm2, 15 1677 psrldq xmm2, 8 1678 por xmm0, xmm2 1679 movq mm0, [r0-8] 1680 movq mm3, [r0] 1681 movq mm1, [r0+8] 1682 movq mm2, mm3 1683 movq mm4, mm3 1684 PALIGNR mm2, mm0, 7, mm0 1685 PALIGNR mm1, mm4, 1, mm4 1686 test r1d, r1d 1687 jz .fix_lt_2 1688 
test r2d, r2d 1689 jz .fix_tr_1 1690.do_top: 1691 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1692 movq2dq xmm1, mm4 1693 test r2d, r2d 1694 jz .fix_tr_2 1695 movq mm0, [r0+8] 1696 movq mm5, mm0 1697 movq mm2, mm0 1698 movq mm4, mm0 1699 psrlq mm5, 56 1700 PALIGNR mm2, mm3, 7, mm3 1701 PALIGNR mm5, mm4, 1, mm4 1702 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 1703.do_topright: 1704 movq2dq xmm5, mm1 1705 pslldq xmm5, 8 1706 por xmm1, xmm5 1707INIT_XMM cpuname 1708 lea r2, [r4+r3*2] 1709 movdqa xmm2, xmm1 1710 movdqa xmm3, xmm1 1711 PALIGNR xmm1, xmm0, 7, xmm4 1712 PALIGNR xmm2, xmm0, 9, xmm5 1713 lea r1, [r2+r3*2] 1714 PALIGNR xmm3, xmm0, 8, xmm0 1715 movdqa xmm4, xmm1 1716 pavgb xmm4, xmm3 1717 lea r0, [r1+r3*2] 1718 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 1719 punpcklbw xmm4, xmm0 1720 movhlps xmm0, xmm4 1721 movq [r0+r3*2], xmm4 1722 movq [r2+r3*2], xmm0 1723 psrldq xmm4, 2 1724 psrldq xmm0, 2 1725 movq [r0+r3*1], xmm4 1726 movq [r2+r3*1], xmm0 1727 psrldq xmm4, 2 1728 psrldq xmm0, 2 1729 movq [r1+r3*2], xmm4 1730 movq [r4+r3*2], xmm0 1731 psrldq xmm4, 2 1732 psrldq xmm0, 2 1733 movq [r1+r3*1], xmm4 1734 movq [r4+r3*1], xmm0 1735 RET 1736%endmacro 1737 1738INIT_MMX sse2 1739PRED8x8L_HORIZONTAL_DOWN 1740INIT_MMX ssse3 1741PRED8x8L_HORIZONTAL_DOWN 1742 1743;------------------------------------------------------------------------------- 1744; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, 1745; ptrdiff_t stride) 1746;------------------------------------------------------------------------------- 1747 1748INIT_MMX mmxext 1749cglobal pred4x4_dc_8, 3,5 1750 pxor mm7, mm7 1751 mov r4, r0 1752 sub r0, r2 1753 movd mm0, [r0] 1754 psadbw mm0, mm7 1755 movzx r1d, byte [r0+r2*1-1] 1756 movd r3d, mm0 1757 add r3d, r1d 1758 movzx r1d, byte [r0+r2*2-1] 1759 lea r0, [r0+r2*2] 1760 add r3d, r1d 1761 movzx r1d, byte [r0+r2*1-1] 1762 add r3d, r1d 1763 movzx r1d, byte [r0+r2*2-1] 1764 add r3d, r1d 1765 add r3d, 4 1766 shr r3d, 3 1767 imul r3d, 0x01010101 1768 mov 
[r4+r2*0], r3d 1769 mov [r0+r2*0], r3d 1770 mov [r0+r2*1], r3d 1771 mov [r0+r2*2], r3d 1772 RET 1773 1774;----------------------------------------------------------------------------- 1775; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, 1776; ptrdiff_t stride) 1777;----------------------------------------------------------------------------- 1778 1779INIT_MMX mmxext 1780cglobal pred4x4_tm_vp8_8, 3,6 1781 sub r0, r2 1782 pxor mm7, mm7 1783 movd mm0, [r0] 1784 punpcklbw mm0, mm7 1785 movzx r4d, byte [r0-1] 1786 mov r5d, 2 1787.loop: 1788 movzx r1d, byte [r0+r2*1-1] 1789 movzx r3d, byte [r0+r2*2-1] 1790 sub r1d, r4d 1791 sub r3d, r4d 1792 movd mm2, r1d 1793 movd mm4, r3d 1794 pshufw mm2, mm2, 0 1795 pshufw mm4, mm4, 0 1796 paddw mm2, mm0 1797 paddw mm4, mm0 1798 packuswb mm2, mm2 1799 packuswb mm4, mm4 1800 movd [r0+r2*1], mm2 1801 movd [r0+r2*2], mm4 1802 lea r0, [r0+r2*2] 1803 dec r5d 1804 jg .loop 1805 REP_RET 1806 1807INIT_XMM ssse3 1808cglobal pred4x4_tm_vp8_8, 3,3 1809 sub r0, r2 1810 movq mm6, [tm_shuf] 1811 pxor mm1, mm1 1812 movd mm0, [r0] 1813 punpcklbw mm0, mm1 1814 movd mm7, [r0-4] 1815 pshufb mm7, mm6 1816 lea r1, [r0+r2*2] 1817 movd mm2, [r0+r2*1-4] 1818 movd mm3, [r0+r2*2-4] 1819 movd mm4, [r1+r2*1-4] 1820 movd mm5, [r1+r2*2-4] 1821 pshufb mm2, mm6 1822 pshufb mm3, mm6 1823 pshufb mm4, mm6 1824 pshufb mm5, mm6 1825 psubw mm0, mm7 1826 paddw mm2, mm0 1827 paddw mm3, mm0 1828 paddw mm4, mm0 1829 paddw mm5, mm0 1830 packuswb mm2, mm2 1831 packuswb mm3, mm3 1832 packuswb mm4, mm4 1833 packuswb mm5, mm5 1834 movd [r0+r2*1], mm2 1835 movd [r0+r2*2], mm3 1836 movd [r1+r2*1], mm4 1837 movd [r1+r2*2], mm5 1838 RET 1839 1840;----------------------------------------------------------------------------- 1841; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, 1842; ptrdiff_t stride) 1843;----------------------------------------------------------------------------- 1844 1845INIT_MMX mmxext 1846cglobal 
pred4x4_vertical_vp8_8, 3,3 1847 sub r0, r2 1848 movd m1, [r0-1] 1849 movd m0, [r0] 1850 mova m2, m0 ;t0 t1 t2 t3 1851 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 1852 lea r1, [r0+r2*2] 1853 psrlq m0, 8 ;t1 t2 t3 t4 1854 PRED4x4_LOWPASS m3, m1, m0, m2, m4 1855 movd [r0+r2*1], m3 1856 movd [r0+r2*2], m3 1857 movd [r1+r2*1], m3 1858 movd [r1+r2*2], m3 1859 RET 1860 1861;----------------------------------------------------------------------------- 1862; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, 1863; ptrdiff_t stride) 1864;----------------------------------------------------------------------------- 1865INIT_MMX mmxext 1866cglobal pred4x4_down_left_8, 3,3 1867 sub r0, r2 1868 movq m1, [r0] 1869 punpckldq m1, [r1] 1870 movq m2, m1 1871 movq m3, m1 1872 psllq m1, 8 1873 pxor m2, m1 1874 psrlq m2, 8 1875 pxor m2, m3 1876 PRED4x4_LOWPASS m0, m1, m2, m3, m4 1877 lea r1, [r0+r2*2] 1878 psrlq m0, 8 1879 movd [r0+r2*1], m0 1880 psrlq m0, 8 1881 movd [r0+r2*2], m0 1882 psrlq m0, 8 1883 movd [r1+r2*1], m0 1884 psrlq m0, 8 1885 movd [r1+r2*2], m0 1886 RET 1887 1888;------------------------------------------------------------------------------ 1889; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, 1890; ptrdiff_t stride) 1891;------------------------------------------------------------------------------ 1892 1893INIT_MMX mmxext 1894cglobal pred4x4_vertical_left_8, 3,3 1895 sub r0, r2 1896 movq m1, [r0] 1897 punpckldq m1, [r1] 1898 movq m3, m1 1899 movq m2, m1 1900 psrlq m3, 8 1901 psrlq m2, 16 1902 movq m4, m3 1903 pavgb m4, m1 1904 PRED4x4_LOWPASS m0, m1, m2, m3, m5 1905 lea r1, [r0+r2*2] 1906 movh [r0+r2*1], m4 1907 movh [r0+r2*2], m0 1908 psrlq m4, 8 1909 psrlq m0, 8 1910 movh [r1+r2*1], m4 1911 movh [r1+r2*2], m0 1912 RET 1913 1914;------------------------------------------------------------------------------ 1915; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, 1916; ptrdiff_t stride) 
1917;------------------------------------------------------------------------------ 1918 1919INIT_MMX mmxext 1920cglobal pred4x4_horizontal_up_8, 3,3 1921 sub r0, r2 1922 lea r1, [r0+r2*2] 1923 movd m0, [r0+r2*1-4] 1924 punpcklbw m0, [r0+r2*2-4] 1925 movd m1, [r1+r2*1-4] 1926 punpcklbw m1, [r1+r2*2-4] 1927 punpckhwd m0, m1 1928 movq m1, m0 1929 punpckhbw m1, m1 1930 pshufw m1, m1, 0xFF 1931 punpckhdq m0, m1 1932 movq m2, m0 1933 movq m3, m0 1934 movq m7, m0 1935 psrlq m2, 16 1936 psrlq m3, 8 1937 pavgb m7, m3 1938 PRED4x4_LOWPASS m4, m0, m2, m3, m5 1939 punpcklbw m7, m4 1940 movd [r0+r2*1], m7 1941 psrlq m7, 16 1942 movd [r0+r2*2], m7 1943 psrlq m7, 16 1944 movd [r1+r2*1], m7 1945 movd [r1+r2*2], m1 1946 RET 1947 1948;------------------------------------------------------------------------------ 1949; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src, 1950; const uint8_t *topright, 1951; ptrdiff_t stride) 1952;------------------------------------------------------------------------------ 1953 1954INIT_MMX mmxext 1955cglobal pred4x4_horizontal_down_8, 3,3 1956 sub r0, r2 1957 lea r1, [r0+r2*2] 1958 movh m0, [r0-4] ; lt .. 1959 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. .. 1960 psllq m0, 8 ; t2 t1 t0 lt .. .. .. .. 1961 movd m1, [r1+r2*2-4] ; l3 1962 punpcklbw m1, [r1+r2*1-4] ; l2 l3 1963 movd m2, [r0+r2*2-4] ; l1 1964 punpcklbw m2, [r0+r2*1-4] ; l0 l1 1965 punpckhwd m1, m2 ; l0 l1 l2 l3 1966 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 1967 movq m0, m1 1968 movq m2, m1 1969 movq m5, m1 1970 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 1971 psrlq m2, 8 ; .. 
t2 t1 t0 lt l0 l1 l2 1972 pavgb m5, m2 1973 PRED4x4_LOWPASS m3, m1, m0, m2, m4 1974 punpcklbw m5, m3 1975 psrlq m3, 32 1976 PALIGNR m3, m5, 6, m4 1977 movh [r1+r2*2], m5 1978 psrlq m5, 16 1979 movh [r1+r2*1], m5 1980 psrlq m5, 16 1981 movh [r0+r2*2], m5 1982 movh [r0+r2*1], m3 1983 RET 1984 1985;----------------------------------------------------------------------------- 1986; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src, 1987; const uint8_t *topright, 1988; ptrdiff_t stride) 1989;----------------------------------------------------------------------------- 1990 1991INIT_MMX mmxext 1992cglobal pred4x4_vertical_right_8, 3,3 1993 sub r0, r2 1994 lea r1, [r0+r2*2] 1995 movh m0, [r0] ; ........t3t2t1t0 1996 movq m5, m0 1997 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt 1998 pavgb m5, m0 1999 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0 2000 movq m1, m0 2001 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 2002 movq m2, m0 2003 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 2004 PRED4x4_LOWPASS m3, m1, m0, m2, m4 2005 movq m1, m3 2006 psrlq m3, 16 2007 psllq m1, 48 2008 movh [r0+r2*1], m5 2009 movh [r0+r2*2], m3 2010 PALIGNR m5, m1, 7, m2 2011 psllq m1, 8 2012 movh [r1+r2*1], m5 2013 PALIGNR m3, m1, 7, m1 2014 movh [r1+r2*2], m3 2015 RET 2016 2017;----------------------------------------------------------------------------- 2018; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, 2019; ptrdiff_t stride) 2020;----------------------------------------------------------------------------- 2021 2022INIT_MMX mmxext 2023cglobal pred4x4_down_right_8, 3,3 2024 sub r0, r2 2025 lea r1, [r0+r2*2] 2026 movq m1, [r1-8] 2027 movq m2, [r0+r2*1-8] 2028 punpckhbw m2, [r0-8] 2029 movh m3, [r0] 2030 punpckhwd m1, m2 2031 PALIGNR m3, m1, 5, m1 2032 movq m1, m3 2033 PALIGNR m3, [r1+r2*1-8], 7, m4 2034 movq m2, m3 2035 PALIGNR m3, [r1+r2*2-8], 7, m4 2036 PRED4x4_LOWPASS m0, m3, m1, m2, m4 2037 movh [r1+r2*2], m0 2038 psrlq m0, 8 2039 movh [r1+r2*1], m0 
2040 psrlq m0, 8 2041 movh [r0+r2*2], m0 2042 psrlq m0, 8 2043 movh [r0+r2*1], m0 2044 RET 2045