;******************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;******************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text


%macro op_avgh 3
    movh      %3, %2
    pavgb     %1, %3
    movh      %2, %1
%endmacro

%macro op_avg 2-3
    pavgb     %1, %2
    mova      %2, %1
%endmacro

%macro op_puth 2-3
    movh      %2, %1
%endmacro

%macro op_put 2-3
    mova      %2, %1
%endmacro

%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor      m7, m7
    mova      m4, [pw_5]
    mova      m5, [pw_16]
    mov       r4d, 4
.loop:
    movh      m1, [r1-1]
    movh      m2, [r1+0]
    movh      m3, [r1+1]
    movh      m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw     m1, m0
    paddw     m2, m3
    movh      m0, [r1-2]
    movh      m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw     m0, m3
    psllw     m2, 2
    psubw     m2, m1
    pmullw    m2, m4
    paddw     m0, m5
    paddw     m0, m2
    psraw     m0, 5
    packuswb  m0, m0
    op_%1h    m0, [r0], m6
    add       r0, r2
    add       r1, r3
    dec       r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg

%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov       r4d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    mova      m0, [r1]
    mova      m2, [r1+1]
    mova      m1, m0
    mova      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw     m0, m2
    paddw     m1, m3
    psllw     m0, 2
    psllw     m1, 2
    mova      m2, [r1-1]
    mova      m4, [r1+2]
    mova      m3, m2
    mova      m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw     m2, m4
    paddw     m5, m3
    psubw     m0, m2
    psubw     m1, m5
    pmullw    m0, m6
    pmullw    m1, m6
    movd      m2, [r1-2]
    movd      m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw     m2, m3
    paddw     m4, m5
    mova      m5, [pw_16]
    paddw     m2, m5
    paddw     m4, m5
    paddw     m0, m2
    paddw     m1, m4
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m1
    op_%1     m0, [r0], m4
    add       r0, r2
    add       r1, r3
    dec       r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg

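; Scalar reference for all h_lowpass kernels in this file (a sketch of the
; H.264 6-tap half-pel filter they implement):
;
;   dst[x] = clip_uint8((src[x-2] + src[x+3]
;                        - 5*(src[x-1] + src[x+2])
;                       + 20*(src[x]   + src[x+1]) + 16) >> 5)
;
; The MMX versions above gather the six taps with separate loads; the SSSE3
; version below does a single unaligned 16-byte load instead and builds the
; five shifted tap rows with palignr.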
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov       r4d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    movu      m1, [r1-2]
    mova      m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m4, m0, 2
    palignr   m3, m0, 4
    palignr   m2, m0, 6
    palignr   m1, m0, 8
    palignr   m5, m0, 10
    paddw     m0, m5
    paddw     m2, m3
    paddw     m1, m4
    psllw     m2, 2
    psubw     m2, m1
    paddw     m0, [pw_16]
    pmullw    m2, m6
    paddw     m2, m0
    psraw     m2, 5
    packuswb  m2, m2
    op_%1h    m2, [r0], m4
    add       r1, r3
    add       r0, r2
    dec       r4d
    jne .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg


%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pxor      m7, m7
    mova      m4, [pw_5]
    mova      m5, [pw_16]
    mov       r5d, 4
.loop:
    movh      m1, [r1-1]
    movh      m2, [r1+0]
    movh      m3, [r1+1]
    movh      m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw     m1, m0
    paddw     m2, m3
    movh      m0, [r1-2]
    movh      m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw     m0, m3
    psllw     m2, 2
    psubw     m2, m1
    pmullw    m2, m4
    paddw     m0, m5
    paddw     m0, m2
    movh      m3, [r2]
    psraw     m0, 5
    packuswb  m0, m0
    pavgb     m0, m3
    op_%1h    m0, [r0], m6
    add       r0, r3
    add       r1, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg


%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov       r5d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    mova      m0, [r1]
    mova      m2, [r1+1]
    mova      m1, m0
    mova      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw     m0, m2
    paddw     m1, m3
    psllw     m0, 2
    psllw     m1, 2
    mova      m2, [r1-1]
    mova      m4, [r1+2]
    mova      m3, m2
    mova      m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw     m2, m4
    paddw     m5, m3
    psubw     m0, m2
    psubw     m1, m5
    pmullw    m0, m6
    pmullw    m1, m6
    movd      m2, [r1-2]
    movd      m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw     m2, m3
    paddw     m4, m5
    mova      m5, [pw_16]
    paddw     m2, m5
    paddw     m4, m5
    paddw     m0, m2
    paddw     m1, m4
    psraw     m0, 5
    psraw     m1, 5
    mova      m4, [r2]
    packuswb  m0, m1
    pavgb     m0, m4
    op_%1     m0, [r0], m4
    add       r0, r3
    add       r1, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg

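; The *_l2 variants compute the same 6-tap horizontal filter and then merge
; the result with a second 8-bit prediction plane via pavgb before the
; put/avg store; in scalar form (a sketch):
;
;   dst[x] = (filt(src, x) + src2[x] + 1) >> 1
;
; pavgb rounds the average up, which matches H.264 quarter-pel positions that
; average a half-pel result with a neighbouring full- or half-pel sample.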
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov       r5d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    lddqu     m1, [r1-2]
    mova      m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m4, m0, 2
    palignr   m3, m0, 4
    palignr   m2, m0, 6
    palignr   m1, m0, 8
    palignr   m5, m0, 10
    paddw     m0, m5
    paddw     m2, m3
    paddw     m1, m4
    psllw     m2, 2
    movh      m3, [r2]
    psubw     m2, m1
    paddw     m0, [pw_16]
    pmullw    m2, m6
    paddw     m2, m0
    psraw     m2, 5
    packuswb  m2, m2
    pavgb     m2, m3
    op_%1h    m2, [r0], m4
    add       r1, r3
    add       r0, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg


; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
%macro FILT_V 1
    mova      m6, m2
    movh      m5, [r1]
    paddw     m6, m3
    psllw     m6, 2
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, [pw_16]
    add       r1, r3
    paddw     m0, m5
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6
    op_%1h    m6, [r0], m0 ; 1
    add       r0, r2
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub       r1, r3
    sub       r1, r3
    pxor      m7, m7
    movh      m0, [r1]
    movh      m1, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m2, [r1]
    movh      m3, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m4, [r1]
    add       r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg


%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub       r1, r3
    sub       r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
%endif
    pxor      m7, m7
    movh      m0, [r1]
    movh      m1, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m2, [r1]
    movh      m3, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m4, [r1]
    add       r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    cmp       r4d, 16
    jne .end
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
.end:
    REP_RET
%endmacro

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg


; All functions that use this are required to have args:
; src, tmp, srcStride
%macro FILT_HV 1 ; offset
    mova      m6, m2
    movh      m5, [r0]
    paddw     m6, m3
    psllw     m6, 2
    paddw     m0, [pw_16]
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, m5
    add       r0, r2
    paddw     m6, m0
    mova      [r1+%1], m6
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

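; The hv_lowpass functions run in two passes. FILT_HV above is the first
; (vertical) pass: it stores the unshifted 6-tap sums, plus the rounding
; constant 16, as 16-bit words into tmp (row stride 24 bytes for the 4-wide
; case, 48 bytes for 8or16). The second (horizontal) pass below filters those
; words again; to stay within 16-bit arithmetic it uses the factorization
; (scalar sketch; a, b, c are the paired taps t[-2]+t[3], t[-1]+t[2],
; t[0]+t[1]):
;
;   ((((a - b) >> 2) - b + c) >> 2) + c  ~=  (a - 5*b + 20*c) >> 4
;
; so the final ">> 6" completes the (x + 512) >> 10 normalization, with the
; rounding supplied by the +16 folded in during the first pass.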
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn r2, r2d
    pxor      m7, m7
    movh      m0, [r0]
    movh      m1, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m2, [r0]
    movh      m3, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m4, [r0]
    add       r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV 0*24
    FILT_HV 1*24
    FILT_HV 2*24
    FILT_HV 3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn r2, r2d
    mov       r3d, 4
.loop:
    mova      m0, [r0]
    paddw     m0, [r0+10]
    mova      m1, [r0+2]
    paddw     m1, [r0+8]
    mova      m2, [r0+4]
    paddw     m2, [r0+6]
    psubw     m0, m1
    psraw     m0, 2
    psubw     m0, m1
    paddsw    m0, m2
    psraw     m0, 2
    paddw     m0, m2
    psraw     m0, 6
    packuswb  m0, m0
    op_%1h    m0, [r1], m7
    add       r0, 24
    add       r1, r2
    dec       r3d
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg

%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn r2, r2d
    pxor      m7, m7
    movh      m0, [r0]
    movh      m1, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m2, [r0]
    movh      m3, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m4, [r0]
    add       r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV 0*48
    FILT_HV 1*48
    FILT_HV 2*48
    FILT_HV 3*48
    FILT_HV 4*48
    FILT_HV 5*48
    FILT_HV 6*48
    FILT_HV 7*48
    cmp       r3d, 16
    jne .end
    FILT_HV 8*48
    FILT_HV 9*48
    FILT_HV 10*48
    FILT_HV 11*48
    FILT_HV 12*48
    FILT_HV 13*48
    FILT_HV 14*48
    FILT_HV 15*48
.end:
    REP_RET
%endmacro

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put


%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn r2, r2d
.loop:
    mova      m0, [r1]
    mova      m3, [r1+8]
    mova      m1, [r1+2]
    mova      m4, [r1+10]
    paddw     m0, m4
    paddw     m1, m3
    paddw     m3, [r1+18]
    paddw     m4, [r1+16]
    mova      m2, [r1+4]
    mova      m5, [r1+12]
    paddw     m2, [r1+6]
    paddw     m5, [r1+14]
    psubw     m0, m1
    psubw     m3, m4
    psraw     m0, 2
    psraw     m3, 2
    psubw     m0, m1
    psubw     m3, m4
    paddsw    m0, m2
    paddsw    m3, m5
    psraw     m0, 2
    psraw     m3, 2
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 6
    psraw     m3, 6
    packuswb  m0, m3
    op_%1     m0, [r0], m7
    add       r1, 48
    add       r0, r2
    dec       r4d
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg

%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp       r4d, 16
    je .op16
.loop8:
    mova      m1, [r1+16]
    mova      m0, [r1]
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m5, m0, 10
    palignr   m4, m0, 8
    palignr   m3, m0, 6
    palignr   m2, m0, 4
    palignr   m1, m0, 2
    paddw     m0, m5
    paddw     m1, m4
    paddw     m2, m3
    psubw     m0, m1
    psraw     m0, 2
    psubw     m0, m1
    paddw     m0, m2
    psraw     m0, 2
    paddw     m0, m2
    psraw     m0, 6
    packuswb  m0, m0
    op_%1h    m0, [r0], m7
    add       r1, 48
    add       r0, r2
    dec       r4d
    jne .loop8
    jmp .done
.op16:
    mova      m4, [r1+32]
    mova      m5, [r1+16]
    mova      m7, [r1]
    mova      m3, m4
    mova      m2, m4
    mova      m1, m4
    mova      m0, m4
    palignr   m0, m5, 10
    palignr   m1, m5, 8
    palignr   m2, m5, 6
    palignr   m3, m5, 4
    palignr   m4, m5, 2
    paddw     m0, m5
    paddw     m1, m4
    paddw     m2, m3
    mova      m6, m5
    mova      m4, m5
    mova      m3, m5
    palignr   m4, m7, 8
    palignr   m6, m7, 2
    palignr   m3, m7, 10
    paddw     m4, m6
    mova      m6, m5
    palignr   m5, m7, 6
    palignr   m6, m7, 4
    paddw     m3, m7
    paddw     m5, m6
    psubw     m0, m1
    psubw     m3, m4
    psraw     m0, 2
    psraw     m3, 2
    psubw     m0, m1
    psubw     m3, m4
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 2
    psraw     m3, 2
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 6
    psraw     m3, 6
    packuswb  m3, m0
    op_%1     m3, [r0], m7
    add       r1, 48
    add       r0, r2
    dec       r4d
    jne .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg

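; The pixels*_l2_shift5 functions handle the quarter-pel positions that mix
; the hv plane with a second prediction: they take the 16-bit first-pass
; words from tmp (row stride 48 bytes, or 24 for the 4-wide variant), shift
; them down by 5 (hence "shift5"), clip to 8 bits, and pavgb the result with
; the 8-bit src8 plane before applying the put/avg store.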
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5, 6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mova      m0, [r1]
    mova      m1, [r1+24]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m0
    packuswb  m1, m1
    pavgb     m0, [r2]
    pavgb     m1, [r2+r4]
    op_%1h    m0, [r0], m4
    op_%1h    m1, [r0+r3], m5
    lea       r2, [r2+r4*2]
    lea       r0, [r0+r3*2]
    mova      m0, [r1+48]
    mova      m1, [r1+72]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m0
    packuswb  m1, m1
    pavgb     m0, [r2]
    pavgb     m1, [r2+r4]
    op_%1h    m0, [r0], m4
    op_%1h    m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg


%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
.loop:
    mova      m0, [r1]
    mova      m1, [r1+8]
    mova      m2, [r1+48]
    mova      m3, [r1+48+8]
    psraw     m0, 5
    psraw     m1, 5
    psraw     m2, 5
    psraw     m3, 5
    packuswb  m0, m1
    packuswb  m2, m3
    pavgb     m0, [r2]
    pavgb     m2, [r2+r4]
    op_%1     m0, [r0], m4
    op_%1     m2, [r0+r3], m5
    lea       r2, [r2+2*r4]
    add       r1, 48*2
    lea       r0, [r0+2*r3]
    sub       r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg


%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov       r5d, 16
    pxor      m15, m15
    mova      m14, [pw_5]
    mova      m13, [pw_16]
.loop:
    lddqu     m1, [r1+6]
    lddqu     m7, [r1-2]
    mova      m0, m1
    punpckhbw m1, m15
    punpcklbw m0, m15
    punpcklbw m7, m15
    mova      m2, m1
    mova      m6, m0
    mova      m3, m1
    mova      m8, m0
    mova      m4, m1
    mova      m9, m0
    mova      m12, m0
    mova      m11, m1
    palignr   m11, m0, 10
    palignr   m12, m7, 10
    palignr   m4, m0, 2
    palignr   m9, m7, 2
    palignr   m3, m0, 4
    palignr   m8, m7, 4
    palignr   m2, m0, 6
    palignr   m6, m7, 6
    paddw     m11, m0
    palignr   m1, m0, 8
    palignr   m0, m7, 8
    paddw     m7, m12
    paddw     m2, m3
    paddw     m6, m8
    paddw     m1, m4
    paddw     m0, m9
    psllw     m2, 2
    psllw     m6, 2
    psubw     m2, m1
    psubw     m6, m0
    paddw     m11, m13
    paddw     m7, m13
    pmullw    m2, m14
    pmullw    m6, m14
    lddqu     m3, [r2]
    paddw     m2, m11
    paddw     m6, m7
    psraw     m2, 5
    psraw     m6, 5
    packuswb  m6, m2
    pavgb     m6, m3
    op_%1     m6, [r0], m11
    add       r1, r3
    add       r0, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif