;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max pw_1023            ; 10-bit samples: max value 2^10-1
cextern pw_512
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
cextern pd_16

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4  ; plane-mode horizontal weights
pw_m3:        times 8 dw -3
pd_17:        times 4 dd 17

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Derivation: %2 = (left+right)>>1 (truncating), then pavgw with src
; re-adds 1 before the final shift, giving the standard rounded 3-tap filter.
; Note: %2 is clobbered; no EFLAGS are modified by any of these ops.
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
; r0 = src, r1 = topright (dead on entry here: immediately reused as a row
; pointer), r2 = stride in bytes. Gathers the left column, top-left and top
; row into one register, lowpass-filters it, then stores 4 shifted copies.
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2                    ; r0 -> top row (src - stride)
    lea       r1, [r0+r2*2]             ; r1 -> row 1 of the block
    movhps    m1, [r1-8]
    movhps    m2, [r0+r2*1-8]
    movhps    m4, [r0-8]
    punpckhwd m2, m4
    movq      m3, [r0]                  ; top row t0..t3
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1            ; merge left pixels with top row
    movhps    m4, [r1+r2*1-8]
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    ; each successive row is the filtered vector shifted right by one pixel
    movq      [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
;                                   ptrdiff_t stride)
;------------------------------------------------------------------------------
; r0 = src, r2 = stride; r1 (topright) is dead on entry and reused.
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movq    m5, [r0]                    ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1          ; ......t3t2t1t0lt
    pavgw   m5, m0                      ; row 0: avg(top, top shifted by lt)
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1              ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2          ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3          ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12                  ; keep filtered left pixels for rows 2/3
    psrldq  m1, 4
    movq    [r0+r2*1], m5
    movq    [r0+r2*2], m1
    PALIGNR m5, m0, 14, m2
    pslldq  m0, 2
    movq    [r1+r2*1], m5
    PALIGNR m1, m0, 14, m0
    movq    [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
;                                    ptrdiff_t stride)
;-------------------------------------------------------------------------------
; r0 = src, r2 = stride; r1 (topright) is dead on entry and reused.
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]               ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2                    ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8]          ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3                   ; l2 l3
    movq       m2, [r0+r2*2-8]          ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3                   ; l0 l1
    punpckhdq  m1, m2                   ; l0 l1 l2 l3
    punpckhqdq m1, m0                   ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4                ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2                ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3               ; 2-tap averages
    PRED4x4_LOWPASS m3, m1, m0, m3      ; 3-tap filtered values
    punpcklwd  m5, m3                   ; interleave avg/filter pairs
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (sum of 4 top + 4 left pixels + 4) >> 3, broadcast to all 16 pixels.
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]
    paddw  m2, [r0+r2*2-8]
    paddw  m2, [r1+r2*1-8]
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48                       ; left-column sum in low word
    movq   m0, [r0]
    HADDW  m0, m1                       ; top-row sum
    paddw  m0, [pw_4]                   ; rounding bias
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
; r0 = src, r1 = topright (read), r2 = stride.
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub    r0, r2
    movq   m0, [r0]                     ; top row
    movhps m0, [r1]                     ; top-right row in high half
    psrldq m2, m0, 2
    pslldq m3, m0, 2
    pshufhw m2, m2, 10100100b           ; duplicate last pixel for the edge tap
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea    r1, [r0+r2*2]
    movhps [r1+r2*2], m0
    psrldq m0, 2
    movq   [r0+r2*1], m0
    psrldq m0, 2
    movq   [r0+r2*2], m0
    psrldq m0, 2
    movq   [r1+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
; r0 = src, r1 = topright (read), r2 = stride.
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub    r0, r2
    movu   m1, [r0]                     ; top row
    movhps m1, [r1]                     ; top-right row in high half
    psrldq m0, m1, 2
    psrldq m2, m1, 4
    pavgw  m4, m0, m1                   ; even rows: 2-tap average
    PRED4x4_LOWPASS m0, m1, m2, m0      ; odd rows: 3-tap filter
    lea    r1, [r0+r2*2]
    movq   [r0+r2*1], m4
    movq   [r0+r2*2], m0
    psrldq m4, 2
    psrldq m0, 2
    movq   [r1+r2*1], m4
    movq   [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
; MMX version: r0 = src, r2 = stride; r1 (topright) is dead and reused.
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1                    ; l0 l1 l2 l3
    pshufw    m1, m1, 0xFF              ; broadcast l3 (bottom-row filler)
    movq      [r1+r2*2], m1
    movd      [r1+r2*1+4], m1
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq      m6, m2
    punpcklwd m6, m1                    ; interleave averages and filtered taps
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2
    RET



;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Copies the top row (16 bytes = 8 x 10-bit pixels) into all 8 rows.
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub  r0, r1
    mova m0, [r0]
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Broadcasts each row's left neighbour across that row, two rows per iteration.
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov r2d, 4
.loop:
    movq       m0, [r0+r1*0-8]
    movq       m1, [r0+r1*1-8]
    pshuflw    m0, m0, 0xff             ; replicate left pixel in low half
    pshuflw    m1, m1, 0xff
    punpcklqdq m0, m0                   ; then across the full register
    punpcklqdq m1, m1
    mova       [r0+r1*0], m0
    mova       [r0+r1*1], m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; sort of a hack, but it works
; %3 is accepted but unused: one 16-byte store covers the whole 8-pixel row.
%macro MOV8 2-3
    movdqa [%1], %2
%endmacro

%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1
    pxor        m4, m4
    ; s0/s1 = sums of the two 4-pixel halves of the top row
    movq        m0, [r0+0]
    movq        m1, [r0+8]
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    %1          m2, m0, 00001110b
    paddw       m0, m2

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    ; s2 = sum of upper 4 left pixels (scalar adds)
    movzx       r2d, word [r0+r1*1-2]
    movzx       r3d, word [r0+r1*2-2]
    add         r2d, r3d
    movzx       r3d, word [r0+r5*1-2]
    add         r2d, r3d
    movzx       r3d, word [r4-2]
    add         r2d, r3d
    movd        m2, r2d                 ; s2

    ; s3 = sum of lower 4 left pixels
    movzx       r2d, word [r4+r1*1-2]
    movzx       r3d, word [r4+r1*2-2]
    add         r2d, r3d
    movzx       r3d, word [r4+r5*1-2]
    add         r2d, r3d
    movzx       r3d, word [r4+r1*4-2]
    add         r2d, r3d
    movd        m3, r2d                 ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2                  ; s0, s1, s2, s3
    %1          m3, m0, 11110110b       ; s2, s1, s3, s3
    %1          m0, m0, 01110100b       ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4                  ; s0+s2, s1, s3, s1+s3
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP        0,1
    MOV8        r0+r1*1, m1, m2
    MOV8        r0+r1*2, m1, m2
    MOV8        r0+r5*1, m1, m2
    MOV8        r0+r1*4, m1, m2
    MOV8        r4+r1*1, m3, m4
    MOV8        r4+r1*2, m3, m4
    MOV8        r4+r5*1, m3, m4
    MOV8        r4+r1*4, m3, m4
    RET
%endmacro

INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC per 4-pixel half of the top row: (sum + 2) >> 2, replicated down.
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub     r0, r1
    mova    m0, [r0]
    pshuflw m1, m0, 0x4e                ; pairwise horizontal sums...
    pshufhw m1, m1, 0x4e
    paddw   m0, m1
    pshuflw m1, m0, 0xb1
    pshufhw m1, m1, 0xb1
    paddw   m0, m1                      ; ...each half now holds its own sum
    lea     r2, [r1*3]
    lea     r3, [r0+r1*4]
    paddw   m0, [pw_2]
    psrlw   m0, 2
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    mova [r0+r2*1], m0
    mova [r0+r1*4], m0
    mova [r3+r1*1], m0
    mova [r3+r1*2], m0
    mova [r3+r2*1], m0
    mova [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    ; H = weighted sum of top-row gradients
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14
    psubw     m2, m0                    ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                     ; 16*(src[7*stride-1] + src[-stride+7])
    ; V = weighted sum of left-column gradients (scalar)
    movzx    r4d, word [r3+r1*1-2]      ; src[4*stride-1]
    movzx    r5d, word [r0+r2*1-2]      ; src[2*stride-1]
    sub      r4d, r5d
    movzx    r6d, word [r3+r1*2-2]      ; src[5*stride-1]
    movzx    r5d, word [r0+r1*2-2]      ; src[1*stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*2]
    movzx    r5d, word [r3+r2*1-2]      ; src[6*stride-1]
    movzx    r6d, word [r0+r1*1-2]      ; src[0*stride-1]
    sub      r5d, r6d
    lea      r5d, [r5*3]
    add      r4d, r5d
    movzx    r6d, word [r3+r1*4-2]      ; src[7*stride-1]
    movzx    r5d, word [r0+r1*0-2]      ; src[ -stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*4]
    movd      m3, r4d                   ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                     ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]        ; b
    pmullw    m5, m4, [pw_m3]           ; c
    paddw     m5, [pw_16]
    mov      r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3                ; clamp to [0, pw_pixel_max]
    mova    [r0], m6
    paddw     m5, m4                    ; advance vertical term by c
    add       r0, r1
    dec      r2d
    jg .loop
    REP_RET


;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the 8x8 block with the mid-grey value; neighbours are ignored.
INIT_XMM sse2
cglobal pred8x8l_128_dc_10, 4, 4
    mova      m0, [pw_512]              ; (1<<(BIT_DEPTH-1))
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; NOTE(review): the shr 14/13 of has_topleft/has_topright assumes the caller
; encodes availability as bit masks yielding 0/1 here -- confirm against callers.
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0          ; left edge tap: topleft if available
    pinsrw      m2, [r0+r2+14], 7       ; right edge tap: topright if available
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3                   ; DC = (sum of filtered top + 4) >> 3
    SPLATW      m0, m0, 0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
;                        ptrdiff_t stride)
;-------------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    ; gather the 8 left-column pixels into m3 via word/dword/qword interleaves
    mova        m0, [r0+r3*2-16]
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    not         r1
    and         r1, r3
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b
    pinsrw      m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3      ; filtered left column
    PRED4x4_LOWPASS m0, m2, m1, m0      ; filtered top row
    paddw       m0, m3
    HADDW       m0, m1
    paddw       m0, [pw_8]
    psrlw       m0, 4                   ; DC = (top + left + 8) >> 4
    SPLATW      m0, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r5*1], m0
    mova [r0+r3*4], m0
    mova [r4+r3*1], m0
    mova [r4+r3*2], m0
    mova [r4+r5*1], m0
    mova [r4+r3*4], m0
    RET
%endmacro
INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Lowpass-filters the top row (edge taps picked by availability flags) and
; replicates it into all 8 rows.
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0          ; left tap: topleft if available
    pinsrw      m2, [r0+r2+14], 7       ; right tap: topright if available
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(pixel *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Filters the left column and broadcasts each filtered pixel across its row.
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova        m0, [r0-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3                  ; r1 = -stride if topleft available, else 0
    punpckhwd   m0, [r0+r1-16]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1                  ; 8 left pixels packed into m3
    PALIGNR     m4, m3, [r2+r1-16], 14, m0
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd   m3, m4, m4
    punpcklwd   m4, m4
    ; broadcast each filtered left pixel across its own row
    pshufd      m0, m3, 0xff
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova [r0+r3*0], m0
    mova [r0+r3*1], m1
    mova [r0+r3*2], m2
    mova [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova [r2+r3*0], m0
    mova [r2+r3*1], m1
    mova [r2+r3*2], m2
    mova [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub         r0, r3
    mova        m3, [r0]
    shr        r1d, 14
    neg         r1
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    ; SIMD ops above do not touch EFLAGS, so ZF from 'shr r2d' is still live:
    jz .fix_tr                          ; flags from shr r2d
    mova        m1, [r0+16]             ; topright row present: filter it too
    psrldq      m5, m1, 2
    PALIGNR     m2, m1, m3, 14, m3
    pshufhw     m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea         r1, [r3*3]
    psrldq      m5, m1, 14
    lea         r2, [r0+r3*4]
    PALIGNR     m2, m1, m6, 2, m0
    PALIGNR     m3, m1, m6, 14, m0
    PALIGNR     m5, m1, 2, m0
    pslldq      m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    ; emit rows bottom-up, shifting the diagonal one pixel per row
    mova [r2+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*2], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m1
    PALIGNR     m1, m6, 14, m6
    mova [r0+r3*1], m1
    RET
.fix_tr:
    ; no topright: replicate the last top pixel across the topright vector
    punpckhwd   m3, m3
    pshufd      m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; pack the 8 left pixels into m3
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova        m3, [r0]
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3      ; filtered top row
    PALIGNR     m2, m3, m6, 2, m0
    PALIGNR     m5, m3, m6, 14, m0
    psrldq      m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    ; emit the diagonal, shifting in one left pixel per row
    mova [r4+r3*4], m6
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*2], m3
    PALIGNR     m3, m6, 14, m6
    mova [r4+r1*1], m3
    RET
%endmacro
INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right: has_topleft is guaranteed by the standard
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; pack the 8 left pixels into m3
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3      ; filtered left column
    mova        m2, [r0]
    shr        r2d, 13
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2      ; filtered top row
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5      ; odd rows: 3-tap filter
    pavgw       m2, m5                  ; even rows: 2-tap average
    mova [r0+r3*2], m0
    mova [r0+r3*1], m2
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    ; remaining rows: shift in one filtered left pixel per row, alternating
    ; between the averaged (m2) and filtered (m0) base vectors
    PALIGNR     m2, m1, 14, m4
    mova [r0+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r0+r3*4], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r3*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r4+r3*2], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova        m0, [r0+r3*0-16]
    punpckhwd   m0, [r0+r3*1-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3                  ; r1 = -stride if topleft available, else 0
    mova        m4, [r0+r1*1-16]
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2                  ; 8 left pixels packed into m0
    PALIGNR     m1, m0, m4, 14, m4
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b       ; replicate last pixel for the edge tap
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b
    pshufhw     m2, m2, 01010100b
    pavgw       m4, m0, m1              ; 2-tap averages
    PRED4x4_LOWPASS m1, m2, m0, m1      ; 3-tap filtered values
    punpckhwd   m5, m4, m1              ; interleave avg/filter pairs
    punpcklwd   m4, m1
    mova [r2+r3*0], m5
    mova [r0+r3*0], m4
    ; lower rows saturate toward the last left pixel
    pshufd      m0, m5, 11111001b
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b
    mova [r2+r3*1], m0
    mova [r2+r3*2], m1
    mova [r2+r1*1], m2
    PALIGNR     m2, m5, m4, 4, m0
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova [r0+r3*1], m2
    mova [r0+r3*2], m3
    mova [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif


;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Stores one 16-pixel (32-byte) row from two XMM registers; the optional
; extra args are accepted but unused with 16-byte vectors.
%macro MOV16 3-5
    mova [%1+ 0], %2
    mova [%1+mmsize], %3
%endmacro

INIT_XMM sse2
cglobal pred16x16_vertical_10, 2, 3
    sub   r0, r1
    mov  r2d, 8                         ; 8 iterations x 2 rows = 16 rows
    mova  m0, [r0+ 0]
    mova  m1, [r0+mmsize]
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec  r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Broadcasts each row's left neighbour across the 16-pixel row.
INIT_XMM sse2
cglobal pred16x16_horizontal_10, 2, 3
    mov  r2d, 8
.vloop:
    movd   m0, [r0+r1*0-4]
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1                    ; word 1 = pixel at src[-1]
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (sum of 16 top + 16 left pixels + 16) >> 5, broadcast everywhere.
INIT_XMM sse2
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
    HADDW      m0, m2                   ; top-row sum

    ; left-column sum accumulated in two scalar chains (r3d/r4d)
    lea        r0, [r0+r1-2]
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]           ; left sum + rounding bias

    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5
    SPLATW     m0, m0
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (sum of 16 top pixels + 8) >> 4.
INIT_XMM sse2
cglobal pred16x16_top_dc_10, 2, 3
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
    HADDW      m0, m2

    SPLATW     m0, m0
    paddw      m0, [pw_8]               ; rounding bias, per lane after splat
    psrlw      m0, 4
    mov       r2d, 8
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (sum of 16 left pixels + 8) >> 4.
INIT_XMM sse2
cglobal pred16x16_left_dc_10, 2, 6
    mov        r5, r0

    sub        r0, 2
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+8]
    shr       r3d, 4

    movd       m0, r3d
    SPLATW     m0, m0
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the 16x16 block with the mid-grey value (1<<(BIT_DEPTH-1)).
INIT_XMM sse2
cglobal pred16x16_128_dc_10, 2,3
    mova       m0, [pw_512]
    mov       r2d, 8
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET