; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
cextern pw_255
cextern pw_512
cextern pw_2048
cextern pw_8192
cextern pw_1023
cextern pw_1024
cextern pw_4096
%define pw_8  pw_512
%define pw_10 pw_2048
%define pw_12 pw_8192
%define pw_bi_10 pw_1024
%define pw_bi_12 pw_4096
%define max_pixels_8  pw_255
%define max_pixels_10 pw_1023
pw_bi_8:        times 16 dw (1 << 8)
max_pixels_12:  times 16 dw ((1 << 12)-1)
cextern pd_1
cextern pb_0

%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                        times %2 d%3 10, -2
                        times %2 d%3 -4, 54
                        times %2 d%3 16, -2
                        times %2 d%3 -6, 46
                        times %2 d%3 28, -4
                        times %2 d%3 -4, 36
                        times %2 d%3 36, -4
                        times %2 d%3 -4, 28
                        times %2 d%3 46, -6
                        times %2 d%3 -2, 16
                        times %2 d%3 54, -4
                        times %2 d%3 -2, 10
                        times %2 d%3 58, -2
%endmacro


EPEL_TABLE  8, 16, b, avx2
EPEL_TABLE 10,  8, w, avx2

EPEL_TABLE  8,  8, b, sse4
EPEL_TABLE 10,  4, w, sse4
EPEL_TABLE 12,  4, w, sse4

%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
                        times %2 d%3 -10, 58
                        times %2 d%3  17, -5
                        times %2 d%3   1,  0
                        times %2 d%3  -1,  4
                        times %2 d%3 -11, 40
                        times %2 d%3  40,-11
                        times %2 d%3   4, -1
                        times %2 d%3   0,  1
                        times %2 d%3  -5, 17
                        times %2 d%3  58,-10
                        times %2 d%3   4, -1
%endmacro

QPEL_TABLE  8,  8, b, sse4
QPEL_TABLE 10,  4, w, sse4
QPEL_TABLE 12,  4, w, sse4

QPEL_TABLE  8, 16, b, avx2
QPEL_TABLE 10,  8, w, avx2

QPEL_TABLE  4,  1, b, avx512icl_h
QPEL_TABLE  8,  1, b, avx512icl_h
QPEL_TABLE  8,  1, d, avx512icl_v
QPEL_TABLE 16,  1, b, avx512icl_h
QPEL_TABLE 32,  1, b, avx512icl_h
QPEL_TABLE 64,  1, b, avx512icl_h

pb_qpel_shuffle_index: db  0,  1,  2,  3
                       db  1,  2,  3,  4
                       db  2,  3,  4,  5
                       db  3,  4,  5,  6
                       db  4,  5,  6,  7
                       db  5,  6,  7,  8
                       db  6,  7,  8,  9
                       db  7,  8,  9, 10
                       db  8,  9, 10, 11
                       db  9, 10, 11, 12
                       db 10, 11, 12, 13
                       db 11, 12, 13, 14
                       db 12, 13, 14, 15
                       db 13, 14, 15, 16
                       db 14, 15, 16, 17
                       db 15, 16, 17, 18
                       db  4,  5,  6,  7
                       db  5,  6,  7,  8
                       db  6,  7,  8,  9
                       db  7,  8,  9, 10
                       db  8,  9, 10, 11
                       db  9, 10, 11, 12
                       db 10, 11, 12, 13
                       db 11, 12, 13, 14
                       db 12, 13, 14, 15
                       db 13, 14, 15, 16
                       db 14, 15, 16, 17
                       db 15, 16, 17, 18
                       db 16, 17, 18, 19
                       db 17, 18, 19, 20
                       db 18, 19, 20, 21
                       db 19, 20, 21, 22

SECTION .text

%define MAX_PB_SIZE 64

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10

%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10

%if ARCH_X86_64

%macro SIMPLE_BILOAD 4   ; width, tab, r1, r2
%if %1 <= 4
    movq   %3, [%2]      ; load data from source2
%elif %1 <= 8
    movdqa %3, [%2]      ; load data from source2
%elif %1 <= 12
%if cpuflag(avx2)
    mova   %3, [%2]
%else
    movdqa %3, [%2]      ; load data from source2
    movq   %4, [%2+16]   ; load data from source2
%endif ;avx
%elif %1 <= 16
%if cpuflag(avx2)
    mova   %3, [%2]
%else
    movdqa %3, [%2]      ; load data from source2
    movdqa %4, [%2+16]   ; load data from source2
%endif ; avx
%else ; %1 = 32
    mova   %3, [%2]
    mova   %4, [%2+32]
%endif
%endmacro

%macro SIMPLE_LOAD 4    ; width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd   %4, [%3]      ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq   %4, [%3]      ; load data from source
%elif notcpuflag(avx)
    movu   %4, [%3]      ; load data from source
%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
    movdqu %4, [%3]
%else
    movu   %4, [%3]
%endif
%endmacro


%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
%if cpuflag(avx2)
%assign %%offset 32
%ifdef PIC
    lea %5q, [hevc_epel_filters_avx2_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_avx2_%1
%endif
%else
%assign %%offset 16
%ifdef PIC
    lea %5q, [hevc_epel_filters_sse4_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_sse4_%1
%endif
%endif ;cpuflag(avx2)
    sub %2q, 1
%if cpuflag(avx2)
    shl %2q, 6                          ; multiply by 64
%else
    shl %2q, 5                          ; multiply by 32
%endif
    mova %3, [FILTER + %2q]             ; get 2 first values of filters
    mova %4, [FILTER + %2q+%%offset]    ; get 2 last values of filters
%endmacro

%macro EPEL_HV_FILTER 1
%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  6
%define %%table  hevc_epel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  5
%define %%table  hevc_epel_filters_sse4_%1
%endif

%ifdef PIC
    lea r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    sub mxq, 1
    sub myq, 1
    shl mxq, %%shift                    ; multiply by 32 (avx2: 64)
    shl myq, %%shift                    ; multiply by 32 (avx2: 64)
    mova m14, [FILTER + mxq]            ; get 2 first values of filters
    mova m15, [FILTER + mxq+%%offset]   ; get 2 last values of filters

%if cpuflag(avx2)
%define %%table hevc_epel_filters_avx2_10
%else
%define %%table hevc_epel_filters_sse4_10
%endif
%ifdef PIC
    lea r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    mova m12, [FILTER + myq]            ; get 2 first values of filters
    mova m13, [FILTER + myq+%%offset]   ; get 2 last values of filters
    lea r3srcq, [srcstrideq*3]
%endmacro

%macro QPEL_FILTER 2

%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  7
%define %%table  hevc_qpel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  6
%define %%table  hevc_qpel_filters_sse4_%1
%endif

%ifdef PIC
    lea rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif
    sub %2q, 1
    shl %2q, %%shift                           ; multiply by 64 (avx2: 128)
    mova m12, [rfilterq + %2q]                 ; get 4 first values of filters
    mova m13, [rfilterq + %2q +   %%offset]    ; get 4 next values of filters
    mova m14, [rfilterq + %2q + 2*%%offset]    ; get 4 next values of filters
    mova m15, [rfilterq + %2q + 3*%%offset]    ; get 4 last values of filters
%endmacro

%macro EPEL_LOAD 4
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
%define %%load movdqu
%endif

    %%load m0, [%2q      ]
%ifnum %3
    %%load m1, [%2q+   %3]
    %%load m2, [%2q+ 2*%3]
    %%load m3, [%2q+ 3*%3]
%else
    %%load m1, [%2q+   %3q]
    %%load m2, [%2q+ 2*%3q]
    %%load m3, [%2q+r3srcq]
%endif
%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
%else
    punpcklbw m0, m1
    punpcklbw m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 7
    SBUTTERFLY wd, 2, 3, 7
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
%endif
%endif
%endmacro


%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
%define %%load movd
%elif %3 == 8
%define %%load movq
%else
%define %%load movu
%endif
%else
%if %3 == 2
%define %%load movd
%elif %3 == 4
%define %%load movq
%else
%define %%load movu
%endif
%endif
    %%load m0, [%2-3*%%stride]    ; load data from source
    %%load m1, [%2-2*%%stride]
    %%load m2, [%2-%%stride  ]
    %%load m3, [%2           ]
    %%load m4, [%2+%%stride  ]
    %%load m5, [%2+2*%%stride]
    %%load m6, [%2+3*%%stride]
    %%load m7, [%2+4*%%stride]

%if %1 == 8
%if %3 > 8
    SBUTTERFLY wd, 0, 1, %4
    SBUTTERFLY wd, 2, 3, %4
    SBUTTERFLY wd, 4, 5, %4
    SBUTTERFLY wd, 6, 7, %4
%else
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY dq, 0, 1, %4
    SBUTTERFLY dq, 2, 3, %4
    SBUTTERFLY dq, 4, 5, %4
    SBUTTERFLY dq, 6, 7, %4
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
%endif
%endif
%endmacro

%macro QPEL_V_LOAD 5
    lea  %5q, [%2]
    sub  %5q, r3srcq
    movu m0, [%5q       ]    ; load x- 3*srcstride
    movu m1, [%5q+   %3q]    ; load x- 2*srcstride
    movu m2, [%5q+ 2*%3q]    ; load x-srcstride
    movu m3, [%2        ]    ; load x
    movu m4, [%2+    %3q]    ; load x+stride
    movu m5, [%2+  2*%3q]    ; load x+2*stride
    movu m6, [%2+ r3srcq]    ; load x+3*stride
    movu m7, [%2+  4*%3q]    ; load x+4*stride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 8
    SBUTTERFLY bw, 2, 3, 8
    SBUTTERFLY bw, 4, 5, 8
    SBUTTERFLY bw, 6, 7, 8
%else
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
%endif
%endif
%endmacro

%macro PEL_12STORE2 3
    movd [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq   [%1], %2
    psrldq %2, 8
    movd   [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa [%1], %2
    movq   [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8 %1, %2, %3
    movdqa [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq   [%1], %2
    psrldq %2, 8
    movd   [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa [%1], %2
    movq   [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
%if cpuflag(avx2)
    movu [%1], %2
%else
    PEL_10STORE8 %1, %2, %3
    movdqa [%1+16], %3
%endif
%endmacro

%macro PEL_10STORE32 3
    PEL_10STORE16 %1, %2, %3
    movu [%1+32], %3
%endmacro
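
; Note on the store helpers: PEL_<depth>STORE<width> writes one row of <width>
; output samples at the given bit depth (packed bytes for 8 bit, words for
; 10/12 bit). The plain put_* functions always store through PEL_10STORE*,
; since their destination is the int16_t intermediate buffer, while the
; uni/bi variants use PEL_%2STORE* to match the final output bit depth.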

%macro PEL_8STORE2 3
    pextrw [%1], %2, 0
%endmacro
%macro PEL_8STORE4 3
    movd [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd   [%1], %2
    pextrw [%1+4], %2, 2
%endmacro
%macro PEL_8STORE8 3
    movq [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq   [%1], %2
    psrldq %2, 8
    movd   [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
%if cpuflag(avx2)
    movdqu [%1], %2
%else
    mova   [%1], %2
%endif ; avx
%endmacro
%macro PEL_8STORE32 3
    movu [%1], %2
%endmacro

%macro LOOP_END 3
    add %1q, 2*MAX_PB_SIZE    ; dst += dststride
    add %2q, %3q              ; src += srcstride
    dec heightd               ; cmp height
    jnz .loop                 ; height loop
%endmacro


%macro MC_PIXEL_COMPUTE 2-3 ; width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && %0 == 3
%if %1 > 16
    vextracti128 xm1, m0, 1
    pmovzxbw m1, xm1
    psllw    m1, 14-%2
%endif
    pmovzxbw m0, xm0
%else ; not avx
%if %1 > 8
    punpckhbw m1, m0, m2
    psllw     m1, 14-%2
%endif
    punpcklbw m0, m2
%endif
%endif ;avx
    psllw m0, 14-%2
%endmacro

%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
%if %0 == 8
%define %%reg0 %5
%define %%reg2 %6
%define %%reg1 %7
%define %%reg3 %8
%else
%define %%reg0 m0
%define %%reg2 m2
%define %%reg1 m1
%define %%reg3 m3
%endif
%if %1 == 8
%if cpuflag(avx2) && (%0 == 5)
%if %2 > 16
    vperm2i128 m10, m0, m1, q0301
%endif
    vinserti128 m0, m0, xm1, 1
    mova        m1, m10
%if %2 > 16
    vperm2i128 m10, m2, m3, q0301
%endif
    vinserti128 m2, m2, xm3, 1
    mova        m3, m10
%endif
    pmaddubsw %%reg0, %3    ; x1*c1+x2*c2
    pmaddubsw %%reg2, %4    ; x3*c3+x4*c4
    paddw     %%reg0, %%reg2
%if %2 > 8
    pmaddubsw %%reg1, %3
    pmaddubsw %%reg3, %4
    paddw     %%reg1, %%reg3
%endif
%else
    pmaddwd %%reg0, %3
    pmaddwd %%reg2, %4
    paddd   %%reg0, %%reg2
%if %2 > 4
    pmaddwd %%reg1, %3
    pmaddwd %%reg3, %4
    paddd   %%reg1, %%reg3
%if %1 != 8
    psrad   %%reg1, %1-8
%endif
%endif
%if %1 != 8
    psrad   %%reg0, %1-8
%endif
    packssdw %%reg0, %%reg1
%endif
%endmacro

%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx

%if cpuflag(avx2)
%assign %%offset 32
%define %%table  hevc_qpel_filters_avx2_%2
%else
%assign %%offset 16
%define %%table  hevc_qpel_filters_sse4_%2
%endif

%ifdef PIC
    lea rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif

%if %2 == 8
    pmaddubsw m0, [rfilterq + %3q*8           ]    ; x1*c1+x2*c2
    pmaddubsw m2, [rfilterq + %3q*8+  %%offset]    ; x3*c3+x4*c4
    pmaddubsw m4, [rfilterq + %3q*8+2*%%offset]    ; x5*c5+x6*c6
    pmaddubsw m6, [rfilterq + %3q*8+3*%%offset]    ; x7*c7+x8*c8
    paddw     m0, m2
    paddw     m4, m6
    paddw     m0, m4
%else
    pmaddwd m0, [rfilterq + %3q*8           ]
    pmaddwd m2, [rfilterq + %3q*8+  %%offset]
    pmaddwd m4, [rfilterq + %3q*8+2*%%offset]
    pmaddwd m6, [rfilterq + %3q*8+3*%%offset]
    paddd   m0, m2
    paddd   m4, m6
    paddd   m0, m4
%if %2 != 8
    psrad   m0, %2-8
%endif
%if %1 > 4
    pmaddwd m1, [rfilterq + %3q*8           ]
    pmaddwd m3, [rfilterq + %3q*8+  %%offset]
    pmaddwd m5, [rfilterq + %3q*8+2*%%offset]
    pmaddwd m7, [rfilterq + %3q*8+3*%%offset]
    paddd   m1, m3
    paddd   m5, m7
    paddd   m1, m5
%if %2 != 8
    psrad   m1, %2-8
%endif
%endif
    p%4     m0, m1
%endif
%endmacro

%macro QPEL_COMPUTE 2-3 ; width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && (%0 == 3)

    vperm2i128  m10, m0, m1, q0301
    vinserti128 m0, m0, xm1, 1
    SWAP 1, 10

    vperm2i128  m10, m2, m3, q0301
    vinserti128 m2, m2, xm3, 1
    SWAP 3, 10


    vperm2i128  m10, m4, m5, q0301
    vinserti128 m4, m4, xm5, 1
    SWAP 5, 10

    vperm2i128  m10, m6, m7, q0301
    vinserti128 m6, m6, xm7, 1
    SWAP 7, 10
%endif

    pmaddubsw m0, m12    ; x1*c1+x2*c2
    pmaddubsw m2, m13    ; x3*c3+x4*c4
    pmaddubsw m4, m14    ; x5*c5+x6*c6
    pmaddubsw m6, m15    ; x7*c7+x8*c8
    paddw     m0, m2
    paddw     m4, m6
    paddw     m0, m4
%if %1 > 8
    pmaddubsw m1, m12
    pmaddubsw m3, m13
    pmaddubsw m5, m14
    pmaddubsw m7, m15
    paddw     m1, m3
    paddw     m5, m7
    paddw     m1, m5
%endif
%else
    pmaddwd m0, m12
    pmaddwd m2, m13
    pmaddwd m4, m14
    pmaddwd m6, m15
    paddd   m0, m2
    paddd   m4, m6
    paddd   m0, m4
%if %2 != 8
    psrad   m0, %2-8
%endif
%if %1 > 4
    pmaddwd m1, m12
    pmaddwd m3, m13
    pmaddwd m5, m14
    pmaddwd m7, m15
    paddd   m1, m3
    paddd   m5, m7
    paddd   m1, m5
%if %2 != 8
    psrad   m1, %2-8
%endif
%endif
%endif
%endmacro

%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
    paddsw %3, %5
%if %1 > 8
    paddsw %4, %6
%endif
    UNI_COMPUTE %1, %2, %3, %4, %7
%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
    vpermq %3, %3, 216
    vpermq %4, %4, 216
%endif
%endmacro

%macro UNI_COMPUTE 5
    pmulhrsw %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw %4, %5
%endif
%if %2 == 8
    packuswb %3, %4
%else
    CLIPW %3, [pb_0], [max_pixels_%2]
%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    CLIPW %4, [pb_0], [max_pixels_%2]
%endif
%endif
%endmacro


; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
;                         uint8_t *_src, ptrdiff_t _srcstride,
;                         int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_PEL_PIXELS 2
HEVC_PEL_PIXELS     %1, %2
HEVC_UNI_PEL_PIXELS %1, %2
HEVC_BI_PEL_PIXELS  %1, %2
%endmacro

%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
    pxor             m2, m2
.loop:
    SIMPLE_LOAD      %1, %2, srcq, m0
    MC_PIXEL_COMPUTE %1, %2, 1
    PEL_10STORE%1    dstq, m0, m1
    LOOP_END         dst, src, srcstride
    RET
%endmacro

%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
.loop:
    SIMPLE_LOAD      %1, %2, srcq, m0
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq    ; dst += dststride
    add              srcq, srcstrideq    ; src += srcstride
    dec              heightd             ; cmp height
    jnz              .loop               ; height loop
    RET
%endmacro

%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor             m2, m2
    movdqa           m5, [pw_bi_%2]
.loop:
    SIMPLE_LOAD      %1, %2, srcq, m0
    SIMPLE_BILOAD    %1, src2q, m3, m4
    MC_PIXEL_COMPUTE %1, %2, 1
    BI_COMPUTE       %1, %2, m0, m1, m3, m4, m5, 1
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq      ; dst += dststride
    add              srcq, srcstrideq      ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec              heightd               ; cmp height
    jnz              .loop                 ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width);
; ******************************


%macro HEVC_PUT_HEVC_EPEL 2
%if cpuflag(avx2)
%define XMM_REGS  11
%else
%define XMM_REGS  8
%endif

cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER   %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD     %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE  %2, %1, m4, m5, 1
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END      dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    movdqa        m6, [pw_%2]
    EPEL_FILTER   %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD     %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE  %2, %1, m4, m5
    UNI_COMPUTE   %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq    ; dst += dststride
    add           srcq, srcstrideq    ; src += srcstride
    dec           heightd             ; cmp height
    jnz           .loop               ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa        m6, [pw_bi_%2]
    EPEL_FILTER   %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD     %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE  %2, %1, m4, m5, 1
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE    %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq      ; dst += dststride
    add           srcq, srcstrideq      ; src += srcstride
    add           src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec           heightd               ; cmp height
    jnz           .loop                 ; height loop
    RET

; ******************************
; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
    movifnidn     myd, mym
    sub           srcq, srcstrideq
    EPEL_FILTER   %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD     %2, srcq, srcstride, %1
    EPEL_COMPUTE  %2, %1, m4, m5, 1
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END      dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
    movifnidn     myd, mym
    movdqa        m6, [pw_%2]
    sub           srcq, srcstrideq
    EPEL_FILTER   %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD     %2, srcq, srcstride, %1
    EPEL_COMPUTE  %2, %1, m4, m5
    UNI_COMPUTE   %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq    ; dst += dststride
    add           srcq, srcstrideq    ; src += srcstride
    dec           heightd             ; cmp height
    jnz           .loop               ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
    movifnidn     myd, mym
    movdqa        m6, [pw_bi_%2]
    sub           srcq, srcstrideq
    EPEL_FILTER   %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD     %2, srcq, srcstride, %1
    EPEL_COMPUTE  %2, %1, m4, m5, 1
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE    %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq      ; dst += dststride
    add           srcq, srcstrideq      ; src += srcstride
    add           src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec           heightd               ; cmp height
    jnz           .loop                 ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************
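
; Rough outline of the hv kernels below: each loop iteration horizontally
; filters one new source row (EPEL_LOAD + EPEL_COMPUTE with the mx filter in
; m14/m15), keeps the last four filtered rows in m4-m7 (m8-m11 hold the high
; halves for the wide 8-bit cases), then runs the vertical filter on those
; intermediates via EPEL_COMPUTE 14 with the my filter in m12/m13, sliding
; the row window down by one line per iteration.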
%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16, dst, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub            srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m8, m1
%endif
    SWAP           m4, m0
    add            srcq, srcstrideq
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m9, m1
%endif
    SWAP           m5, m0
    add            srcq, srcstrideq
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m10, m1
%endif
    SWAP           m6, m0
    add            srcq, srcstrideq
.loop:
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m11, m1
%endif
    SWAP           m7, m0
    punpcklwd      m0, m4, m5
    punpcklwd      m2, m6, m7
%if %1 > 4
    punpckhwd      m1, m4, m5
    punpckhwd      m3, m6, m7
%endif
    EPEL_COMPUTE   14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd      m4, m8, m9
    punpcklwd      m2, m10, m11
    punpckhwd      m8, m8, m9
    punpckhwd      m3, m10, m11
    EPEL_COMPUTE   14, %1, m12, m13, m4, m2, m8, m3
%if cpuflag(avx2)
    vinserti128    m2, m0, xm4, 1
    vperm2i128     m3, m0, m4, q0301
    PEL_10STORE%1  dstq, m2, m3
%else
    PEL_10STORE%1  dstq, m0, m4
%endif
%else
    PEL_10STORE%1  dstq, m0, m1
%endif
    movdqa         m4, m5
    movdqa         m5, m6
    movdqa         m6, m7
%if (%1 > 8 && (%2 == 8))
    mova           m8, m9
    mova           m9, m10
    mova           m10, m11
%endif
    LOOP_END       dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16, dst, dststride, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub            srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m8, m1
%endif
    SWAP           m4, m0
    add            srcq, srcstrideq
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m9, m1
%endif
    SWAP           m5, m0
    add            srcq, srcstrideq
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m10, m1
%endif
    SWAP           m6, m0
    add            srcq, srcstrideq
.loop:
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m11, m1
%endif
    mova           m7, m0
    punpcklwd      m0, m4, m5
    punpcklwd      m2, m6, m7
%if %1 > 4
    punpckhwd      m1, m4, m5
    punpckhwd      m3, m6, m7
%endif
    EPEL_COMPUTE   14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd      m4, m8, m9
    punpcklwd      m2, m10, m11
    punpckhwd      m8, m8, m9
    punpckhwd      m3, m10, m11
    EPEL_COMPUTE   14, %1, m12, m13, m4, m2, m8, m3
    UNI_COMPUTE    %1, %2, m0, m4, [pw_%2]
%else
    UNI_COMPUTE    %1, %2, m0, m1, [pw_%2]
%endif
    PEL_%2STORE%1  dstq, m0, m1
    mova           m4, m5
    mova           m5, m6
    mova           m6, m7
%if (%1 > 8 && (%2 == 8))
    mova           m8, m9
    mova           m9, m10
    mova           m10, m11
%endif
    add            dstq, dststrideq    ; dst += dststride
    add            srcq, srcstrideq    ; src += srcstride
    dec            heightd             ; cmp height
    jnz            .loop               ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub            srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m8, m1
%endif
    SWAP           m4, m0
    add            srcq, srcstrideq
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m9, m1
%endif
    SWAP           m5, m0
    add            srcq, srcstrideq
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m10, m1
%endif
    SWAP           m6, m0
    add            srcq, srcstrideq
.loop:
    EPEL_LOAD      %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE   %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP           m11, m1
%endif
    SWAP           m7, m0
    punpcklwd      m0, m4, m5
    punpcklwd      m2, m6, m7
%if %1 > 4
    punpckhwd      m1, m4, m5
    punpckhwd      m3, m6, m7
%endif
    EPEL_COMPUTE   14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd      m4, m8, m9
    punpcklwd      m2, m10, m11
    punpckhwd      m8, m8, m9
    punpckhwd      m3, m10, m11
    EPEL_COMPUTE   14, %1, m12, m13, m4, m2, m8, m3
    SIMPLE_BILOAD  %1, src2q, m8, m3
%if cpuflag(avx2)
    vinserti128    m1, m8, xm3, 1
    vperm2i128     m2, m8, m3, q0301
    BI_COMPUTE     %1, %2, m0, m4, m1, m2, [pw_bi_%2]
%else
    BI_COMPUTE     %1, %2, m0, m4, m8, m3, [pw_bi_%2]
%endif
%else
    SIMPLE_BILOAD  %1, src2q, m8, m9
    BI_COMPUTE     %1, %2, m0, m1, m8, m9, [pw_bi_%2]
%endif
    PEL_%2STORE%1  dstq, m0, m4
    mova           m4, m5
    mova           m5, m6
    mova           m6, m7
%if (%1 > 8 && (%2 == 8))
    mova           m8, m9
    mova           m9, m10
    mova           m10, m11
%endif
    add            dstq, dststrideq      ; dst += dststride
    add            srcq, srcstrideq      ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec            heightd               ; cmp height
    jnz            .loop                 ; height loop
    RET
%endmacro

; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int height, int mx, int my, int width)
; ******************************

%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
    QPEL_FILTER   %2, mx
.loop:
    QPEL_H_LOAD   %2, srcq, %1, 10
    QPEL_COMPUTE  %1, %2, 1
%if %2 > 8
    packssdw      m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END      dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16, dst, dststride, src, srcstride, height, mx, rfilter
    mova          m9, [pw_%2]
    QPEL_FILTER   %2, mx
.loop:
    QPEL_H_LOAD   %2, srcq, %1, 10
    QPEL_COMPUTE  %1, %2
%if %2 > 8
    packssdw      m0, m1
%endif
    UNI_COMPUTE   %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq    ; dst += dststride
    add           srcq, srcstrideq    ; src += srcstride
    dec           heightd             ; cmp height
    jnz           .loop               ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa        m9, [pw_bi_%2]
    QPEL_FILTER   %2, mx
.loop:
    QPEL_H_LOAD   %2, srcq, %1, 10
    QPEL_COMPUTE  %1, %2, 1
%if %2 > 8
    packssdw      m0, m1
%endif
    SIMPLE_BILOAD %1, src2q, m10, m11
    BI_COMPUTE    %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq      ; dst += dststride
    add           srcq, srcstrideq      ; src += srcstride
    add           src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec           heightd               ; cmp height
    jnz           .loop                 ; height loop
    RET


; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
    movifnidn     myd, mym
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER   %2, my
.loop:
    QPEL_V_LOAD   %2, srcq, srcstride, %1, r7
    QPEL_COMPUTE  %1, %2, 1
%if %2 > 8
    packssdw      m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END      dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
    movifnidn     myd, mym
    movdqa        m9, [pw_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER   %2, my
.loop:
    QPEL_V_LOAD   %2, srcq, srcstride, %1, r8
    QPEL_COMPUTE  %1, %2
%if %2 > 8
    packssdw      m0, m1
%endif
    UNI_COMPUTE   %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq    ; dst += dststride
    add           srcq, srcstrideq    ; src += srcstride
    dec           heightd             ; cmp height
    jnz           .loop               ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    movifnidn     myd, mym
    movdqa        m9, [pw_bi_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER   %2, my
.loop:
    QPEL_V_LOAD   %2, srcq, srcstride, %1, r9
    QPEL_COMPUTE  %1, %2, 1
%if %2 > 8
    packssdw      m0, m1
%endif
    SIMPLE_BILOAD %1, src2q, m10, m11
    BI_COMPUTE    %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1 dstq, m0, m1
    add           dstq, dststrideq      ; dst += dststride
    add           srcq, srcstrideq      ; src += srcstride
    add           src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec           heightd               ; cmp height
    jnz           .loop                 ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
    sub             mxq, 1
    sub             myq, 1
    shl             mxq, %%shift    ; multiply by 8 (avx2: 16)
    shl             myq, %%shift    ; multiply by 8 (avx2: 16)
    lea             r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m15, m0
    punpcklwd       m0, m8, m9
    punpcklwd       m2, m10, m11
    punpcklwd       m4, m12, m13
    punpcklwd       m6, m14, m15
%if %1 > 4
    punpckhwd       m1, m8, m9
    punpckhwd       m3, m10, m11
    punpckhwd       m5, m12, m13
    punpckhwd       m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    PEL_10STORE%1   dstq, m0, m1
%if %1 <= 4
    movq            m8, m9
    movq            m9, m10
    movq            m10, m11
    movq            m11, m12
    movq            m12, m13
    movq            m13, m14
    movq            m14, m15
%else
    movdqa          m8, m9
    movdqa          m9, m10
    movdqa          m10, m11
    movdqa          m11, m12
    movdqa          m12, m13
    movdqa          m13, m14
    movdqa          m14, m15
%endif
    LOOP_END        dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
    sub             mxq, 1
    sub             myq, 1
    shl             mxq, %%shift    ; multiply by 8 (avx2: 16)
    shl             myq, %%shift    ; multiply by 8 (avx2: 16)
    lea             r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m15, m0
    punpcklwd       m0, m8, m9
    punpcklwd       m2, m10, m11
    punpcklwd       m4, m12, m13
    punpcklwd       m6, m14, m15
%if %1 > 4
    punpckhwd       m1, m8, m9
    punpckhwd       m3, m10, m11
    punpckhwd       m5, m12, m13
    punpckhwd       m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackusdw
    UNI_COMPUTE     %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1   dstq, m0, m1

%if %1 <= 4
    movq            m8, m9
    movq            m9, m10
    movq            m10, m11
    movq            m11, m12
    movq            m12, m13
    movq            m13, m14
    movq            m14, m15
%else
    mova            m8, m9
    mova            m9, m10
    mova            m10, m11
    mova            m11, m12
    mova            m12, m13
    mova            m13, m14
    mova            m14, m15
%endif
    add             dstq, dststrideq    ; dst += dststride
    add             srcq, srcstrideq    ; src += srcstride
    dec             heightd             ; cmp height
    jnz             .loop               ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
    sub             mxq, 1
    sub             myq, 1
    shl             mxq, %%shift    ; multiply by 8 (avx2: 16)
    shl             myq, %%shift    ; multiply by 8 (avx2: 16)
    lea             r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD     %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP            m15, m0
    punpcklwd       m0, m8, m9
    punpcklwd       m2, m10, m11
    punpcklwd       m4, m12, m13
    punpcklwd       m6, m14, m15
%if %1 > 4
    punpckhwd       m1, m8, m9
    punpckhwd       m3, m10, m11
    punpckhwd       m5, m12, m13
    punpckhwd       m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    SIMPLE_BILOAD   %1, src2q, m8, m9    ; m9 not used in this case
    BI_COMPUTE      %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1   dstq, m0, m1

%if %1 <= 4
    movq            m8, m9
    movq            m9, m10
    movq            m10, m11
    movq            m11, m12
    movq            m12, m13
    movq            m13, m14
    movq            m14, m15
%else
    movdqa          m8, m9
    movdqa          m9, m10
    movdqa          m10, m11
    movdqa          m11, m12
    movdqa          m12, m13
    movdqa          m13, m14
    movdqa          m14, m15
%endif
    add             dstq, dststrideq      ; dst += dststride
    add             srcq, srcstrideq      ; src += srcstride
    add             src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec             heightd               ; cmp height
    jnz             .loop                 ; height loop
    RET
%endmacro

%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
    mov             r4d, denomm
%define SHIFT r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
%define SHIFT denomd
%endif
    lea             SHIFT, [SHIFT+14-%2]    ; shift = 14 - bitd + denom
%if %1 <= 4
    pxor            m1, m1
%endif
    movd            m2, wxm      ; WX
    movd            m4, SHIFT    ; shift
%if %1 <= 4
    punpcklwd       m2, m1
%else
    punpcklwd       m2, m2
%endif
    dec             SHIFT
    movdqu          m5, [pd_1]
    movd            m6, SHIFT    ; shift-1
    pshufd          m2, m2, 0
    mov             SHIFT, oxm
    pslld           m5, m6
%if %2 != 8
    shl             SHIFT, %2-8    ; ox << (bitd - 8)
%endif
    movd            m3, SHIFT    ; OX
    pshufd          m3, m3, 0
%if WIN64 || ARCH_X86_32
    mov             SHIFT, heightm
%endif
.loop:
    SIMPLE_LOAD     %1, 10, srcq, m0
%if %1 <= 4
    punpcklwd       m0, m1
    pmaddwd         m0, m2
    paddd           m0, m5
    psrad           m0, m4
    paddd           m0, m3
%else
    pmulhw          m6, m0, m2
    pmullw          m0, m2
    punpckhwd       m1, m0, m6
    punpcklwd       m0, m6
    paddd           m0, m5
    paddd           m1, m5
    psrad           m0, m4
    psrad           m1, m4
    paddd           m0, m3
    paddd           m1, m3
%endif
    packssdw        m0, m1
%if %2 == 8
    packuswb        m0, m0
%else
    CLIPW           m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq      ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE   ; src += 2*MAX_PB_SIZE
    dec             heightd               ; cmp height
    jnz             .loop                 ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
    movifnidn       r5d, denomm
%if %1 <= 4
    pxor            m1, m1
%endif
    movd            m2, wx0m            ; WX0
    lea             r5d, [r5d+14-%2]    ; shift = 14 - bitd + denom
    movd            m3, wx1m            ; WX1
    movd            m0, r5d             ; shift
%if %1 <= 4
    punpcklwd       m2, m1
    punpcklwd       m3, m1
%else
    punpcklwd       m2, m2
    punpcklwd       m3, m3
%endif
    inc             r5d
    movd            m5, r5d             ; shift+1
    pshufd          m2, m2, 0
    mov             r5d, ox0m
    pshufd          m3, m3, 0
    add             r5d, ox1m
%if %2 != 8
    shl             r5d, %2-8           ; ox << (bitd - 8)
%endif
    inc             r5d
    movd            m4, r5d             ; offset
    pshufd          m4, m4, 0
%if UNIX64
%define h heightd
%else
    mov             r5d, heightm
%define h r5d
%endif
    pslld           m4, m0

.loop:
    SIMPLE_LOAD     %1, 10, srcq,  m0
    SIMPLE_LOAD     %1, 10, src2q, m8
%if %1 <= 4
    punpcklwd       m0, m1
    punpcklwd       m8, m1
    pmaddwd         m0, m3
    pmaddwd         m8, m2
    paddd           m0, m4
    paddd           m0, m8
    psrad           m0, m5
%else
    pmulhw          m6, m0, m3
    pmullw          m0, m3
    pmulhw          m7, m8, m2
    pmullw          m8, m2
    punpckhwd       m1, m0, m6
    punpcklwd       m0, m6
    punpckhwd       m9, m8, m7
    punpcklwd       m8, m7
    paddd           m0, m8
    paddd           m1, m9
    paddd           m0, m4
    paddd           m1, m4
    psrad           m0, m5
    psrad           m1, m5
%endif
    packssdw        m0, m1
%if %2 == 8
    packuswb        m0, m0
%else
    CLIPW           m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq      ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE   ; src  += 2*MAX_PB_SIZE
    add             src2q, 2*MAX_PB_SIZE  ; src2 += 2*MAX_PB_SIZE
    dec             h                     ; cmp height
    jnz             .loop                 ; height loop
    RET
%endmacro

INIT_XMM sse4    ; adds ff_ and _sse4 to function name

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL  2, 8
HEVC_PUT_HEVC_EPEL  4, 8
HEVC_PUT_HEVC_EPEL  6, 8
HEVC_PUT_HEVC_EPEL  8, 8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8


HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV  2, 8
HEVC_PUT_HEVC_EPEL_HV  4, 8
HEVC_PUT_HEVC_EPEL_HV  6, 8
HEVC_PUT_HEVC_EPEL_HV  8, 8
HEVC_PUT_HEVC_EPEL_HV 16, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL  4, 8
HEVC_PUT_HEVC_QPEL  8, 8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2    ; adds ff_ and _avx2 to function name & enables 256b registers: m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0

HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10

HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10

HEVC_PUT_HEVC_EPEL_HV 16, 10
HEVC_PUT_HEVC_EPEL_HV 32, 8

HEVC_PUT_HEVC_QPEL 32, 8

HEVC_PUT_HEVC_QPEL 16, 10

HEVC_PUT_HEVC_QPEL_HV 16, 10

%endif ;AVX2
%endif ; ARCH_X86_64

%macro QPEL_FILTER_H 5
%define %%table hevc_qpel_filters_avx512icl_h_%1
%assign %%offset 4
    dec             %2q
    shl             %2q, 3
%ifdef PIC
    lea             %5q, [%%table]
    %define FILTER %5q
%else
    %define FILTER %%table
%endif
    vpbroadcastd    m%3, [FILTER + %2q + 0*%%offset]
    vpbroadcastd    m%4, [FILTER + %2q + 1*%%offset]
%endmacro

%macro QPEL_FILTER_V 5
    vpbroadcastd    m%3, [%5 + %2q + 4*%4]
%endmacro

%macro QPEL_LOAD_SHUF 2
    movu            m%1, [pb_qpel_shuffle_index +  0]
    movu            m%2, [pb_qpel_shuffle_index + 64]
%endmacro

; required: m0-m5
; %1: dst register index
; %2: name for src
; %3: optional offset
%macro QPEL_H_LOAD_COMPUTE 2-3
%assign %%offset 0
%if %0 == 3
%assign %%offset %3
%endif
    pxor            m%1, m%1
%if mmsize == 64
    movu            ym4, [%2q + %%offset - 3]
%else
    movu            xm4, [%2q + %%offset - 3]
%endif
    vpermb          m5, m2, m4
    vpermb          m4, m3, m4
    vpdpbusd        m%1, m5, m0
    vpdpbusd        m%1, m4, m1
%endmacro

%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
    QPEL_FILTER_H   %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF  2, 3
.loop:
    QPEL_H_LOAD_COMPUTE 6, src
%if %1 == 4
    vpmovdw         xm6, m6
    movq            [dstq], xm6
%else
    vpmovdw         [dstq], m6
%endif
%if %1 > 16
    QPEL_H_LOAD_COMPUTE 7, src, 16
    vpmovdw         [dstq + 32], m7
%endif
%if %1 > 32
    QPEL_H_LOAD_COMPUTE 6, src, 32
    QPEL_H_LOAD_COMPUTE 7, src, 48
    vpmovdw         [dstq + 64], m6
    vpmovdw         [dstq + 96], m7
%endif
    LOOP_END        dst, src, srcstride
    RET
%endmacro

%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
%assign %%shift 6
%assign %%extra 7
    QPEL_FILTER_H   %1, mx, 0, 1, tmp
    QPEL_LOAD_SHUF  2, 3
    lea             tmpq, [srcstrideq*3]
    sub             srcq, tmpq
    sub             myq, 1
    shl             myq, 5
%define %%table hevc_qpel_filters_avx512icl_v_%1
%ifdef PIC
    lea             tmpq, [%%table]
    %define FILTER tmpq
%else
    %define FILTER %%table
%endif
%assign %%i 6
%assign %%j 0
%rep %1
    QPEL_FILTER_V   %1, my, %%i, %%j, FILTER
    %assign %%i %%i+1
    %assign %%j %%j+1
%endrep
%rep %%extra
    QPEL_H_LOAD_COMPUTE %%i, src
    add             srcq, srcstrideq
%assign %%i %%i+1
%endrep
.loop:
    QPEL_H_LOAD_COMPUTE %%i, src
    vpmulld         m22, m14, m6
    vpmulld         m23, m15, m7
    vpmulld         m24, m16, m8
    vpmulld         m25, m17, m9
    vpaddd          m26, m22, m23
    vpaddd          m24, m25
    vpaddd          m26, m24
    vpmulld         m22, m18, m10
    vpmulld         m23, m19, m11
    vpmulld         m24, m20, m12
    vpmulld         m25, m21, m13
    vpaddd          m22, m22, m23
    vpaddd          m24, m25
    vpaddd          m26, m24
    vpaddd          m22, m26
    mova            m14, m15
    mova            m15, m16
    mova            m16, m17
    mova            m17, m18
    mova            m18, m19
    mova            m19, m20
    mova            m20, m21
    vpsrad          m22, %%shift
    vpmovdw         [dstq], m22
    LOOP_END        dst, src, srcstride

    RET
%endmacro

%if ARCH_X86_64
%if HAVE_AVX512ICL_EXTERNAL

INIT_XMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL 4, 8

INIT_YMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8

INIT_ZMM avx512icl
HEVC_PUT_HEVC_QPEL_AVX512ICL 16, 8
HEVC_PUT_HEVC_QPEL_AVX512ICL 32, 8
HEVC_PUT_HEVC_QPEL_AVX512ICL 64, 8

%endif
%endif