;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4
cextern pw_1023
%define pw_pixel_max pw_1023
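
; Both helpers below compute |%1-%2| without a dedicated absolute-difference
; instruction: psubusw saturates negative differences to zero, so OR-ing the
; two one-sided saturated subtractions yields the absolute difference before
; the threshold is subtracted or compared.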
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro
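
; LOAD_TC splats the per-partition int8 tc0 values to words. Duplicating each
; byte into both halves of a word and arithmetic-shifting right by 6 gives
; (tc0*257)>>6, which for the small non-negative table values works out to
; tc0<<2 (the 10-bit scaling, matching the "shl r2d/r3d, 2" applied to alpha
; and beta in the callers), while the "no filtering" marker -1 stays -1.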
; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9
    pcmpgtw     %7, %8
%endmacro
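
; For reference, a sketch of the arithmetic implemented by DEBLOCK_P0_Q0
; below (the normal, bS<4 H.264 edge update on 10-bit samples):
;   delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
;   p0'   = clip(p0 + delta, 0, 1023)
;   q0'   = clip(q0 - delta, 0, 1023)
; %5 is expected to hold tc already zeroed where the alpha/beta tests fail.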
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro
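
; LUMA_Q1 below applies the p1/q1 update of the normal (bS<4) filter; with
; x2 = p2 (or q2) and x1 = p1 (or q1), roughly:
;   x1' = x1 + clip3(-tc0, tc0, ((x2 + ((p0 + q0 + 1) >> 1)) >> 1) - x1)
; where %5 carries the (10-bit scaled) tc0 already masked by the caller.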
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw   %6, %3, %4      ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5
    psubw   %1, %2
    CLIPW   %1, %6, %5
    paddw   %1, %2
%endmacro

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    pxor    m6, m6
    mova    %3, m4
    pcmpgtw m6, tcm
    pand    m4, tcm
    pandn   m6, m7
    pand    m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq    [r0-4], m0
    movq    [r0+r1-4], m1
    movq    [r0+r1*2-4], m2
    movq    [r0+%2-4], m3
%else
    movq    [r0-4], m0
    movhps  [r0+r1-4], m0
    movq    [r0+r1*2-4], m1
    movhps  [%1-4], m1
    movq    [%1+r1-4], m2
    movhps  [%1+r1*2-4], m2
    movq    [%1+%2-4], m3
    movhps  [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m4, m5, r2d, r3d
    mov        r3, 32/mmsize
    mov        r2, r0
    sub        r0, r1
    mova       am, m4
    sub        r0, r1
    mova       bm, m5
    sub        r0, r1
.loop:
    mova       m0, [r0+r1]
    mova       m1, [r0+r1*2]
    mova       m2, [r2]
    mova       m3, [r2+r1]

    LOAD_MASK  m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC    m6, r4
    mova       tcm, m6

    mova       m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       [r0+r1], m5

    mova       m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       [r2+r1], m5

    pxor       m5, m5
    mova       m6, tcm
    pcmpgtw    m5, tcm
    psubw      m6, ms1
    pandn      m5, m7
    psubw      m6, ms2
    pand       m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova       [r0+r1*2], m1
    mova       [r2], m2

    add        r0, mmsize
    add        r2, mmsize
    add        r4, mmsize/8
    dec        r3
    jg .loop
    ADD        rsp, pad
    RET

cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m4, m5, r2d, r3d
    mov        r3, r1
    mova       am, m4
    add        r3, r1
    mov        r5, 32/mmsize
    mova       bm, m5
    add        r3, r1
%if mmsize == 16
    mov        r2, r0
    add        r2, r3
%endif
.loop:
%if mmsize == 8
    movq       m2, [r0-8]     ; y q2 q1 q0
    movq       m7, [r0+0]
    movq       m5, [r0+r1-8]
    movq       m3, [r0+r1+0]
    movq       m0, [r0+r1*2-8]
    movq       m6, [r0+r1*2+0]
    movq       m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP       2, 7
    movq       m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu       m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu       m0, [r0+r1-8]
    movu       m2, [r0+r1*2-8]
    movu       m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu       m4, [r2+r1-8]
    movu       m1, [r2+r1*2-8]
    movu       m3, [r2+r3-8]
    movu       m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova       m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK  m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC    m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova       m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor       m5, m5
    mova       m6, tcm
    pcmpgtw    m5, tcm
    psubw      m6, ms1
    pandn      m5, m7
    psubw      m6, ms2
    pand       m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova       m0, p1m
    mova       m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add        r4, mmsize/8
    lea        r0, [r0+r1*(mmsize/2)]
    lea        r2, [r2+r1*(mmsize/2)]
    dec        r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4
    DIFF_LT     m9, m2, m13, m11, m4
    pand        m6, m7

    mova        m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4
    pand        m6, m14

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1     m8, m0, m1, m2, m5, m4

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1     m9, m3, m1, m2, m5, m4

    pxor        m4, m4
    psubw       m6, m10
    pcmpgtw     m4, m14
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP 0, 8
    SWAP 3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r0
    sub     r0, r1
    sub     r0, r1
    sub     r0, r1
    mov     r3, 2
.loop:
    mova    p2, [r0]
    mova    p1, [r0+r1]
    mova    p0, [r0+r1*2]
    mova    q0, [r2]
    mova    q1, [r2+r1]
    mova    q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova    [r0+r1], p1
    mova    [r0+r1*2], p0
    mova    [r2], q0
    mova    [r2+r1], q1
    add     r0, mmsize
    add     r2, mmsize
    add     r4, 2
    dec     r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10, 5,7,15
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r1
    add     r2, r1
    add     r2, r1
    mov     r5, r0
    add     r5, r2
    mov     r6, 2
.loop:
    movu    m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m9, [r5-8]
    movu    m5, [r5+r1-8]
    movu    m1, [r5+r1*2-8]
    movu    m3, [r5+r2-8]
    movu    m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add     r4, 2
    lea     r0, [r0+r1*8]
    lea     r5, [r5+r1*8]
    dec     r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro
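
; LUMA_INTRA_P012 below computes the strong (intra, bS=4) luma filter;
; roughly, where the strong condition (mask1p) holds:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; and where only the basic alpha/beta mask (mask0) holds, p0 falls back to
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
; The same macro is reused for the q side with p and q swapped.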
; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8
    pand      t1, %8
    paddw     t2, %3
    paddw     t1, %2
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB   t0, t1, r2d, r3d
    mova      %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova      %2, t0        ; mask0
    psrlw     t3, %1, 2
%else
    mova      t3, %1
    mova      %2, t0        ; mask0
    psrlw     t3, 2
%endif
    paddw     t3, [pw_2]    ; alpha/4+2
    DIFF_LT   m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand      t2, %2
    mova      t3, %5        ; q2
    mova      %1, t2        ; mask1
    DIFF_LT   t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand      t2, %1
    mova      t3, %4        ; p2
    mova      %3, t2        ; mask1q
    DIFF_LT   t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand      t2, %1
    mova      %1, t2        ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu     t0, [r0-8]
    movu     t1, [r0+r1-8]
    movu     m0, [r0+r1*2-8]
    movu     m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova     t4, t0        ; p3
    mova     t5, t1        ; p2

    movu     m2, [r0]
    movu     m3, [r0+r1]
    movu     t0, [r0+r1*2]
    movu     t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova     t6, t0        ; q2
    mova     t7, t1        ; q3
%else
    movu     t0, [r0-8]
    movu     t1, [r0+r1-8]
    movu     m0, [r0+r1*2-8]
    movu     m1, [r0+r5-8]
    movu     m2, [r4-8]
    movu     m3, [r4+r1-8]
    movu     t2, [r4+r1*2-8]
    movu     t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova     t4, t0        ; p3
    mova     t5, t1        ; p2
    mova     t6, t2        ; q2
    mova     t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3]            ; 3*stride
    neg     r4
    add     r4, r0                ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0                ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0    ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0    ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0    ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3]            ; 3*stride
    add     r4, r0                ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0                ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0    ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0    ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0    ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova    spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl     r2d, 2
    shl     r3d, 2
.loop:
    mova    m0, [r4+r1*2]     ; p1
    mova    m1, [r4+r5]       ; p0
    mova    m2, [r0]          ; q0
    mova    m3, [r0+r1]       ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2]     ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3]        ; 3*stride
    add     r4, r0            ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl     r2d, 2
    shl     r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6            ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif
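
; CHROMA_DEBLOCK_P0_Q0_INTRA below applies the chroma intra (bS=4) update
; where the mask is set, i.e. roughly:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2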
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro

%macro CHROMA_V_LOAD 1
    mova    m0, [r0]        ; p1
    mova    m1, [r0+r1]     ; p0
    mova    m2, [%1]        ; q0
    mova    m3, [%1+r1]     ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq       m0, %1
    movq       m2, %2
    movq       m1, %3
    movq       m3, %4

    punpcklwd  m0, m2
    punpcklwd  m1, m3
    punpckhdq  m2, m0, m1
    punpckldq  m0, m1

    movq       m4, %5
    movq       m6, %6
    movq       m5, %7
    movq       m3, %8

    punpcklwd  m4, m6
    punpcklwd  m5, m3
    punpckhdq  m6, m4, m5
    punpckldq  m4, m5

    punpckhqdq m1, m0, m4
    punpcklqdq m0, m4
    punpckhqdq m3, m2, m6
    punpcklqdq m2, m6
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
%macro TRANSPOSE8x4W_STORE 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq       %1, m0
    movhps     %2, m0
    movq       %3, m1
    movhps     %4, m1
    movq       %5, m2
    movhps     %6, m2
    movq       %7, m3
    movhps     %8, m3
%endmacro

; %1 = base + 3*stride
; %2 = 3*stride (unused on mmx)
; %3, %4 = place to store p1 and q1 values
%macro CHROMA_H_LOAD 4
    %if mmsize == 8
        movq m0, [pix_q - 4]
        movq m1, [pix_q + stride_q - 4]
        movq m2, [pix_q + 2*stride_q - 4]
        movq m3, [%1 - 4]
        TRANSPOSE4x4W 0, 1, 2, 3, 4
    %else
        TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
    %endif
    mova %3, m0
    mova %4, m3
%endmacro

; %1 = base + 3*stride
; %2 = 3*stride (unused on mmx)
; %3, %4 = place to load p1 and q1 values
%macro CHROMA_H_STORE 4
    mova m0, %3
    mova m3, %4
    %if mmsize == 8
        TRANSPOSE4x4W 0, 1, 2, 3, 4
        movq [pix_q - 4], m0
        movq [pix_q + stride_q - 4], m1
        movq [pix_q + 2*stride_q - 4], m2
        movq [%1 - 4], m3
    %else
        TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
    %endif
%endmacro

%macro CHROMA_V_LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
    punpcklwd   %1, %1
    psraw       %1, 6
%endmacro

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    mov     r5, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
    CHROMA_V_LOAD r5
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor    m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw   m6, [pw_3]
    pmaxsw  m6, m4
    pand    m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
;                                   int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    mov     r4, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
    CHROMA_V_LOAD r4
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, tc0_
    shl alpha_d, 2
    shl beta_d,  2
    mov r5, pix_q
    lea r6, [3*stride_q]
    add r5, r6

    CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
    LOAD_AB m4, m5, alpha_d, beta_d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor m4, m4
    CHROMA_V_LOAD_TC m6, tc0_q
    psubw m6, [pw_3]
    pmaxsw m6, m4
    pand m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]

RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
;                                int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma422_10, 5, 7, 8, 0-3*mmsize, pix_, stride_, alpha_, beta_, tc0_
    shl alpha_d, 2
    shl beta_d,  2

    movd m0, [tc0_q]
    punpcklbw m0, m0
    psraw m0, 6
    movq [rsp], m0

    mov r5, pix_q
    lea r6, [3*stride_q]
    add r5, r6

    mov r4, -8
    .loop:

    CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
    LOAD_AB m4, m5, alpha_d, beta_d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor m4, m4
    movd m6, [rsp + r4 + 8]
    punpcklwd m6, m6
    punpcklwd m6, m6
    psubw m6, [pw_3]
    pmaxsw m6, m4
    pand m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]

    lea pix_q, [pix_q + (mmsize/2)*stride_q]
    lea r5, [r5 + (mmsize/2)*stride_q]
    add r4, (mmsize/4)
    jl .loop
RET

%endmacro

INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif