;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max_10 pw_1023
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_m2:           times 8 dw -2
pd_1 :           times 4 dd  1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; Load 8 rows of 4 bytes, transpose, and widen to words.
; in:  8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
; clobbers: m4, m5, m6
%macro TRANSPOSE4x8B_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4

    punpcklbw  m0, m2
    punpcklbw  m1, m3
    punpcklwd  m0, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m3, %8

    punpcklbw  m4, m6
    punpcklbw  m5, m3
    punpcklwd  m4, m5

    punpckhdq  m2, m0, m4
    punpckldq  m0, m4

    pxor       m5, m5              ; zero register used to widen bytes to words
    punpckhbw  m1, m0, m5
    punpcklbw  m0, m5
    punpckhbw  m3, m2, m5
    punpcklbw  m2, m5
%endmacro

; Pack words back to bytes (unsigned saturation), transpose, and store.
; in:  4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m0, m1, m2 (m2 is the SBUTTERFLY temporary)
%macro TRANSPOSE8x4B_STORE 8
    packuswb   m0, m2
    packuswb   m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    ; pshufd 0x39 rotates the four dwords down by one, so each movd
    ; stores the next dword of the register.
    movd       %1, m0
    pshufd     m0, m0, 0x39
    movd       %2, m0
    pshufd     m0, m0, 0x39
    movd       %3, m0
    pshufd     m0, m0, 0x39
    movd       %4, m0

    movd       %5, m1
    pshufd     m1, m1, 0x39
    movd       %6, m1
    pshufd     m1, m1, 0x39
    movd       %7, m1
    pshufd     m1, m1, 0x39
    movd       %8, m1
%endmacro

; Load 8 rows of 4 words and transpose.
; in:  8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
; clobbers: m4, m5, m6
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6

%endmacro

; Transpose, clip, and store 4 rows of 8 words as 8 rows of 4 words.
; in:  4 rows of 8 words in m0..m3
;      %9 = third CLIPW operand (callers pass a pixel-max constant);
;           the second CLIPW operand is an all-zero register
; out: 8 rows of 4 words in %1..%8
; clobbers: m4 (transpose temporary), m5 (zero)
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, %9
    CLIPW            m1, m5, %9
    CLIPW            m2, m5, %9
    CLIPW            m3, m5, %9

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; Load 8 rows of 8 bytes, transpose, and widen to words.
; in:  8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
; clobbers: m9, m13 (zero), m15
%macro TRANSPOSE8x8B_LOAD 8
    movq       m7, %1
    movq       m2, %2
    movq       m1, %3
    movq       m3, %4

    punpcklbw  m7, m2
    punpcklbw  m1, m3
    punpcklwd  m3, m7, m1
    punpckhwd  m7, m1

    movq       m4, %5
    movq       m6, %6
    movq       m5, %7
    movq      m15, %8

    punpcklbw  m4, m6
    punpcklbw  m5, m15
    punpcklwd  m9, m4, m5
    punpckhwd  m4, m5

    punpckldq  m1, m3, m9;  0, 1
    punpckhdq  m3, m9;  2, 3

    punpckldq  m5, m7, m4;  4, 5
    punpckhdq  m7, m4;  6, 7

    pxor      m13, m13

    punpcklbw  m0, m1, m13; 0 in 16 bit
    punpckhbw  m1, m13; 1 in 16 bit

    punpcklbw  m2, m3, m13; 2
    punpckhbw  m3, m13; 3

    punpcklbw  m4, m5, m13; 4
    punpckhbw  m5, m13; 5

    punpcklbw  m6, m7, m13; 6
    punpckhbw  m7, m13; 7
%endmacro


; Pack words to bytes (unsigned saturation), transpose, and store.
; in:  8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
; clobbers: m0..m4 (m4 is the TRANSPOSE2x4x4B temporary)
%macro TRANSPOSE8x8B_STORE 8
    packuswb   m0, m4
    packuswb   m1, m5
    packuswb   m2, m6
    packuswb   m3, m7
    TRANSPOSE2x4x4B 0, 1, 2, 3, 4

    movq       %1, m0
    movhps     %2, m0
    movq       %3, m1
    movhps     %4, m1
    movq       %5, m2
    movhps     %6, m2
    movq       %7, m3
    movhps     %8, m3
%endmacro

; Load 8 rows of 8 words (unaligned) and transpose.
; in:  8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
; clobbers: m8 (TRANSPOSE8x8W temporary)
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; Transpose, clip, and store (unaligned) 8 rows of 8 words.
; in:  8 rows of 8 words in m0..m7
;      %9 = third CLIPW operand (callers pass a pixel-max constant);
;           the second CLIPW operand is an all-zero register
; out: 8 rows of 8 words in %1..%8
; clobbers: m8 (transpose temporary, then zero)
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, %9
    CLIPW            m1, m8, %9
    CLIPW            m2, m8, %9
    CLIPW            m3, m8, %9
    CLIPW            m4, m8, %9
    CLIPW            m5, m8, %9
    CLIPW            m6, m8, %9
    CLIPW            m7, m8, %9

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; Bitwise select: %1 = (%2 & mask) | (%1 & ~mask)
; in:  %2 = new value (clobbered)
; out: %1
; mask in m11 (preserved)
; clobbers m10
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and ~mask
    por              %2, m10
    mova             %1, %2
%endmacro

; Bitwise select with an explicit mask register:
; %1 = (%2 & %3) | (%1 & ~%3)
; in:  %2 = new value (clobbered)
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and ~mask
    por              %2, %3
    mova             %1, %2
%endmacro

ALIGN 16
; input in m0 ... m3 and tcs in r2.
; Output in m1 and m2
;
; CHROMA_DEBLOCK_BODY bit_depth
;   %1 = bit depth; for depths above 8 the tc bounds are shifted left
;        by (%1 - 8)
;   m0 = p1, m1 = p0, m2 = q0, m3 = q1, one word per sample
;   [tcq] holds two dwords tc0, tc1; the low word of each is replicated
;        so tc0 fills the low four word lanes and tc1 the high four
;   delta = clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc)
;   m1 = p0 + delta, m2 = q0 - delta
; clobbers: m4, m5, m6
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movq             m6, [tcq]; tc0
    punpcklwd        m6, m6
    pshufd           m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw           m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

%if %1 > 8
    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro

; input in m0 ... m7, beta in r2 tcs in r3.
; Output in m1...m6
;
; LUMA_DEBLOCK_BODY bit_depth, direction
;   %1 = bit depth; beta and tc are shifted left by (%1 - 8) when %1 > 8
;   %2 = direction tag passed by callers; not referenced in this body
;   m0..m7 = p3, p2, p1, p0, q0, q1, q2, q3, one word per sample
;   betaq  = beta (modified in place)
;   tcq    = pointer to two dwords tc0, tc1; tcq is r3 in the callers
;            visible in this file, and r3 is overwritten with the scaled
;            tc1, so the pointer is not valid afterwards
; GPRs used as scratch: r6, r7, r8, r9, r10, r11, r13 (and r3, betaq)
; XMM used as scratch:  m8..m15
; The including function must define two local labels:
;   .store       - reached when filtering is finished and m1..m6 are to
;                  be written back (the macro also falls through to the
;                  code following it)
;   .bypassluma  - reached when nothing needs filtering or tc0+tc1 == 0
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl             betaq, %1 - 8
%endif
    movd            m13, betad
    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    test            r13, r13
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps        r6, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    ; r7d/r8d and r9d/r10d keep the dp and dq sums for the weak path
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    pshufd           m8, m8, 0x4E
    movd            r8d, m8

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    ; end calc for weak filter

    ; filtering mask
    ; shufps with imm 0 yields dwords {m11[0], m11[0], m15[0], m15[0]};
    ; comparing with pd_1 turns each 0/1 flag into a full-lane mask
    mov             r11, r13
    shr             r11, 3
    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl             r11, 1
    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov            r11d, [tcq];
%if %1 > 8
    shl             r11, %1 - 8
%endif
    movd             m8, r11d; tc0
    mov             r3d, [tcq+4];
%if %1 > 8
    shl              r3, %1 - 8
%endif
    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma
    movd             m9, r3d; tc1
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps        r11, m13;
    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps        r11, m8;
    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov             r11, r6;
    shr             r11, 1;
    and             r6, r11; strong mask, bits 2 and 0

    ; m14 = -2*tc and m9 = 2*tc are set up before the branch below
    ; because the weak path also starts from these values
    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9

    and             r6, 5; 0b101
    mov             r11, r6; strong mask
    shr             r6, 2;
    movd            m12, r6d; store to xmm for mask generation
    shl             r6, 1
    and             r11, 1
    movd            m10, r11d; store to xmm for mask generation
    or              r6, r11; final strong mask, bits 1 and 0
    jz      .weakfilter

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    paddw            m8, m8;   2*p3 + 2*p2
    paddw            m8, m1;   2*p3 + 3*p2
    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
    paddw           m13, m13; back to 4 in every cell
    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;         p0 +   q0
    paddw            m8, m5;         p0 +   q0 +   q1
    paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;   p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;   p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 +  q2 + 2
    paddw           m13, m13;   2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
    not             r6; strong mask -> weak mask
    and             r6, r13; final weak filtering mask, bits 0 and 1
    jz             .store

    ; weak filtering mask
    mov             r11, r6
    shr             r11, 1
    movd            m12, r11d
    and             r6, 1
    movd            m11, r6d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, betaq
    shr             r13, 1;
    add             betaq, r13
    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)


    ; m9 holds 2 * tc at this point, so (m9 << 2) + m9 = 10 * tc
    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw          m14, m9, [pw_m1]; -tc / 2
%else
    pmullw          m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, betad
    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
; LOOP_FILTER_CHROMA emits the six chroma deblocking entry points
; (vertical and horizontal edge, 8/10/12 bit) for the instruction set
; selected by the preceding INIT_XMM.
;
; Vertical-edge variants: pix is moved back by two samples (2 bytes at
; 8 bit, 4 bytes at 10/12 bit), eight rows are loaded and transposed
; into m0..m3, filtered, then transposed back and stored. pix0 is the
; first row and pix is advanced by 3*stride; both are handed to
; PASS8ROWS (defined outside this file) to form the row operands.
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
; Horizontal-edge variants: rows p1, p0 (at pix - 2*stride and
; pix - stride) and q0, q1 (at pix and pix + stride) are loaded directly;
; only p0 and q0 are written back.
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];    p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];    q0
    movq             m3, [pixq+strideq]; q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY  8
    packuswb         m1, m2          ; low half = p0', high half = q0'
    movh [pix0q+strideq], m1
    movhps       [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_10]
    CLIPW           m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_12]
    CLIPW           m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
; Note on register names in the three functions below: pix0q holds
; 3*stride and src3strideq holds the address of the first row, i.e. the
; names are swapped relative to their contents.
; .store and .bypassluma are the labels LUMA_DEBLOCK_BODY jumps to.
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 4
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc,
uint8_t *_no_p, uint8_t *_no_q); 764cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 765cabdff1aSopenharmony_cicglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride 766cabdff1aSopenharmony_ci lea src3strideq, [3 * strideq] 767cabdff1aSopenharmony_ci mov pix0q, pixq 768cabdff1aSopenharmony_ci sub pix0q, src3strideq 769cabdff1aSopenharmony_ci sub pix0q, strideq 770cabdff1aSopenharmony_ci movq m0, [pix0q]; p3 771cabdff1aSopenharmony_ci movq m1, [pix0q + strideq]; p2 772cabdff1aSopenharmony_ci movq m2, [pix0q + 2 * strideq]; p1 773cabdff1aSopenharmony_ci movq m3, [pix0q + src3strideq]; p0 774cabdff1aSopenharmony_ci movq m4, [pixq]; q0 775cabdff1aSopenharmony_ci movq m5, [pixq + strideq]; q1 776cabdff1aSopenharmony_ci movq m6, [pixq + 2 * strideq]; q2 777cabdff1aSopenharmony_ci movq m7, [pixq + src3strideq]; q3 778cabdff1aSopenharmony_ci pxor m8, m8 779cabdff1aSopenharmony_ci punpcklbw m0, m8 780cabdff1aSopenharmony_ci punpcklbw m1, m8 781cabdff1aSopenharmony_ci punpcklbw m2, m8 782cabdff1aSopenharmony_ci punpcklbw m3, m8 783cabdff1aSopenharmony_ci punpcklbw m4, m8 784cabdff1aSopenharmony_ci punpcklbw m5, m8 785cabdff1aSopenharmony_ci punpcklbw m6, m8 786cabdff1aSopenharmony_ci punpcklbw m7, m8 787cabdff1aSopenharmony_ci LUMA_DEBLOCK_BODY 8, h 788cabdff1aSopenharmony_ci.store: 789cabdff1aSopenharmony_ci packuswb m1, m2 790cabdff1aSopenharmony_ci packuswb m3, m4 791cabdff1aSopenharmony_ci packuswb m5, m6 792cabdff1aSopenharmony_ci movh [pix0q + strideq], m1 793cabdff1aSopenharmony_ci movhps [pix0q + 2 * strideq], m1 794cabdff1aSopenharmony_ci movh [pix0q + src3strideq], m3 795cabdff1aSopenharmony_ci movhps [pixq ], m3 796cabdff1aSopenharmony_ci movh [pixq + strideq], m5 797cabdff1aSopenharmony_ci movhps [pixq + 2 * strideq], m5 798cabdff1aSopenharmony_ci.bypassluma: 799cabdff1aSopenharmony_ci RET 800cabdff1aSopenharmony_ci 801cabdff1aSopenharmony_cicglobal hevc_h_loop_filter_luma_10, 4, 
14, 16, pix, stride, beta, tc, pix0, src3stride 802cabdff1aSopenharmony_ci lea src3strideq, [3 * strideq] 803cabdff1aSopenharmony_ci mov pix0q, pixq 804cabdff1aSopenharmony_ci sub pix0q, src3strideq 805cabdff1aSopenharmony_ci sub pix0q, strideq 806cabdff1aSopenharmony_ci movdqu m0, [pix0q]; p3 807cabdff1aSopenharmony_ci movdqu m1, [pix0q + strideq]; p2 808cabdff1aSopenharmony_ci movdqu m2, [pix0q + 2 * strideq]; p1 809cabdff1aSopenharmony_ci movdqu m3, [pix0q + src3strideq]; p0 810cabdff1aSopenharmony_ci movdqu m4, [pixq]; q0 811cabdff1aSopenharmony_ci movdqu m5, [pixq + strideq]; q1 812cabdff1aSopenharmony_ci movdqu m6, [pixq + 2 * strideq]; q2 813cabdff1aSopenharmony_ci movdqu m7, [pixq + src3strideq]; q3 814cabdff1aSopenharmony_ci LUMA_DEBLOCK_BODY 10, h 815cabdff1aSopenharmony_ci.store: 816cabdff1aSopenharmony_ci pxor m8, m8; zeros reg 817cabdff1aSopenharmony_ci CLIPW m1, m8, [pw_pixel_max_10] 818cabdff1aSopenharmony_ci CLIPW m2, m8, [pw_pixel_max_10] 819cabdff1aSopenharmony_ci CLIPW m3, m8, [pw_pixel_max_10] 820cabdff1aSopenharmony_ci CLIPW m4, m8, [pw_pixel_max_10] 821cabdff1aSopenharmony_ci CLIPW m5, m8, [pw_pixel_max_10] 822cabdff1aSopenharmony_ci CLIPW m6, m8, [pw_pixel_max_10] 823cabdff1aSopenharmony_ci movdqu [pix0q + strideq], m1; p2 824cabdff1aSopenharmony_ci movdqu [pix0q + 2 * strideq], m2; p1 825cabdff1aSopenharmony_ci movdqu [pix0q + src3strideq], m3; p0 826cabdff1aSopenharmony_ci movdqu [pixq ], m4; q0 827cabdff1aSopenharmony_ci movdqu [pixq + strideq], m5; q1 828cabdff1aSopenharmony_ci movdqu [pixq + 2 * strideq], m6; q2 829cabdff1aSopenharmony_ci.bypassluma: 830cabdff1aSopenharmony_ci RET 831cabdff1aSopenharmony_ci 832cabdff1aSopenharmony_cicglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride 833cabdff1aSopenharmony_ci lea src3strideq, [3 * strideq] 834cabdff1aSopenharmony_ci mov pix0q, pixq 835cabdff1aSopenharmony_ci sub pix0q, src3strideq 836cabdff1aSopenharmony_ci sub pix0q, strideq 837cabdff1aSopenharmony_ci 
movdqu m0, [pix0q]; p3 838cabdff1aSopenharmony_ci movdqu m1, [pix0q + strideq]; p2 839cabdff1aSopenharmony_ci movdqu m2, [pix0q + 2 * strideq]; p1 840cabdff1aSopenharmony_ci movdqu m3, [pix0q + src3strideq]; p0 841cabdff1aSopenharmony_ci movdqu m4, [pixq]; q0 842cabdff1aSopenharmony_ci movdqu m5, [pixq + strideq]; q1 843cabdff1aSopenharmony_ci movdqu m6, [pixq + 2 * strideq]; q2 844cabdff1aSopenharmony_ci movdqu m7, [pixq + src3strideq]; q3 845cabdff1aSopenharmony_ci LUMA_DEBLOCK_BODY 12, h 846cabdff1aSopenharmony_ci.store: 847cabdff1aSopenharmony_ci pxor m8, m8; zeros reg 848cabdff1aSopenharmony_ci CLIPW m1, m8, [pw_pixel_max_12] 849cabdff1aSopenharmony_ci CLIPW m2, m8, [pw_pixel_max_12] 850cabdff1aSopenharmony_ci CLIPW m3, m8, [pw_pixel_max_12] 851cabdff1aSopenharmony_ci CLIPW m4, m8, [pw_pixel_max_12] 852cabdff1aSopenharmony_ci CLIPW m5, m8, [pw_pixel_max_12] 853cabdff1aSopenharmony_ci CLIPW m6, m8, [pw_pixel_max_12] 854cabdff1aSopenharmony_ci movdqu [pix0q + strideq], m1; p2 855cabdff1aSopenharmony_ci movdqu [pix0q + 2 * strideq], m2; p1 856cabdff1aSopenharmony_ci movdqu [pix0q + src3strideq], m3; p0 857cabdff1aSopenharmony_ci movdqu [pixq ], m4; q0 858cabdff1aSopenharmony_ci movdqu [pixq + strideq], m5; q1 859cabdff1aSopenharmony_ci movdqu [pixq + 2 * strideq], m6; q2 860cabdff1aSopenharmony_ci.bypassluma: 861cabdff1aSopenharmony_ci RET 862cabdff1aSopenharmony_ci 863cabdff1aSopenharmony_ci%endmacro 864cabdff1aSopenharmony_ci 865cabdff1aSopenharmony_ciINIT_XMM sse2 866cabdff1aSopenharmony_ciLOOP_FILTER_LUMA 867cabdff1aSopenharmony_ciINIT_XMM ssse3 868cabdff1aSopenharmony_ciLOOP_FILTER_LUMA 869cabdff1aSopenharmony_ciINIT_XMM avx 870cabdff1aSopenharmony_ciLOOP_FILTER_LUMA 871cabdff1aSopenharmony_ci%endif 872