;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Bias constant used by DEBLOCK_P0_Q0 (0xA1 = 128+33, see the "d+128+33"
; step in that macro).
pb_A1: times 16 db 0xA1
; NOTE(review): pb_3_1 is not referenced in the portion of the file reviewed
; here; presumably it is used further down -- confirm before removing.
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; 5-argument overload of PASS8ROWS: adds a byte offset to both base pointers
; and forwards to the 4-argument form.  NASM selects single-line macros by
; parameter count, so this does not recurse.  The 4-argument PASS8ROWS is not
; defined in this file (presumably it comes from the include above).
%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; %1..%3: punpck suffixes for the three interleave stages
;         (TRANSPOSE4x8B_LOAD passes bw, wd, dq)
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
; clobbers: m4-m7
%macro TRANSPOSE4x8_LOAD 11
    ; rows 0-3 -> m0 (low) / m2 (high)
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    ; rows 4-7 -> m4 (low) / m6 (high)
    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    ; merge the two halves into the four output rows
    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m0-m6
%macro TRANSPOSE8x4B_STORE 8
    ; keep the upper dwords of m0..m2 for the second group of four rows
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    ; rows %1..%4 from the lower dwords
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    ; rows %5..%8 from the saved upper dwords
    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

; Byte-granularity wrapper around TRANSPOSE4x8_LOAD.
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

; Interleave %2 with %3 using punpck{l,h}%1.
; out: %2 = low interleave, %4 = high interleave (%3 is only read)
%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
; %9 is an address expression without brackets; [%9+0x10] doubles as a spill
; slot while the transpose is in progress.
; clobbers: m0-m7 (the register permutation is reset on entry and exit)
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3             ; spill: m3 is needed as scratch below
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3 ; consume the spilled value
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
; %9 and %11 are written as temporary spill slots before they receive their
; final rows.
; clobbers: m0-m7 (the register permutation is reset on entry and exit)
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5                   ; spill: m5 is needed as scratch below
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6                   ; spill
    movq  m6,  %9                   ; reload the first spill
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0     ; consume the second spill
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = saturating (|%1-%2| - %3), i.e. non-zero in every byte where
;      |%1-%2| > %3 and zero elsewhere (not a 0x00/0xFF mask)
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5                  ; |%1-%2|: at most one of the two is non-zero
    psubusb %4, %3
%endmacro

; out: %4 = 0xFF in every byte where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
;      (the mask is the inverse of the "greater than" test in the name)
; clobbers: %5
%macro DIFF_GT2 5
    ; Result polarity: %4 ends up 0xFF where |%1-%2| <= %3 and 0x00 where
    ; |%1-%2| > %3.  %5 is scratch.
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5                  ; equal only when both halves saturated to 0
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
;      m7 is 0xFF in every byte where none of the three thresholds is
;      exceeded, 0x00 elsewhere.  %3 is optional.
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6             ; 0xFF where no test fired
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    psubusb m6, m3       ; magnitude of the negative part of d
    psubusb m3, [pb_A1]  ; magnitude of the positive part of d
    pminub  m6, m7       ; clip both parts to tc
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
; The names follow the header's %1..%5 labels; the call sites in this file
; also invoke the macro with p1/p2 operands to filter the p side.
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(q2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (q2^avg(p0,q0))&1
    psubusb %2, %6       ; (q2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5   ; lower clip bound (saturating)
    paddusb %5, %1       ; upper clip bound (saturating)
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
283cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 284cabdff1aSopenharmony_ci%macro DEBLOCK_LUMA 0 285cabdff1aSopenharmony_cicglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_ 286cabdff1aSopenharmony_ci movd m8, [r4] ; tc0 287cabdff1aSopenharmony_ci lea r4, [stride_q*3] 288cabdff1aSopenharmony_ci dec alpha_d ; alpha-1 289cabdff1aSopenharmony_ci neg r4 290cabdff1aSopenharmony_ci dec beta_d ; beta-1 291cabdff1aSopenharmony_ci add base3_q, pix_q ; pix-3*stride 292cabdff1aSopenharmony_ci 293cabdff1aSopenharmony_ci mova m0, [base3_q + stride_q] ; p1 294cabdff1aSopenharmony_ci mova m1, [base3_q + 2*stride_q] ; p0 295cabdff1aSopenharmony_ci mova m2, [pix_q] ; q0 296cabdff1aSopenharmony_ci mova m3, [pix_q + stride_q] ; q1 297cabdff1aSopenharmony_ci LOAD_MASK r2d, r3d 298cabdff1aSopenharmony_ci 299cabdff1aSopenharmony_ci punpcklbw m8, m8 300cabdff1aSopenharmony_ci punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] 301cabdff1aSopenharmony_ci pcmpeqb m9, m9 302cabdff1aSopenharmony_ci pcmpeqb m9, m8 303cabdff1aSopenharmony_ci pandn m9, m7 304cabdff1aSopenharmony_ci pand m8, m9 305cabdff1aSopenharmony_ci 306cabdff1aSopenharmony_ci movdqa m3, [base3_q] ; p2 307cabdff1aSopenharmony_ci DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 308cabdff1aSopenharmony_ci pand m6, m9 309cabdff1aSopenharmony_ci psubb m7, m8, m6 310cabdff1aSopenharmony_ci pand m6, m8 311cabdff1aSopenharmony_ci LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4 312cabdff1aSopenharmony_ci 313cabdff1aSopenharmony_ci movdqa m4, [pix_q + 2*stride_q] ; q2 314cabdff1aSopenharmony_ci DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 315cabdff1aSopenharmony_ci pand m6, m9 316cabdff1aSopenharmony_ci pand m8, m6 317cabdff1aSopenharmony_ci psubb m7, m6 318cabdff1aSopenharmony_ci mova m3, [pix_q + stride_q] 319cabdff1aSopenharmony_ci LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6 320cabdff1aSopenharmony_ci 
321cabdff1aSopenharmony_ci DEBLOCK_P0_Q0 322cabdff1aSopenharmony_ci mova [base3_q + 2*stride_q], m1 323cabdff1aSopenharmony_ci mova [pix_q], m2 324cabdff1aSopenharmony_ci RET 325cabdff1aSopenharmony_ci 326cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 327cabdff1aSopenharmony_ci; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta, 328cabdff1aSopenharmony_ci; int8_t *tc0) 329cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 330cabdff1aSopenharmony_ciINIT_MMX cpuname 331cabdff1aSopenharmony_cicglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64 332cabdff1aSopenharmony_ci movsxd r7, r1d 333cabdff1aSopenharmony_ci lea r8, [r7+r7*2] 334cabdff1aSopenharmony_ci lea r6, [r0-4] 335cabdff1aSopenharmony_ci lea r5, [r0-4+r8] 336cabdff1aSopenharmony_ci%if WIN64 337cabdff1aSopenharmony_ci %define pix_tmp rsp+0x30 ; shadow space + r4 338cabdff1aSopenharmony_ci%else 339cabdff1aSopenharmony_ci %define pix_tmp rsp 340cabdff1aSopenharmony_ci%endif 341cabdff1aSopenharmony_ci 342cabdff1aSopenharmony_ci ; transpose 6x16 -> tmp space 343cabdff1aSopenharmony_ci TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp 344cabdff1aSopenharmony_ci lea r6, [r6+r7*8] 345cabdff1aSopenharmony_ci lea r5, [r5+r7*8] 346cabdff1aSopenharmony_ci TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 347cabdff1aSopenharmony_ci 348cabdff1aSopenharmony_ci ; vertical filter 349cabdff1aSopenharmony_ci ; alpha, beta, tc0 are still in r2d, r3d, r4 350cabdff1aSopenharmony_ci ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them 351cabdff1aSopenharmony_ci lea r0, [pix_tmp+0x30] 352cabdff1aSopenharmony_ci mov r1d, 0x10 353cabdff1aSopenharmony_ci%if WIN64 354cabdff1aSopenharmony_ci mov [rsp+0x20], r4 355cabdff1aSopenharmony_ci%endif 356cabdff1aSopenharmony_ci call deblock_v_luma_8 357cabdff1aSopenharmony_ci 358cabdff1aSopenharmony_ci ; transpose 16x4 -> original space (only the 
middle 4 rows were changed by the filter) 359cabdff1aSopenharmony_ci add r6, 2 360cabdff1aSopenharmony_ci add r5, 2 361cabdff1aSopenharmony_ci movq m0, [pix_tmp+0x18] 362cabdff1aSopenharmony_ci movq m1, [pix_tmp+0x28] 363cabdff1aSopenharmony_ci movq m2, [pix_tmp+0x38] 364cabdff1aSopenharmony_ci movq m3, [pix_tmp+0x48] 365cabdff1aSopenharmony_ci TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) 366cabdff1aSopenharmony_ci 367cabdff1aSopenharmony_ci shl r7, 3 368cabdff1aSopenharmony_ci sub r6, r7 369cabdff1aSopenharmony_ci sub r5, r7 370cabdff1aSopenharmony_ci shr r7, 3 371cabdff1aSopenharmony_ci movq m0, [pix_tmp+0x10] 372cabdff1aSopenharmony_ci movq m1, [pix_tmp+0x20] 373cabdff1aSopenharmony_ci movq m2, [pix_tmp+0x30] 374cabdff1aSopenharmony_ci movq m3, [pix_tmp+0x40] 375cabdff1aSopenharmony_ci TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) 376cabdff1aSopenharmony_ci 377cabdff1aSopenharmony_ci RET 378cabdff1aSopenharmony_ci%endmacro 379cabdff1aSopenharmony_ci 380cabdff1aSopenharmony_ci%macro DEBLOCK_H_LUMA_MBAFF 0 381cabdff1aSopenharmony_ci 382cabdff1aSopenharmony_cicglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_ 383cabdff1aSopenharmony_ci movsxd stride_q, stride_d 384cabdff1aSopenharmony_ci dec alpha_d 385cabdff1aSopenharmony_ci dec beta_d 386cabdff1aSopenharmony_ci mov base3_q, pix_q 387cabdff1aSopenharmony_ci lea stride3_q, [3*stride_q] 388cabdff1aSopenharmony_ci add base3_q, stride3_q 389cabdff1aSopenharmony_ci 390cabdff1aSopenharmony_ci movq m0, [pix_q - 4] 391cabdff1aSopenharmony_ci movq m1, [pix_q + stride_q - 4] 392cabdff1aSopenharmony_ci movq m2, [pix_q + 2*stride_q - 4] 393cabdff1aSopenharmony_ci movq m3, [base3_q - 4] 394cabdff1aSopenharmony_ci movq m4, [base3_q + stride_q - 4] 395cabdff1aSopenharmony_ci movq m5, [base3_q + 2*stride_q - 4] 396cabdff1aSopenharmony_ci movq m6, [base3_q + stride3_q - 4] 397cabdff1aSopenharmony_ci movq m7, [base3_q + 4*stride_q - 4] 398cabdff1aSopenharmony_ci 
399cabdff1aSopenharmony_ci TRANSPOSE_8X8B 0,1,2,3,4,5,6,7 400cabdff1aSopenharmony_ci 401cabdff1aSopenharmony_ci %assign i 0 402cabdff1aSopenharmony_ci %rep 8 403cabdff1aSopenharmony_ci movq [rsp + 16*i], m %+ i 404cabdff1aSopenharmony_ci %assign i i+1 405cabdff1aSopenharmony_ci %endrep 406cabdff1aSopenharmony_ci 407cabdff1aSopenharmony_ci ; p2 = m1 [rsp + 16] 408cabdff1aSopenharmony_ci ; p1 = m2 [rsp + 32] 409cabdff1aSopenharmony_ci ; p0 = m3 [rsp + 48] 410cabdff1aSopenharmony_ci ; q0 = m4 [rsp + 64] 411cabdff1aSopenharmony_ci ; q1 = m5 [rsp + 80] 412cabdff1aSopenharmony_ci ; q2 = m6 [rsp + 96] 413cabdff1aSopenharmony_ci 414cabdff1aSopenharmony_ci SWAP 0, 2 415cabdff1aSopenharmony_ci SWAP 1, 3 416cabdff1aSopenharmony_ci SWAP 2, 4 417cabdff1aSopenharmony_ci SWAP 3, 5 418cabdff1aSopenharmony_ci 419cabdff1aSopenharmony_ci LOAD_MASK alpha_d, beta_d 420cabdff1aSopenharmony_ci movd m8, [tc0_q] 421cabdff1aSopenharmony_ci punpcklbw m8, m8 422cabdff1aSopenharmony_ci pcmpeqb m9, m9 423cabdff1aSopenharmony_ci pcmpeqb m9, m8 424cabdff1aSopenharmony_ci pandn m9, m7 425cabdff1aSopenharmony_ci pand m8, m9 426cabdff1aSopenharmony_ci 427cabdff1aSopenharmony_ci movdqa m3, [rsp + 16] ; p2 428cabdff1aSopenharmony_ci DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 429cabdff1aSopenharmony_ci pand m6, m9 430cabdff1aSopenharmony_ci psubb m7, m8, m6 431cabdff1aSopenharmony_ci pand m6, m8 432cabdff1aSopenharmony_ci LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4 433cabdff1aSopenharmony_ci 434cabdff1aSopenharmony_ci movdqa m4, [rsp + 96] ; q2 435cabdff1aSopenharmony_ci DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 436cabdff1aSopenharmony_ci pand m6, m9 437cabdff1aSopenharmony_ci pand m8, m6 438cabdff1aSopenharmony_ci psubb m7, m6 439cabdff1aSopenharmony_ci mova m3, [rsp + 80] 440cabdff1aSopenharmony_ci LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci DEBLOCK_P0_Q0 443cabdff1aSopenharmony_ci SWAP 1, 3 444cabdff1aSopenharmony_ci SWAP 2, 4 
445cabdff1aSopenharmony_ci movq m0, [rsp] 446cabdff1aSopenharmony_ci movq m1, [rsp + 16] 447cabdff1aSopenharmony_ci movq m2, [rsp + 32] 448cabdff1aSopenharmony_ci movq m5, [rsp + 80] 449cabdff1aSopenharmony_ci movq m6, [rsp + 96] 450cabdff1aSopenharmony_ci movq m7, [rsp + 112] 451cabdff1aSopenharmony_ci 452cabdff1aSopenharmony_ci TRANSPOSE_8X8B 0,1,2,3,4,5,6,7 453cabdff1aSopenharmony_ci movq [pix_q - 4], m0 454cabdff1aSopenharmony_ci movq [pix_q + stride_q - 4], m1 455cabdff1aSopenharmony_ci movq [pix_q + 2*stride_q - 4], m2 456cabdff1aSopenharmony_ci movq [base3_q - 4], m3 457cabdff1aSopenharmony_ci movq [base3_q + stride_q - 4], m4 458cabdff1aSopenharmony_ci movq [base3_q + 2*stride_q - 4], m5 459cabdff1aSopenharmony_ci movq [base3_q + stride3_q - 4], m6 460cabdff1aSopenharmony_ci movq [base3_q + 4*stride_q - 4], m7 461cabdff1aSopenharmony_ci 462cabdff1aSopenharmony_ciRET 463cabdff1aSopenharmony_ci 464cabdff1aSopenharmony_ci%endmacro 465cabdff1aSopenharmony_ci 466cabdff1aSopenharmony_ciINIT_XMM sse2 467cabdff1aSopenharmony_ciDEBLOCK_H_LUMA_MBAFF 468cabdff1aSopenharmony_ciDEBLOCK_LUMA 469cabdff1aSopenharmony_ci 470cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 471cabdff1aSopenharmony_ciINIT_XMM avx 472cabdff1aSopenharmony_ciDEBLOCK_H_LUMA_MBAFF 473cabdff1aSopenharmony_ciDEBLOCK_LUMA 474cabdff1aSopenharmony_ci%endif 475cabdff1aSopenharmony_ci 476cabdff1aSopenharmony_ci%else 477cabdff1aSopenharmony_ci 478cabdff1aSopenharmony_ci%macro DEBLOCK_LUMA 2 479cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 480cabdff1aSopenharmony_ci; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta, 481cabdff1aSopenharmony_ci; int8_t *tc0) 482cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 483cabdff1aSopenharmony_cicglobal deblock_%1_luma_8, 5,5,8,2*%2 484cabdff1aSopenharmony_ci lea r4, [r1*3] 485cabdff1aSopenharmony_ci dec r2 ; alpha-1 486cabdff1aSopenharmony_ci 
neg r4 487cabdff1aSopenharmony_ci dec r3 ; beta-1 488cabdff1aSopenharmony_ci add r4, r0 ; pix-3*stride 489cabdff1aSopenharmony_ci 490cabdff1aSopenharmony_ci mova m0, [r4+r1] ; p1 491cabdff1aSopenharmony_ci mova m1, [r4+2*r1] ; p0 492cabdff1aSopenharmony_ci mova m2, [r0] ; q0 493cabdff1aSopenharmony_ci mova m3, [r0+r1] ; q1 494cabdff1aSopenharmony_ci LOAD_MASK r2, r3 495cabdff1aSopenharmony_ci 496cabdff1aSopenharmony_ci mov r3, r4mp 497cabdff1aSopenharmony_ci pcmpeqb m3, m3 498cabdff1aSopenharmony_ci movd m4, [r3] ; tc0 499cabdff1aSopenharmony_ci punpcklbw m4, m4 500cabdff1aSopenharmony_ci punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] 501cabdff1aSopenharmony_ci mova [esp+%2], m4 ; tc 502cabdff1aSopenharmony_ci pcmpgtb m4, m3 503cabdff1aSopenharmony_ci mova m3, [r4] ; p2 504cabdff1aSopenharmony_ci pand m4, m7 505cabdff1aSopenharmony_ci mova [esp], m4 ; mask 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_ci DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 508cabdff1aSopenharmony_ci pand m6, m4 509cabdff1aSopenharmony_ci pand m4, [esp+%2] ; tc 510cabdff1aSopenharmony_ci psubb m7, m4, m6 511cabdff1aSopenharmony_ci pand m6, m4 512cabdff1aSopenharmony_ci LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 513cabdff1aSopenharmony_ci 514cabdff1aSopenharmony_ci mova m4, [r0+2*r1] ; q2 515cabdff1aSopenharmony_ci DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 516cabdff1aSopenharmony_ci pand m6, [esp] ; mask 517cabdff1aSopenharmony_ci mova m5, [esp+%2] ; tc 518cabdff1aSopenharmony_ci psubb m7, m6 519cabdff1aSopenharmony_ci pand m5, m6 520cabdff1aSopenharmony_ci mova m3, [r0+r1] 521cabdff1aSopenharmony_ci LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 522cabdff1aSopenharmony_ci 523cabdff1aSopenharmony_ci DEBLOCK_P0_Q0 524cabdff1aSopenharmony_ci mova [r4+2*r1], m1 525cabdff1aSopenharmony_ci mova [r0], m2 526cabdff1aSopenharmony_ci RET 527cabdff1aSopenharmony_ci 528cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 
529cabdff1aSopenharmony_ci; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta, 530cabdff1aSopenharmony_ci; int8_t *tc0) 531cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 532cabdff1aSopenharmony_ciINIT_MMX cpuname 533cabdff1aSopenharmony_cicglobal deblock_h_luma_8, 0,5,8,0x60+12 534cabdff1aSopenharmony_ci mov r0, r0mp 535cabdff1aSopenharmony_ci mov r3, r1m 536cabdff1aSopenharmony_ci lea r4, [r3*3] 537cabdff1aSopenharmony_ci sub r0, 4 538cabdff1aSopenharmony_ci lea r1, [r0+r4] 539cabdff1aSopenharmony_ci%define pix_tmp esp+12 540cabdff1aSopenharmony_ci 541cabdff1aSopenharmony_ci ; transpose 6x16 -> tmp space 542cabdff1aSopenharmony_ci TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp 543cabdff1aSopenharmony_ci lea r0, [r0+r3*8] 544cabdff1aSopenharmony_ci lea r1, [r1+r3*8] 545cabdff1aSopenharmony_ci TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 546cabdff1aSopenharmony_ci 547cabdff1aSopenharmony_ci ; vertical filter 548cabdff1aSopenharmony_ci lea r0, [pix_tmp+0x30] 549cabdff1aSopenharmony_ci PUSH dword r4m 550cabdff1aSopenharmony_ci PUSH dword r3m 551cabdff1aSopenharmony_ci PUSH dword r2m 552cabdff1aSopenharmony_ci PUSH dword 16 553cabdff1aSopenharmony_ci PUSH dword r0 554cabdff1aSopenharmony_ci call deblock_%1_luma_8 555cabdff1aSopenharmony_ci%ifidn %1, v8 556cabdff1aSopenharmony_ci add dword [esp ], 8 ; pix_tmp+0x38 557cabdff1aSopenharmony_ci add dword [esp+16], 2 ; tc0+2 558cabdff1aSopenharmony_ci call deblock_%1_luma_8 559cabdff1aSopenharmony_ci%endif 560cabdff1aSopenharmony_ci ADD esp, 20 561cabdff1aSopenharmony_ci 562cabdff1aSopenharmony_ci ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) 563cabdff1aSopenharmony_ci mov r0, r0mp 564cabdff1aSopenharmony_ci sub r0, 2 565cabdff1aSopenharmony_ci 566cabdff1aSopenharmony_ci movq m0, [pix_tmp+0x10] 567cabdff1aSopenharmony_ci movq m1, [pix_tmp+0x20] 568cabdff1aSopenharmony_ci lea r1, [r0+r4] 
569cabdff1aSopenharmony_ci movq m2, [pix_tmp+0x30] 570cabdff1aSopenharmony_ci movq m3, [pix_tmp+0x40] 571cabdff1aSopenharmony_ci TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) 572cabdff1aSopenharmony_ci 573cabdff1aSopenharmony_ci lea r0, [r0+r3*8] 574cabdff1aSopenharmony_ci lea r1, [r1+r3*8] 575cabdff1aSopenharmony_ci movq m0, [pix_tmp+0x18] 576cabdff1aSopenharmony_ci movq m1, [pix_tmp+0x28] 577cabdff1aSopenharmony_ci movq m2, [pix_tmp+0x38] 578cabdff1aSopenharmony_ci movq m3, [pix_tmp+0x48] 579cabdff1aSopenharmony_ci TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) 580cabdff1aSopenharmony_ci 581cabdff1aSopenharmony_ci RET 582cabdff1aSopenharmony_ci%endmacro ; DEBLOCK_LUMA 583cabdff1aSopenharmony_ci 584cabdff1aSopenharmony_ciINIT_XMM sse2 585cabdff1aSopenharmony_ciDEBLOCK_LUMA v, 16 586cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 587cabdff1aSopenharmony_ciINIT_XMM avx 588cabdff1aSopenharmony_ciDEBLOCK_LUMA v, 16 589cabdff1aSopenharmony_ci%endif 590cabdff1aSopenharmony_ci 591cabdff1aSopenharmony_ci%endif ; ARCH 592cabdff1aSopenharmony_ci 593cabdff1aSopenharmony_ci 594cabdff1aSopenharmony_ci 595cabdff1aSopenharmony_ci%macro LUMA_INTRA_P012 4 ; p0..p3 in memory 596cabdff1aSopenharmony_ci%if ARCH_X86_64 597cabdff1aSopenharmony_ci pavgb t0, p2, p1 598cabdff1aSopenharmony_ci pavgb t1, p0, q0 599cabdff1aSopenharmony_ci%else 600cabdff1aSopenharmony_ci mova t0, p2 601cabdff1aSopenharmony_ci mova t1, p0 602cabdff1aSopenharmony_ci pavgb t0, p1 603cabdff1aSopenharmony_ci pavgb t1, q0 604cabdff1aSopenharmony_ci%endif 605cabdff1aSopenharmony_ci pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 606cabdff1aSopenharmony_ci mova t5, t1 607cabdff1aSopenharmony_ci%if ARCH_X86_64 608cabdff1aSopenharmony_ci paddb t2, p2, p1 609cabdff1aSopenharmony_ci paddb t3, p0, q0 610cabdff1aSopenharmony_ci%else 611cabdff1aSopenharmony_ci mova t2, p2 612cabdff1aSopenharmony_ci mova t3, p0 613cabdff1aSopenharmony_ci paddb t2, p1 614cabdff1aSopenharmony_ci paddb t3, q0 615cabdff1aSopenharmony_ci%endif 
616cabdff1aSopenharmony_ci paddb t2, t3 617cabdff1aSopenharmony_ci mova t3, t2 618cabdff1aSopenharmony_ci mova t4, t2 619cabdff1aSopenharmony_ci psrlw t2, 1 620cabdff1aSopenharmony_ci pavgb t2, mpb_0 621cabdff1aSopenharmony_ci pxor t2, t0 622cabdff1aSopenharmony_ci pand t2, mpb_1 623cabdff1aSopenharmony_ci psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; 624cabdff1aSopenharmony_ci 625cabdff1aSopenharmony_ci%if ARCH_X86_64 626cabdff1aSopenharmony_ci pavgb t1, p2, q1 627cabdff1aSopenharmony_ci psubb t2, p2, q1 628cabdff1aSopenharmony_ci%else 629cabdff1aSopenharmony_ci mova t1, p2 630cabdff1aSopenharmony_ci mova t2, p2 631cabdff1aSopenharmony_ci pavgb t1, q1 632cabdff1aSopenharmony_ci psubb t2, q1 633cabdff1aSopenharmony_ci%endif 634cabdff1aSopenharmony_ci paddb t3, t3 635cabdff1aSopenharmony_ci psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 636cabdff1aSopenharmony_ci pand t2, mpb_1 637cabdff1aSopenharmony_ci psubb t1, t2 638cabdff1aSopenharmony_ci pavgb t1, p1 639cabdff1aSopenharmony_ci pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 640cabdff1aSopenharmony_ci psrlw t3, 2 641cabdff1aSopenharmony_ci pavgb t3, mpb_0 642cabdff1aSopenharmony_ci pxor t3, t1 643cabdff1aSopenharmony_ci pand t3, mpb_1 644cabdff1aSopenharmony_ci psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 645cabdff1aSopenharmony_ci 646cabdff1aSopenharmony_ci pxor t3, p0, q1 647cabdff1aSopenharmony_ci pavgb t2, p0, q1 648cabdff1aSopenharmony_ci pand t3, mpb_1 649cabdff1aSopenharmony_ci psubb t2, t3 650cabdff1aSopenharmony_ci pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 651cabdff1aSopenharmony_ci 652cabdff1aSopenharmony_ci pxor t1, t2 653cabdff1aSopenharmony_ci pxor t2, p0 654cabdff1aSopenharmony_ci pand t1, mask1p 655cabdff1aSopenharmony_ci pand t2, mask0 656cabdff1aSopenharmony_ci pxor t1, t2 657cabdff1aSopenharmony_ci pxor t1, p0 658cabdff1aSopenharmony_ci mova %1, t1 ; store p0 659cabdff1aSopenharmony_ci 660cabdff1aSopenharmony_ci mova t1, %4 ; p3 661cabdff1aSopenharmony_ci paddb t2, t1, p2 662cabdff1aSopenharmony_ci 
; p2' - continuation of LUMA_INTRA_P012: on entry here t1 = p3, t2 = p3+p2 (paddb)
    pavgb   t1, p2
    pavgb   t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb   t2, t2
    paddb   t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw   t2, 2
    pavgb   t2, mpb_0
    pxor    t2, t1
    pand    t2, mpb_1
    psubb   t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor    t0, p1
    pxor    t1, p2
    pand    t0, mask1p
    pand    t1, mask1p
    pxor    t0, p1
    pxor    t1, p2
    mova    %2, t0 ; store p1
    mova    %3, t1 ; store p2
%endmacro

; Retarget the pixel aliases so that a second LUMA_INTRA_P012 expansion filters
; the q side of the edge: p0/p1 and q0/q1 trade registers, p2 resolves to
; whatever q2 is defined as, and mask1p resolves to mask1q.
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

; Emit the 8-bit luma intra deblocking entry points.
;   %1 = direction tag pasted into the symbol name deblock_%1_luma_intra_8
;        (the instantiations in this file pass "v")
; Expands to:
;   deblock_%1_luma_intra_8 - filter built with the INIT_* mode active at the
;                             point of instantiation
;   deblock_h_luma_intra_8  - INIT_MMX wrapper that transposes the pixels around
;                             the edge into a stack buffer, calls the filter
;                             above on that buffer and transposes the result back
%macro DEBLOCK_LUMA_INTRA 1
    ; pixel rows nearest the edge and the first four temporaries always live
    ; in m0-m7
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    ; 16 vector registers: everything except mask1q stays in a register
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp] ; inside the 0x10 bytes requested from cglobal below
%else
    ; below the stack pointer (cglobal requests 0 bytes in this configuration)
    ; NOTE(review): presumably relies on the SysV red zone - confirm
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    ; x86-32: p2/q2 are read straight from the picture, the remaining values
    ; are spilled to 16-byte stack slots and the constants come from memory
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
; r0 = pix, r1 = stride, r2 = alpha, r3 = beta
; r4 = pix - 4*stride, r5 = 3*stride (both set up below)
; Returns without touching any pixel when alpha-1 or beta-1 is negative.
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
; stack argument evaluates to 0 when ARCH_X86_64 is 1 and to -0x50 when it is 0
; (five spill() slots of 16 bytes)
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d ; alpha-1
    jl .end
    neg     r4
    dec     r3d ; beta-1
    jl .end
    add     r4, r0 ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    ; p side first: arguments are the addresses of p0, p1, p2, p3
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    ; then the same macro with the aliases swapped, on q0, q1, q2, q3
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
; r7 = sign-extended stride, r8 = 3*stride, r6 = pix-4, r5 = pix-4 + 3*stride.
; pix_tmp is a 0x80-byte buffer addressed as eight rows with a 0x10 stride.
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd  r7, r1d
    lea     r8, [r7*3]
    lea     r6, [r0-4]
    lea     r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea     r6, [r6+r7*8]
    lea     r5, [r5+r7*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    ; run the vertical filter on the buffer: edge at row 4, stride 0x10
    lea     r0, [pix_tmp+0x40]
    mov     r1, 0x10
    call deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    ; the vertical filter writes r4/r5, so r5 is rebuilt from r6 (still at the
    ; second group of eight rows)
    lea     r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    ; step both row pointers back by 8*stride, leaving r7 unchanged afterwards
    shl     r7, 3
    sub     r6, r7
    sub     r5, r7
    shr     r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
; x86-32 variant: r0 = pix-4, r1 = stride, r3 = 3*stride, r2 = r0 + 3*stride;
; alpha and beta are forwarded from their stack slots (r2m, r3m).
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea     r3, [r1*3]
    sub     r0, 4
    lea     r2, [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea     r0, [r0+r1*8]
    lea     r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    ; push (pix_tmp+0x40, 16, alpha, beta) as stack arguments for the filter
    lea     r0, [pix_tmp+0x40]
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH r0
    call deblock_%1_luma_intra_8
; second call on the other 8-byte half; only assembled when %1 is "v8"
%ifidn %1, v8
    add dword [rsp], 8 ; pix_tmp+8
    call deblock_%1_luma_intra_8
%endif
    ADD esp, 16

    ; reload pix/stride from the argument slots and rebuild the row pointers
    mov     r1, r1m
    mov     r0, r0mp
    lea     r3, [r1*3]
    sub     r0, 4
    lea     r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea     r0, [r0+r1*8]
    lea     r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif

; in:  eight memory operands %1..%8
; out: m0..m7, one dword loaded into each (movd)
%macro LOAD_8_ROWS 8
    movd    m0, %1
    movd    m1, %2
    movd    m2, %3
    movd    m3, %4
    movd    m4, %5
    movd    m5, %6
    movd    m6, %7
    movd    m7, %8
%endmacro

; in:  m0..m7
; out: the low dword of each register stored to memory operands %1..%8
%macro STORE_8_ROWS 8
    movd    %1, m0
    movd    %2, m1
    movd    %3, m2
    movd    %4, m3
    movd    %5, m4
    movd    %6, m5
    movd    %7, m6
    movd    %8, m7
%endmacro

; in:  8 rows of 4 bytes in the low dwords of m0..m7
; out: 4 rows of 8 bytes in the low halves of m0..m3
; clobbers m4, m6
; NOTE(review): assumes MOVHL (defined outside this view) copies the high
;               8 bytes of its source into the low 8 bytes of its destination
%macro TRANSPOSE_8x4B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
    punpcklwd m0, m2
    punpcklwd m4, m6
    punpckhdq m2, m0, m4
    punpckldq m0, m4
    MOVHL     m1, m0
    MOVHL     m3, m2
%endmacro

; in:  4 rows of 8 bytes in the low halves of m0..m3
; out: 8 rows of 4 bytes in the low dwords of m0..m7
; NOTE(review): same MOVHL assumption as TRANSPOSE_8x4B_XMM
%macro TRANSPOSE_4x8B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpckhwd m4, m0, m2
    punpcklwd m0, m2
    MOVHL     m6, m4
    MOVHL     m2, m0
    pshufd    m1, m0, 1 ; odd rows = dword 1 of the even-row registers
    pshufd    m3, m2, 1
    pshufd    m5, m4, 1
    pshufd    m7, m6, 1
%endmacro

; Inter chroma edge filter core.
;   %1 = number of byte-doubling passes applied to the four tc0 bytes
;        (1: every tc0 byte covers 2 byte lanes, 2: every tc0 byte covers 4)
; in:  m0..m3 = the four pixel rows around the edge, alpha_d/beta_d already
;      decremented, tc0_q = pointer to the tc0 bytes
; The callers store m1 and m2 back afterwards. LOAD_MASK and DEBLOCK_P0_Q0 are
; defined outside this view; m7 is used here as the mask LOAD_MASK leaves behind.
%macro CHROMA_INTER_BODY_XMM 1
    LOAD_MASK alpha_d, beta_d
    movd      m6, [tc0_q]
    %rep %1
        punpcklbw m6, m6
    %endrep
    pand      m7, m6
    DEBLOCK_P0_Q0
%endmacro

; Intra chroma edge filter core.
; in:  m0 = p1, m1 = p0, m2 = q0, m3 = q1 (row order used by the callers),
;      alpha_d/beta_d already decremented
; out: m1 = p0', m2 = q0'; lanes whose m7 mask byte is zero keep their old value
; clobbers m4, m5, m6
%macro CHROMA_INTRA_BODY_XMM 0
    LOAD_MASK alpha_d, beta_d
    mova    m5, m1 ; keep the unfiltered p0
    mova    m6, m2 ; keep the unfiltered q0
    pxor    m4, m1, m3
    pand    m4, [pb_1] ; rounding bit of p0+q1
    pavgb   m1, m3
    psubusb m1, m4 ; (p0+q1)>>1
    pavgb   m1, m0 ; (2*p1+p0+q1+2)>>2
    pxor    m4, m2, m0
    pand    m4, [pb_1] ; rounding bit of q0+p1
    pavgb   m2, m0
    psubusb m2, m4 ; (q0+p1)>>1
    pavgb   m2, m3 ; (2*q1+q0+p1+2)>>2
    psubb   m1, m5
    psubb   m2, m6
    pand    m1, m7 ; apply the delta only where the mask is set
    pand    m2, m7
    paddb   m1, m5
    paddb   m2, m6
%endmacro

; Common prologue of the vertical chroma filters.
;   %1 = register that receives pix - 2*stride
; Also decrements alpha_d and beta_d.
%macro CHROMA_V_START_XMM 1
    movsxdifnidn stride_q, stride_d
    dec     alpha_d
    dec     beta_d
    mov     %1, pix_q
    sub     %1, stride_q
    sub     %1, stride_q
%endmacro

; Common prologue of the horizontal chroma filters.
;   %1 = register that receives pix + 3*stride
;   %2 = register that receives 3*stride
; Also decrements alpha_d and beta_d.
%macro CHROMA_H_START_XMM 2
    movsxdifnidn stride_q, stride_d
    dec     alpha_d
    dec     beta_d
    lea     %2, [3*stride_q]
    mov     %1, pix_q
    add     %1, %2
%endmacro

; Emit the 8-bit XMM chroma deblocking functions for one instruction set.
;   %1 = cpu name handed to INIT_XMM
; The horizontal variants load 8 rows of 4 bytes starting at pix-2, transpose
; them, run the same filter core as the vertical variants and transpose back.
; The 422 variants do that twice, for two consecutive groups of 8 rows.
%macro DEBLOCK_CHROMA_XMM 1

INIT_XMM %1

; args: pix, stride, alpha, beta, tc0; r5 = pix - 2*stride
cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_V_START_XMM r5
    movq    m0, [r5]
    movq    m1, [r5 + stride_q]
    movq    m2, [pix_q]
    movq    m3, [pix_q + stride_q]
    CHROMA_INTER_BODY_XMM 1
    movq    [r5 + stride_q], m1
    movq    [pix_q], m2
RET

; args: pix, stride, alpha, beta, tc0; r5 = pix + 3*stride, r6 = 3*stride
; 16 bytes of stack hold m0 and m3 across the filter body
; NOTE(review): presumably because LOAD_MASK/DEBLOCK_P0_Q0 overwrite them - confirm
cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq    [rsp], m0
    movq    [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 1
    movq    m0, [rsp]
    movq    m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

; Same as deblock_h_chroma_8, applied to two groups of 8 rows. Each tc0 byte
; is spread over 4 lanes, so every group consumes 2 tc0 bytes.
cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq    [rsp], m0
    movq    [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq    m0, [rsp]
    movq    m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)

    ; advance to the next 8 rows and the next two tc0 bytes
    lea     pix_q, [pix_q + 8*stride_q]
    lea     r5, [r5 + 8*stride_q]
    add     tc0_q, 2

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq    [rsp], m0
    movq    [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq    m0, [rsp]
    movq    m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

; args: pix, stride, alpha, beta; r4 = pix - 2*stride
cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
    CHROMA_V_START_XMM r4
    movq    m0, [r4]
    movq    m1, [r4 + stride_q]
    movq    m2, [pix_q]
    movq    m3, [pix_q + stride_q]
    CHROMA_INTRA_BODY_XMM
    movq    [r4 + stride_q], m1
    movq    [pix_q], m2
RET

; args: pix, stride, alpha, beta; r4 = pix + 3*stride, r5 = 3*stride
cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

; Same as deblock_h_chroma_intra_8, applied to two groups of 8 rows.
cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)

    ; advance to the next 8 rows
    lea     pix_q, [pix_q + 8*stride_q]
    lea     r4, [r4 + 8*stride_q]

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

%endmacro ; DEBLOCK_CHROMA_XMM

DEBLOCK_CHROMA_XMM sse2
DEBLOCK_CHROMA_XMM avx

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir, int edges, int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir is 0 or 1
; edges is 1 or 4
; step is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field is 0 or 1
;-----------------------------------------------------------------------------

; One pass of the boundary-strength loop for a single direction.
;   %1 edges    - loop bound for b_idx (operand or immediate)
;   %2 step     - b_idx increment
;   %3 mask_mv  - iterations with (b_idx & mask_mv) != 0 skip the ref/mv compare
;   %4 dir      - results are stored at bs + b_idx + 32*dir
;   %5 d_idx    - byte offset from block b to its neighbour bn in ref/nnz
;                 (multiplied by 4 when indexing mv)
;   %6 mask_dir - when 0, m0 is cleared at the top of every iteration; otherwise
;                 a skipped iteration keeps whatever m0 already held
;   %7 bidir    - assemble-time switch: 1 compares the entries at offsets +12
;                 and +52 (ref0/ref1) in every pairing, 0 only those at +12
; Expects m5, m6, m7 to be preloaded by the caller (m6 = limit bytes,
; m5 = 2*m6, m7 = pb_1) and b_idxd/b_idxq, refq, mvq, nnzq, bsq to be defined.
; Clobbers m0-m4 and b_idx.
; The mv test adds m6 to the saturated byte difference and then subtracts m5
; with unsigned saturation, which leaves a non-zero byte exactly where the
; difference lies outside [-m6, +m6].
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd %1
%define stepd %2
%define mask_mvd %3
%define dir %4
%define d_idx %5
%define mask_dir %6
%define bidir %7
    xor     b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor    m0, m0
%endif
    test    b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd      m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw    m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
    pshufw    m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
    pshufw    m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
    psubb     m0, m2 ; { ref0[b] != ref0[bn],
                     ;   ref0[b] != ref1[bn] }
    psubb     m1, m3 ; { ref1[b] != ref1[bn],
                     ;   ref1[b] != ref0[bn] }

    ; neighbour's mv at offset +12 against both of b's mv entries
    por       m0, m1
    mova      m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova      m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova      m3, m1
    mova      m4, m2
    psubw     m1, [mvq+b_idxq*4+12*4]
    psubw     m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw     m3, [mvq+b_idxq*4+52*4]
    psubw     m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb  m1, m2
    packsswb  m3, m4
    paddb     m1, m6
    paddb     m3, m6
    psubusb   m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb   m3, m5
    packsswb  m1, m3

    ; neighbour's mv at offset +52 against both of b's mv entries
    por       m0, m1
    mova      m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova      m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova      m3, m1
    mova      m4, m2
    psubw     m1, [mvq+b_idxq*4+12*4]
    psubw     m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw     m3, [mvq+b_idxq*4+52*4]
    psubw     m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb  m1, m2
    packsswb  m3, m4
    paddb     m1, m6
    paddb     m3, m6
    psubusb   m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb   m3, m5
    packsswb  m1, m3

    ; merge the pairings, then keep the byte-wise minimum of the two dword halves
    pshufw    m1, m1, 0x4E
    por       m0, m1
    pshufw    m1, m0, 0x4E
    pminub    m0, m1
%else ; bidir == 0
    movd      m0, [refq+b_idxq+12]
    psubb     m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova      m1, [mvq+b_idxq*4+12*4]
    mova      m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw     m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw     m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb  m1, m2
    paddb     m1, m6
    psubusb   m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb  m1, m1
    por       m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd      m1, [nnzq+b_idxq+12]
    por       m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    ; bs = max(2 * (nnz != 0), (ref/mv mismatch != 0)), widened to 16 bits
    pminub    m1, m7
    pminub    m0, m7
    psllw     m1, 1
    pxor      m2, m2
    pmaxub    m1, m0
    punpcklbw m1, m2
    movq      [bsq+b_idxq+32*dir], m1

    add     b_idxd, dword stepd
    cmp     b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

; MMXEXT implementation of the function described in the banner above.
; The first nine arguments are loaded by cglobal; field is read from its stack
; slot (fieldm). b_idx reuses the bidir register, which is free once the
; bidir test below has been taken. step, edges and both mask_mv values are
; multiplied by 8 (shl 3), so b_idx advances in units of 8.
INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp     dword fieldm, 0
    mova    m7, [pb_1] ; MMX moves leave the flags from the cmp intact
    mova    m5, [pb_3]
    je .nofield
    mova    m5, [pb_3_1] ; field != 0: alternating limit bytes 3, 1
.nofield:
    mova    m6, m5 ; m6 = limit
    paddb   m5, m5 ; m5 = 2*limit

    shl     dword stepd, 3
    shl     dword edgesd, 3
%if ARCH_X86_32
; on x86-32 the two masks are accessed through their stack slots
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl     dword mask_mv1d, 3
    shl     dword mask_mv0d, 3

    cmp     dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0

    ; transpose the first four mmsize-byte rows of bs in place (4x4 words)
    mova    m0, [bsq+mmsize*0]
    mova    m1, [bsq+mmsize*1]
    mova    m2, [bsq+mmsize*2]
    mova    m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova    [bsq+mmsize*0], m0
    mova    [bsq+mmsize*1], m1
    mova    [bsq+mmsize*2], m2
    mova    [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1

    ; same in-place 4x4 word transpose as on the non-bidir path
    mova    m0, [bsq+mmsize*0]
    mova    m1, [bsq+mmsize*1]
    mova    m2, [bsq+mmsize*2]
    mova    m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova    [bsq+mmsize*0], m0
    mova    [bsq+mmsize*1], m1
    mova    [bsq+mmsize*2], m2
    mova    [bsq+mmsize*3], m3
    RET