;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_511:   times 16 dw 511
pw_2047:  times 16 dw 2047
pw_16384: times 16 dw 16384
pw_m512:  times 16 dw -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text

; SCRATCH src, hireg, mem[, name]
;
; Park the value held in m<src> so that register number can be reused.
;   x86-64: SWAP src, hireg; reg_<name> (if given) is defined as m<hireg>.
;   x86-32: m<src> is stored to [mem]; reg_<name> (if given) is defined as [mem].
; NOTE(review): SWAP is not defined in this file (it comes in through the
; include above); presumably it only exchanges register names - confirm.
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

; UNSCRATCH dst, hireg, mem[, name]
;
; Counterpart of SCRATCH: bring a parked value back into m<dst>.
;   x86-64: SWAP dst, hireg.
;   x86-32: m<dst> is loaded from [mem].
; If a name is given, reg_<name> is undefined afterwards.
%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

; PRELOAD reg, mem[, name]
;
;   x86-64: load [mem] into m<reg>; reg_<name> (if given) is defined as m<reg>.
;   x86-32: nothing is loaded; reg_<name> (if given) is defined as [mem], so
;           users of reg_<name> read the memory operand directly. With only
;           two arguments this macro emits nothing on x86-32.
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; calculate p or q portion of flat8out
;
; in:  m0 = q0 (or p0), m4-7 = q4-q7 (or the p equivalents), reg_F = threshold
; out: m7 = OR of the four "abs(qN-q0) > F" word masks (i.e. !flat8out)
; m4-6 are overwritten; m2/m3 are handed to ABS2 as its extra operands
; (presumably temporaries - ABS2 is defined outside this file).
%macro FLAT8OUT_HALF 0
    psubw               m4, m0                  ; q4-q0
    psubw               m5, m0                  ; q5-q0
    psubw               m6, m0                  ; q6-q0
    psubw               m7, m0                  ; q7-q0
    ABS2                m4, m5, m2, m3          ; abs(q4-q0) | abs(q5-q0)
    ABS2                m6, m7, m2, m3          ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw             m4, reg_F               ; abs(q4-q0) > F
    pcmpgtw             m5, reg_F               ; abs(q5-q0) > F
    pcmpgtw             m6, reg_F               ; abs(q6-q0) > F
    pcmpgtw             m7, reg_F               ; abs(q7-q0) > F
    por                 m5, m4
    por                 m7, m6
    por                 m7, m5                  ; !flat8out, q portion
%endmacro

; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
;
; %1 = wd; the flat8in terms (comparisons against reg_F) are only emitted
;      when wd > 4.
; in:  m0-3 = q0-q3 (or p0-p3), reg_I / reg_H (and reg_F when wd > 4)
; out: m2 = abs(q3-q2) > I | abs(q2-q1) > I | abs(q1-q0) > I
;      m7 = abs(q1-q0) > H
;      m4 = abs(q3-q0) > F | abs(q2-q0) > F | abs(q1-q0) > F   (wd > 4 only)
; m0 and m1 are only read; m3, m5 and m6 are overwritten.
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw               m4, m3, m0              ; q3-q0
    psubw               m5, m2, m0              ; q2-q0
    ABS2                m4, m5, m6, m7          ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw             m4, reg_F               ; abs(q3-q0) > F
    pcmpgtw             m5, reg_F               ; abs(q2-q0) > F
%endif
    psubw               m3, m2                  ; q3-q2
    psubw               m2, m1                  ; q2-q1
    ABS2                m3, m2, m6, m7          ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw             m3, reg_I               ; abs(q3-q2) > I
    pcmpgtw             m2, reg_I               ; abs(q2-q1) > I
%if %1 > 4
    por                 m4, m5
%endif
    por                 m2, m3
    psubw               m3, m1, m0              ; q1-q0
    ABS1                m3, m5                  ; abs(q1-q0)
%if %1 > 4
    pcmpgtw             m6, m3, reg_F           ; abs(q1-q0) > F
%endif
    pcmpgtw             m7, m3, reg_H           ; abs(q1-q0) > H
    pcmpgtw             m3, reg_I               ; abs(q1-q0) > I
%if %1 > 4
    por                 m4, m6
%endif
    por                 m2, m3
%endmacro

; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
;
; parameters:
;   %1  tmp         scratch register, receives the filtered value
;   %2  reg         running sum
;   %3  mask        name suffix; the mask operand used is reg_<mask>
;   %4  shift       right-shift count applied to the sum
;   %5  dst         address written when dont_store != 1
;   %6  src/sub1    unfiltered pixel; also subtracted from the running sum
;   %7  sub2        second term subtracted from the running sum (optional)
;   %8  add1        first term added to the running sum (optional)
;   %9  add2        second term added to the running sum (optional)
;   %10 dont_store  1 = accumulate the result into %6 instead of storing;
;                   %6 is then the destination of paddw, so it has to be a
;                   register
%macro FILTER_STEP 6-10 "", "", "", 0
    psrlw               %1, %2, %4
    psubw               %1, %6                  ; abs->delta
%ifnidn %7, ""
    ; roll the running sum forward before %6 can be modified below
    psubw               %2, %6
    psubw               %2, %7
    paddw               %2, %8
    paddw               %2, %9
%endif
    pand                %1, reg_%3              ; apply mask
%if %10 == 1
    paddw               %6, %1                  ; delta->abs
%else
    paddw               %1, %6                  ; delta->abs
    mova              [%5], %1
%endif
%endmacro

; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]

%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
177cabdff1aSopenharmony_ci%assign %%bak_mem 5 178cabdff1aSopenharmony_ci%endif ; %2 179cabdff1aSopenharmony_ci%endif ; ARCH_X86_64/32 180cabdff1aSopenharmony_ci 181cabdff1aSopenharmony_ci%if %2 == 16 182cabdff1aSopenharmony_ci%ifidn %1, v 183cabdff1aSopenharmony_ci%assign %%num_gpr_regs 6 184cabdff1aSopenharmony_ci%else ; %1 == h 185cabdff1aSopenharmony_ci%assign %%num_gpr_regs 5 186cabdff1aSopenharmony_ci%endif ; %1 187cabdff1aSopenharmony_ci%assign %%wd_mem 6 188cabdff1aSopenharmony_ci%else ; %2 == 8/4 189cabdff1aSopenharmony_ci%assign %%num_gpr_regs 5 190cabdff1aSopenharmony_ci%if ARCH_X86_32 && %2 == 8 191cabdff1aSopenharmony_ci%assign %%wd_mem 2 192cabdff1aSopenharmony_ci%else ; ARCH_X86_64 || %2 == 4 193cabdff1aSopenharmony_ci%assign %%wd_mem 0 194cabdff1aSopenharmony_ci%endif ; ARCH_X86_64/32 etc. 195cabdff1aSopenharmony_ci%endif ; %2 196cabdff1aSopenharmony_ci 197cabdff1aSopenharmony_ci%ifidn %1, v 198cabdff1aSopenharmony_ci%assign %%tsp_mem 0 199cabdff1aSopenharmony_ci%elif %2 == 16 ; && %1 == h 200cabdff1aSopenharmony_ci%assign %%tsp_mem 16 201cabdff1aSopenharmony_ci%else ; %1 == h && %1 == 8/4 202cabdff1aSopenharmony_ci%assign %%tsp_mem 8 203cabdff1aSopenharmony_ci%endif ; %1/%2 204cabdff1aSopenharmony_ci 205cabdff1aSopenharmony_ci%assign %%off %%wd_mem 206cabdff1aSopenharmony_ci%assign %%tspoff %%bak_mem+%%wd_mem 207cabdff1aSopenharmony_ci%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize) 208cabdff1aSopenharmony_ci 209cabdff1aSopenharmony_ci%if %3 == 10 210cabdff1aSopenharmony_ci%define %%maxsgn 511 211cabdff1aSopenharmony_ci%define %%minsgn m512 212cabdff1aSopenharmony_ci%define %%maxusgn 1023 213cabdff1aSopenharmony_ci%define %%maxf 4 214cabdff1aSopenharmony_ci%else ; %3 == 12 215cabdff1aSopenharmony_ci%define %%maxsgn 2047 216cabdff1aSopenharmony_ci%define %%minsgn m2048 217cabdff1aSopenharmony_ci%define %%maxusgn 4095 218cabdff1aSopenharmony_ci%define %%maxf 16 219cabdff1aSopenharmony_ci%endif ; %3 220cabdff1aSopenharmony_ci 
221cabdff1aSopenharmony_cicglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H 222cabdff1aSopenharmony_ci ; prepare E, I and H masks 223cabdff1aSopenharmony_ci shl Ed, %3-8 224cabdff1aSopenharmony_ci shl Id, %3-8 225cabdff1aSopenharmony_ci shl Hd, %3-8 226cabdff1aSopenharmony_ci%if cpuflag(ssse3) 227cabdff1aSopenharmony_ci mova m0, [pw_256] 228cabdff1aSopenharmony_ci%endif 229cabdff1aSopenharmony_ci movd m1, Ed 230cabdff1aSopenharmony_ci movd m2, Id 231cabdff1aSopenharmony_ci movd m3, Hd 232cabdff1aSopenharmony_ci%if cpuflag(ssse3) 233cabdff1aSopenharmony_ci pshufb m1, m0 ; E << (bit_depth - 8) 234cabdff1aSopenharmony_ci pshufb m2, m0 ; I << (bit_depth - 8) 235cabdff1aSopenharmony_ci pshufb m3, m0 ; H << (bit_depth - 8) 236cabdff1aSopenharmony_ci%else 237cabdff1aSopenharmony_ci punpcklwd m1, m1 238cabdff1aSopenharmony_ci punpcklwd m2, m2 239cabdff1aSopenharmony_ci punpcklwd m3, m3 240cabdff1aSopenharmony_ci pshufd m1, m1, q0000 241cabdff1aSopenharmony_ci pshufd m2, m2, q0000 242cabdff1aSopenharmony_ci pshufd m3, m3, q0000 243cabdff1aSopenharmony_ci%endif 244cabdff1aSopenharmony_ci SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E 245cabdff1aSopenharmony_ci SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I 246cabdff1aSopenharmony_ci SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H 247cabdff1aSopenharmony_ci%if %2 > 4 248cabdff1aSopenharmony_ci PRELOAD 11, pw_ %+ %%maxf, F 249cabdff1aSopenharmony_ci%endif 250cabdff1aSopenharmony_ci 251cabdff1aSopenharmony_ci ; set up variables to load data 252cabdff1aSopenharmony_ci%ifidn %1, v 253cabdff1aSopenharmony_ci DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12 254cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 255cabdff1aSopenharmony_ci neg strideq 256cabdff1aSopenharmony_ci%if %2 == 16 257cabdff1aSopenharmony_ci lea dst0q, [dst8q+strideq*8] 258cabdff1aSopenharmony_ci%else 259cabdff1aSopenharmony_ci lea dst4q, [dst8q+strideq*4] 260cabdff1aSopenharmony_ci%endif 261cabdff1aSopenharmony_ci neg strideq 
262cabdff1aSopenharmony_ci%if %2 == 16 263cabdff1aSopenharmony_ci lea dst12q, [dst8q+strideq*4] 264cabdff1aSopenharmony_ci lea dst4q, [dst0q+strideq*4] 265cabdff1aSopenharmony_ci%endif 266cabdff1aSopenharmony_ci 267cabdff1aSopenharmony_ci%if %2 == 16 268cabdff1aSopenharmony_ci%define %%p7 dst0q 269cabdff1aSopenharmony_ci%define %%p6 dst0q+strideq 270cabdff1aSopenharmony_ci%define %%p5 dst0q+strideq*2 271cabdff1aSopenharmony_ci%define %%p4 dst0q+stride3q 272cabdff1aSopenharmony_ci%endif 273cabdff1aSopenharmony_ci%define %%p3 dst4q 274cabdff1aSopenharmony_ci%define %%p2 dst4q+strideq 275cabdff1aSopenharmony_ci%define %%p1 dst4q+strideq*2 276cabdff1aSopenharmony_ci%define %%p0 dst4q+stride3q 277cabdff1aSopenharmony_ci%define %%q0 dst8q 278cabdff1aSopenharmony_ci%define %%q1 dst8q+strideq 279cabdff1aSopenharmony_ci%define %%q2 dst8q+strideq*2 280cabdff1aSopenharmony_ci%define %%q3 dst8q+stride3q 281cabdff1aSopenharmony_ci%if %2 == 16 282cabdff1aSopenharmony_ci%define %%q4 dst12q 283cabdff1aSopenharmony_ci%define %%q5 dst12q+strideq 284cabdff1aSopenharmony_ci%define %%q6 dst12q+strideq*2 285cabdff1aSopenharmony_ci%define %%q7 dst12q+stride3q 286cabdff1aSopenharmony_ci%endif 287cabdff1aSopenharmony_ci%else ; %1 == h 288cabdff1aSopenharmony_ci DEFINE_ARGS dst0, stride, stride3, dst4 289cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 290cabdff1aSopenharmony_ci lea dst4q, [dst0q+strideq*4] 291cabdff1aSopenharmony_ci 292cabdff1aSopenharmony_ci%define %%p3 rsp+(%%tspoff+0)*mmsize 293cabdff1aSopenharmony_ci%define %%p2 rsp+(%%tspoff+1)*mmsize 294cabdff1aSopenharmony_ci%define %%p1 rsp+(%%tspoff+2)*mmsize 295cabdff1aSopenharmony_ci%define %%p0 rsp+(%%tspoff+3)*mmsize 296cabdff1aSopenharmony_ci%define %%q0 rsp+(%%tspoff+4)*mmsize 297cabdff1aSopenharmony_ci%define %%q1 rsp+(%%tspoff+5)*mmsize 298cabdff1aSopenharmony_ci%define %%q2 rsp+(%%tspoff+6)*mmsize 299cabdff1aSopenharmony_ci%define %%q3 rsp+(%%tspoff+7)*mmsize 300cabdff1aSopenharmony_ci 301cabdff1aSopenharmony_ci%if %2 < 
16 302cabdff1aSopenharmony_ci movu m0, [dst0q+strideq*0-8] 303cabdff1aSopenharmony_ci movu m1, [dst0q+strideq*1-8] 304cabdff1aSopenharmony_ci movu m2, [dst0q+strideq*2-8] 305cabdff1aSopenharmony_ci movu m3, [dst0q+stride3q -8] 306cabdff1aSopenharmony_ci movu m4, [dst4q+strideq*0-8] 307cabdff1aSopenharmony_ci movu m5, [dst4q+strideq*1-8] 308cabdff1aSopenharmony_ci movu m6, [dst4q+strideq*2-8] 309cabdff1aSopenharmony_ci movu m7, [dst4q+stride3q -8] 310cabdff1aSopenharmony_ci 311cabdff1aSopenharmony_ci%if ARCH_X86_64 312cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 313cabdff1aSopenharmony_ci%else 314cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0] 315cabdff1aSopenharmony_ci%endif 316cabdff1aSopenharmony_ci 317cabdff1aSopenharmony_ci mova [%%p3], m0 318cabdff1aSopenharmony_ci mova [%%p2], m1 319cabdff1aSopenharmony_ci mova [%%p1], m2 320cabdff1aSopenharmony_ci mova [%%p0], m3 321cabdff1aSopenharmony_ci%if ARCH_X86_64 322cabdff1aSopenharmony_ci mova [%%q0], m4 323cabdff1aSopenharmony_ci%endif 324cabdff1aSopenharmony_ci mova [%%q1], m5 325cabdff1aSopenharmony_ci mova [%%q2], m6 326cabdff1aSopenharmony_ci mova [%%q3], m7 327cabdff1aSopenharmony_ci 328cabdff1aSopenharmony_ci ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register 329cabdff1aSopenharmony_ci ; order here accordingly 330cabdff1aSopenharmony_ci%else ; %2 == 16 331cabdff1aSopenharmony_ci 332cabdff1aSopenharmony_ci%define %%p7 rsp+(%%tspoff+ 8)*mmsize 333cabdff1aSopenharmony_ci%define %%p6 rsp+(%%tspoff+ 9)*mmsize 334cabdff1aSopenharmony_ci%define %%p5 rsp+(%%tspoff+10)*mmsize 335cabdff1aSopenharmony_ci%define %%p4 rsp+(%%tspoff+11)*mmsize 336cabdff1aSopenharmony_ci%define %%q4 rsp+(%%tspoff+12)*mmsize 337cabdff1aSopenharmony_ci%define %%q5 rsp+(%%tspoff+13)*mmsize 338cabdff1aSopenharmony_ci%define %%q6 rsp+(%%tspoff+14)*mmsize 339cabdff1aSopenharmony_ci%define %%q7 rsp+(%%tspoff+15)*mmsize 340cabdff1aSopenharmony_ci 341cabdff1aSopenharmony_ci 
mova m0, [dst0q+strideq*0-16] 342cabdff1aSopenharmony_ci mova m1, [dst0q+strideq*1-16] 343cabdff1aSopenharmony_ci mova m2, [dst0q+strideq*2-16] 344cabdff1aSopenharmony_ci mova m3, [dst0q+stride3q -16] 345cabdff1aSopenharmony_ci mova m4, [dst4q+strideq*0-16] 346cabdff1aSopenharmony_ci mova m5, [dst4q+strideq*1-16] 347cabdff1aSopenharmony_ci%if ARCH_X86_64 348cabdff1aSopenharmony_ci mova m6, [dst4q+strideq*2-16] 349cabdff1aSopenharmony_ci%endif 350cabdff1aSopenharmony_ci mova m7, [dst4q+stride3q -16] 351cabdff1aSopenharmony_ci 352cabdff1aSopenharmony_ci%if ARCH_X86_64 353cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 354cabdff1aSopenharmony_ci%else 355cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1 356cabdff1aSopenharmony_ci%endif 357cabdff1aSopenharmony_ci 358cabdff1aSopenharmony_ci mova [%%p7], m0 359cabdff1aSopenharmony_ci mova [%%p6], m1 360cabdff1aSopenharmony_ci mova [%%p5], m2 361cabdff1aSopenharmony_ci mova [%%p4], m3 362cabdff1aSopenharmony_ci%if ARCH_X86_64 363cabdff1aSopenharmony_ci mova [%%p3], m4 364cabdff1aSopenharmony_ci%endif 365cabdff1aSopenharmony_ci mova [%%p2], m5 366cabdff1aSopenharmony_ci mova [%%p1], m6 367cabdff1aSopenharmony_ci mova [%%p0], m7 368cabdff1aSopenharmony_ci 369cabdff1aSopenharmony_ci mova m0, [dst0q+strideq*0] 370cabdff1aSopenharmony_ci mova m1, [dst0q+strideq*1] 371cabdff1aSopenharmony_ci mova m2, [dst0q+strideq*2] 372cabdff1aSopenharmony_ci mova m3, [dst0q+stride3q ] 373cabdff1aSopenharmony_ci mova m4, [dst4q+strideq*0] 374cabdff1aSopenharmony_ci mova m5, [dst4q+strideq*1] 375cabdff1aSopenharmony_ci%if ARCH_X86_64 376cabdff1aSopenharmony_ci mova m6, [dst4q+strideq*2] 377cabdff1aSopenharmony_ci%endif 378cabdff1aSopenharmony_ci mova m7, [dst4q+stride3q ] 379cabdff1aSopenharmony_ci 380cabdff1aSopenharmony_ci%if ARCH_X86_64 381cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 382cabdff1aSopenharmony_ci%else 383cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 
2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1 384cabdff1aSopenharmony_ci%endif 385cabdff1aSopenharmony_ci 386cabdff1aSopenharmony_ci mova [%%q0], m0 387cabdff1aSopenharmony_ci mova [%%q1], m1 388cabdff1aSopenharmony_ci mova [%%q2], m2 389cabdff1aSopenharmony_ci mova [%%q3], m3 390cabdff1aSopenharmony_ci%if ARCH_X86_64 391cabdff1aSopenharmony_ci mova [%%q4], m4 392cabdff1aSopenharmony_ci%endif 393cabdff1aSopenharmony_ci mova [%%q5], m5 394cabdff1aSopenharmony_ci mova [%%q6], m6 395cabdff1aSopenharmony_ci mova [%%q7], m7 396cabdff1aSopenharmony_ci 397cabdff1aSopenharmony_ci ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register 398cabdff1aSopenharmony_ci ; order here accordingly 399cabdff1aSopenharmony_ci%endif ; %2 400cabdff1aSopenharmony_ci%endif ; %1 401cabdff1aSopenharmony_ci 402cabdff1aSopenharmony_ci ; load q0|q4-7 data 403cabdff1aSopenharmony_ci mova m0, [%%q0] 404cabdff1aSopenharmony_ci%if %2 == 16 405cabdff1aSopenharmony_ci mova m4, [%%q4] 406cabdff1aSopenharmony_ci mova m5, [%%q5] 407cabdff1aSopenharmony_ci mova m6, [%%q6] 408cabdff1aSopenharmony_ci mova m7, [%%q7] 409cabdff1aSopenharmony_ci 410cabdff1aSopenharmony_ci ; flat8out q portion 411cabdff1aSopenharmony_ci FLAT8OUT_HALF 412cabdff1aSopenharmony_ci SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O 413cabdff1aSopenharmony_ci%endif 414cabdff1aSopenharmony_ci 415cabdff1aSopenharmony_ci ; load q1-3 data 416cabdff1aSopenharmony_ci mova m1, [%%q1] 417cabdff1aSopenharmony_ci mova m2, [%%q2] 418cabdff1aSopenharmony_ci mova m3, [%%q3] 419cabdff1aSopenharmony_ci 420cabdff1aSopenharmony_ci ; r6-8|pw_4[m8-11]=reg_E/I/H/F 421cabdff1aSopenharmony_ci ; r9[m15]=!flatout[q] 422cabdff1aSopenharmony_ci ; m12-14=free 423cabdff1aSopenharmony_ci ; m0-3=q0-q3 424cabdff1aSopenharmony_ci ; m4-7=free 425cabdff1aSopenharmony_ci 426cabdff1aSopenharmony_ci ; flat8in|fm|hev q portion 427cabdff1aSopenharmony_ci FLAT8IN_HALF %2 428cabdff1aSopenharmony_ci SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV 
429cabdff1aSopenharmony_ci%if %2 > 4 430cabdff1aSopenharmony_ci SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I 431cabdff1aSopenharmony_ci%endif 432cabdff1aSopenharmony_ci 433cabdff1aSopenharmony_ci ; r6-8|pw_4[m8-11]=reg_E/I/H/F 434cabdff1aSopenharmony_ci ; r9[m15]=!flat8out[q] 435cabdff1aSopenharmony_ci ; r10[m13]=hev[q] 436cabdff1aSopenharmony_ci ; r11[m14]=!flat8in[q] 437cabdff1aSopenharmony_ci ; m2=!fm[q] 438cabdff1aSopenharmony_ci ; m0,1=q0-q1 439cabdff1aSopenharmony_ci ; m2-7=free 440cabdff1aSopenharmony_ci ; m12=free 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci ; load p0-1 443cabdff1aSopenharmony_ci mova m3, [%%p0] 444cabdff1aSopenharmony_ci mova m4, [%%p1] 445cabdff1aSopenharmony_ci 446cabdff1aSopenharmony_ci ; fm mb_edge portion 447cabdff1aSopenharmony_ci psubw m5, m3, m0 ; q0-p0 448cabdff1aSopenharmony_ci psubw m6, m4, m1 ; q1-p1 449cabdff1aSopenharmony_ci%if ARCH_X86_64 450cabdff1aSopenharmony_ci ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1) 451cabdff1aSopenharmony_ci%else 452cabdff1aSopenharmony_ci ABS1 m5, m7 ; abs(q0-p0) 453cabdff1aSopenharmony_ci ABS1 m6, m7 ; abs(q1-p1) 454cabdff1aSopenharmony_ci%endif 455cabdff1aSopenharmony_ci paddw m5, m5 456cabdff1aSopenharmony_ci psraw m6, 1 457cabdff1aSopenharmony_ci paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1) 458cabdff1aSopenharmony_ci pcmpgtw m6, reg_E 459cabdff1aSopenharmony_ci por m2, m6 460cabdff1aSopenharmony_ci SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM 461cabdff1aSopenharmony_ci 462cabdff1aSopenharmony_ci ; r6-8|pw_4[m8-11]=reg_E/I/H/F 463cabdff1aSopenharmony_ci ; r9[m15]=!flat8out[q] 464cabdff1aSopenharmony_ci ; r10[m13]=hev[q] 465cabdff1aSopenharmony_ci ; r11[m14]=!flat8in[q] 466cabdff1aSopenharmony_ci ; r12[m12]=!fm[q] 467cabdff1aSopenharmony_ci ; m3-4=q0-1 468cabdff1aSopenharmony_ci ; m0-2/5-7=free 469cabdff1aSopenharmony_ci 470cabdff1aSopenharmony_ci ; load p4-7 data 471cabdff1aSopenharmony_ci SWAP 3, 0 ; p0 472cabdff1aSopenharmony_ci SWAP 4, 1 ; p1 473cabdff1aSopenharmony_ci%if %2 == 16 
474cabdff1aSopenharmony_ci mova m7, [%%p7] 475cabdff1aSopenharmony_ci mova m6, [%%p6] 476cabdff1aSopenharmony_ci mova m5, [%%p5] 477cabdff1aSopenharmony_ci mova m4, [%%p4] 478cabdff1aSopenharmony_ci 479cabdff1aSopenharmony_ci ; flat8out p portion 480cabdff1aSopenharmony_ci FLAT8OUT_HALF 481cabdff1aSopenharmony_ci por m7, reg_F8O 482cabdff1aSopenharmony_ci SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O 483cabdff1aSopenharmony_ci%endif 484cabdff1aSopenharmony_ci 485cabdff1aSopenharmony_ci ; r6-8|pw_4[m8-11]=reg_E/I/H/F 486cabdff1aSopenharmony_ci ; r9[m15]=!flat8out 487cabdff1aSopenharmony_ci ; r10[m13]=hev[q] 488cabdff1aSopenharmony_ci ; r11[m14]=!flat8in[q] 489cabdff1aSopenharmony_ci ; r12[m12]=!fm[q] 490cabdff1aSopenharmony_ci ; m0=p0 491cabdff1aSopenharmony_ci ; m1-7=free 492cabdff1aSopenharmony_ci 493cabdff1aSopenharmony_ci ; load p2-3 data 494cabdff1aSopenharmony_ci mova m2, [%%p2] 495cabdff1aSopenharmony_ci mova m3, [%%p3] 496cabdff1aSopenharmony_ci 497cabdff1aSopenharmony_ci ; flat8in|fm|hev p portion 498cabdff1aSopenharmony_ci FLAT8IN_HALF %2 499cabdff1aSopenharmony_ci por m7, reg_HEV 500cabdff1aSopenharmony_ci%if %2 > 4 501cabdff1aSopenharmony_ci por m4, reg_F8I 502cabdff1aSopenharmony_ci%endif 503cabdff1aSopenharmony_ci por m2, reg_FM 504cabdff1aSopenharmony_ci%if %2 > 4 505cabdff1aSopenharmony_ci por m4, m2 ; !flat8|!fm 506cabdff1aSopenharmony_ci%if %2 == 16 507cabdff1aSopenharmony_ci por m5, m4, reg_F8O ; !flat16|!fm 508cabdff1aSopenharmony_ci pandn m2, m4 ; filter4_mask 509cabdff1aSopenharmony_ci pandn m4, m5 ; filter8_mask 510cabdff1aSopenharmony_ci pxor m5, [pw_m1] ; filter16_mask 511cabdff1aSopenharmony_ci SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M 512cabdff1aSopenharmony_ci%else 513cabdff1aSopenharmony_ci pandn m2, m4 ; filter4_mask 514cabdff1aSopenharmony_ci pxor m4, [pw_m1] ; filter8_mask 515cabdff1aSopenharmony_ci%endif 516cabdff1aSopenharmony_ci SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M 517cabdff1aSopenharmony_ci%else 518cabdff1aSopenharmony_ci pxor m2, 
[pw_m1] ; filter4_mask 519cabdff1aSopenharmony_ci%endif 520cabdff1aSopenharmony_ci SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV 521cabdff1aSopenharmony_ci SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M 522cabdff1aSopenharmony_ci 523cabdff1aSopenharmony_ci ; r9[m15]=filter16_mask 524cabdff1aSopenharmony_ci ; r10[m13]=hev 525cabdff1aSopenharmony_ci ; r11[m14]=filter8_mask 526cabdff1aSopenharmony_ci ; r12[m12]=filter4_mask 527cabdff1aSopenharmony_ci ; m0,1=p0-p1 528cabdff1aSopenharmony_ci ; m2-7=free 529cabdff1aSopenharmony_ci ; m8-11=free 530cabdff1aSopenharmony_ci 531cabdff1aSopenharmony_ci%if %2 > 4 532cabdff1aSopenharmony_ci%if %2 == 16 533cabdff1aSopenharmony_ci ; filter_14 534cabdff1aSopenharmony_ci mova m2, [%%p7] 535cabdff1aSopenharmony_ci mova m3, [%%p6] 536cabdff1aSopenharmony_ci mova m6, [%%p5] 537cabdff1aSopenharmony_ci mova m7, [%%p4] 538cabdff1aSopenharmony_ci PRELOAD 8, %%p3, P3 539cabdff1aSopenharmony_ci PRELOAD 9, %%p2, P2 540cabdff1aSopenharmony_ci%endif 541cabdff1aSopenharmony_ci PRELOAD 10, %%q0, Q0 542cabdff1aSopenharmony_ci PRELOAD 11, %%q1, Q1 543cabdff1aSopenharmony_ci%if %2 == 16 544cabdff1aSopenharmony_ci psllw m4, m2, 3 545cabdff1aSopenharmony_ci paddw m5, m3, m3 546cabdff1aSopenharmony_ci paddw m4, m6 547cabdff1aSopenharmony_ci paddw m5, m7 548cabdff1aSopenharmony_ci paddw m4, reg_P3 549cabdff1aSopenharmony_ci paddw m5, reg_P2 550cabdff1aSopenharmony_ci paddw m4, m1 551cabdff1aSopenharmony_ci paddw m5, m0 552cabdff1aSopenharmony_ci paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8 553cabdff1aSopenharmony_ci psubw m5, m2 ; p0+p2+p4+p6*2-p7 554cabdff1aSopenharmony_ci paddw m4, [pw_8] 555cabdff1aSopenharmony_ci paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8 556cabdff1aSopenharmony_ci 557cabdff1aSopenharmony_ci ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction 558cabdff1aSopenharmony_ci ; at the end of the filter 559cabdff1aSopenharmony_ci 560cabdff1aSopenharmony_ci mova [rsp+0*mmsize], m3 561cabdff1aSopenharmony_ci FILTER_STEP m4, m5, 
F16M, 4, %%p6, m3, m2, m6, reg_Q1 562cabdff1aSopenharmony_ci%endif 563cabdff1aSopenharmony_ci mova m3, [%%q2] 564cabdff1aSopenharmony_ci%if %2 == 16 565cabdff1aSopenharmony_ci mova [rsp+1*mmsize], m6 566cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3 567cabdff1aSopenharmony_ci%endif 568cabdff1aSopenharmony_ci mova m6, [%%q3] 569cabdff1aSopenharmony_ci%if %2 == 16 570cabdff1aSopenharmony_ci mova [rsp+2*mmsize], m7 571cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6 572cabdff1aSopenharmony_ci mova m7, [%%q4] 573cabdff1aSopenharmony_ci%if ARCH_X86_64 574cabdff1aSopenharmony_ci mova [rsp+3*mmsize], reg_P3 575cabdff1aSopenharmony_ci%else 576cabdff1aSopenharmony_ci mova m4, reg_P3 577cabdff1aSopenharmony_ci mova [rsp+3*mmsize], m4 578cabdff1aSopenharmony_ci%endif 579cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7 580cabdff1aSopenharmony_ci PRELOAD 8, %%q5, Q5 581cabdff1aSopenharmony_ci%if ARCH_X86_64 582cabdff1aSopenharmony_ci mova [rsp+4*mmsize], reg_P2 583cabdff1aSopenharmony_ci%else 584cabdff1aSopenharmony_ci mova m4, reg_P2 585cabdff1aSopenharmony_ci mova [rsp+4*mmsize], m4 586cabdff1aSopenharmony_ci%endif 587cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5 588cabdff1aSopenharmony_ci PRELOAD 9, %%q6, Q6 589cabdff1aSopenharmony_ci mova [rsp+5*mmsize], m1 590cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6 591cabdff1aSopenharmony_ci mova m1, [%%q7] 592cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1 593cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64 594cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64 595cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1 596cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1 
597cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1 598cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1 599cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6 600cabdff1aSopenharmony_ci 601cabdff1aSopenharmony_ci mova m7, [%%p1] 602cabdff1aSopenharmony_ci%else 603cabdff1aSopenharmony_ci SWAP 1, 7 604cabdff1aSopenharmony_ci%endif 605cabdff1aSopenharmony_ci 606cabdff1aSopenharmony_ci mova m2, [%%p3] 607cabdff1aSopenharmony_ci mova m1, [%%p2] 608cabdff1aSopenharmony_ci 609cabdff1aSopenharmony_ci ; reg_Q0-1 (m10-m11) 610cabdff1aSopenharmony_ci ; m0=p0 611cabdff1aSopenharmony_ci ; m1=p2 612cabdff1aSopenharmony_ci ; m2=p3 613cabdff1aSopenharmony_ci ; m3=q2 614cabdff1aSopenharmony_ci ; m4-5=free 615cabdff1aSopenharmony_ci ; m6=q3 616cabdff1aSopenharmony_ci ; m7=p1 617cabdff1aSopenharmony_ci ; m8-9 unused 618cabdff1aSopenharmony_ci 619cabdff1aSopenharmony_ci ; filter_6 620cabdff1aSopenharmony_ci psllw m4, m2, 2 621cabdff1aSopenharmony_ci paddw m5, m1, m1 622cabdff1aSopenharmony_ci paddw m4, m7 623cabdff1aSopenharmony_ci psubw m5, m2 624cabdff1aSopenharmony_ci paddw m4, m0 625cabdff1aSopenharmony_ci paddw m5, reg_Q0 626cabdff1aSopenharmony_ci paddw m4, [pw_4] 627cabdff1aSopenharmony_ci paddw m5, m4 628cabdff1aSopenharmony_ci 629cabdff1aSopenharmony_ci%if ARCH_X86_64 630cabdff1aSopenharmony_ci mova m8, m1 631cabdff1aSopenharmony_ci mova m9, m7 632cabdff1aSopenharmony_ci%else 633cabdff1aSopenharmony_ci mova [rsp+0*mmsize], m1 634cabdff1aSopenharmony_ci mova [rsp+1*mmsize], m7 635cabdff1aSopenharmony_ci%endif 636cabdff1aSopenharmony_ci%ifidn %1, v 637cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1 638cabdff1aSopenharmony_ci%else 639cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1 640cabdff1aSopenharmony_ci%endif 641cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1 642cabdff1aSopenharmony_ci 
FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1 643cabdff1aSopenharmony_ci%if ARCH_X86_64 644cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64 645cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64 646cabdff1aSopenharmony_ci%else 647cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64 648cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64 649cabdff1aSopenharmony_ci%endif 650cabdff1aSopenharmony_ci FILTER_STEP m4, m5, F8M, 3, %%q2, m3 651cabdff1aSopenharmony_ci 652cabdff1aSopenharmony_ci UNSCRATCH 2, 10, %%q0 653cabdff1aSopenharmony_ci UNSCRATCH 6, 11, %%q1 654cabdff1aSopenharmony_ci%else 655cabdff1aSopenharmony_ci SWAP 1, 7 656cabdff1aSopenharmony_ci mova m2, [%%q0] 657cabdff1aSopenharmony_ci mova m6, [%%q1] 658cabdff1aSopenharmony_ci%endif 659cabdff1aSopenharmony_ci UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV 660cabdff1aSopenharmony_ci 661cabdff1aSopenharmony_ci ; m0=p0 662cabdff1aSopenharmony_ci ; m1=p2 663cabdff1aSopenharmony_ci ; m2=q0 664cabdff1aSopenharmony_ci ; m3=hev_mask 665cabdff1aSopenharmony_ci ; m4-5=free 666cabdff1aSopenharmony_ci ; m6=q1 667cabdff1aSopenharmony_ci ; m7=p1 668cabdff1aSopenharmony_ci 669cabdff1aSopenharmony_ci ; filter_4 670cabdff1aSopenharmony_ci psubw m4, m7, m6 ; p1-q1 671cabdff1aSopenharmony_ci psubw m5, m2, m0 ; q0-p0 672cabdff1aSopenharmony_ci pand m4, m3 673cabdff1aSopenharmony_ci pminsw m4, [pw_ %+ %%maxsgn] 674cabdff1aSopenharmony_ci pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f 675cabdff1aSopenharmony_ci paddw m4, m5 676cabdff1aSopenharmony_ci paddw m5, m5 677cabdff1aSopenharmony_ci paddw m4, m5 ; 3*(q0-p0)+f 678cabdff1aSopenharmony_ci pminsw m4, [pw_ %+ %%maxsgn] 679cabdff1aSopenharmony_ci pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f 680cabdff1aSopenharmony_ci pand m4, reg_F4M 681cabdff1aSopenharmony_ci 
paddw m5, m4, [pw_4] 682cabdff1aSopenharmony_ci paddw m4, [pw_3] 683cabdff1aSopenharmony_ci pminsw m5, [pw_ %+ %%maxsgn] 684cabdff1aSopenharmony_ci pminsw m4, [pw_ %+ %%maxsgn] 685cabdff1aSopenharmony_ci psraw m5, 3 ; min_intp2(f+4, 9)>>3 -> f1 686cabdff1aSopenharmony_ci psraw m4, 3 ; min_intp2(f+3, 9)>>3 -> f2 687cabdff1aSopenharmony_ci psubw m2, m5 ; q0-f1 688cabdff1aSopenharmony_ci paddw m0, m4 ; p0+f2 689cabdff1aSopenharmony_ci pandn m3, m5 ; f1 & !hev (for p1/q1 adj) 690cabdff1aSopenharmony_ci pxor m4, m4 691cabdff1aSopenharmony_ci mova m5, [pw_ %+ %%maxusgn] 692cabdff1aSopenharmony_ci pmaxsw m2, m4 693cabdff1aSopenharmony_ci pmaxsw m0, m4 694cabdff1aSopenharmony_ci pminsw m2, m5 695cabdff1aSopenharmony_ci pminsw m0, m5 696cabdff1aSopenharmony_ci%if cpuflag(ssse3) 697cabdff1aSopenharmony_ci pmulhrsw m3, [pw_16384] ; (f1+1)>>1 698cabdff1aSopenharmony_ci%else 699cabdff1aSopenharmony_ci paddw m3, [pw_1] 700cabdff1aSopenharmony_ci psraw m3, 1 701cabdff1aSopenharmony_ci%endif 702cabdff1aSopenharmony_ci paddw m7, m3 ; p1+f 703cabdff1aSopenharmony_ci psubw m6, m3 ; q1-f 704cabdff1aSopenharmony_ci pmaxsw m7, m4 705cabdff1aSopenharmony_ci pmaxsw m6, m4 706cabdff1aSopenharmony_ci pminsw m7, m5 707cabdff1aSopenharmony_ci pminsw m6, m5 708cabdff1aSopenharmony_ci 709cabdff1aSopenharmony_ci ; store 710cabdff1aSopenharmony_ci%ifidn %1, v 711cabdff1aSopenharmony_ci mova [%%p1], m7 712cabdff1aSopenharmony_ci mova [%%p0], m0 713cabdff1aSopenharmony_ci mova [%%q0], m2 714cabdff1aSopenharmony_ci mova [%%q1], m6 715cabdff1aSopenharmony_ci%else ; %1 == h 716cabdff1aSopenharmony_ci%if %2 == 4 717cabdff1aSopenharmony_ci TRANSPOSE4x4W 7, 0, 2, 6, 1 718cabdff1aSopenharmony_ci movh [dst0q+strideq*0-4], m7 719cabdff1aSopenharmony_ci movhps [dst0q+strideq*1-4], m7 720cabdff1aSopenharmony_ci movh [dst0q+strideq*2-4], m0 721cabdff1aSopenharmony_ci movhps [dst0q+stride3q -4], m0 722cabdff1aSopenharmony_ci movh [dst4q+strideq*0-4], m2 723cabdff1aSopenharmony_ci movhps [dst4q+strideq*1-4], m2 
724cabdff1aSopenharmony_ci movh [dst4q+strideq*2-4], m6 725cabdff1aSopenharmony_ci movhps [dst4q+stride3q -4], m6 726cabdff1aSopenharmony_ci%elif %2 == 8 727cabdff1aSopenharmony_ci mova m3, [%%p3] 728cabdff1aSopenharmony_ci mova m4, [%%q2] 729cabdff1aSopenharmony_ci mova m5, [%%q3] 730cabdff1aSopenharmony_ci 731cabdff1aSopenharmony_ci%if ARCH_X86_64 732cabdff1aSopenharmony_ci TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8 733cabdff1aSopenharmony_ci%else 734cabdff1aSopenharmony_ci TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1 735cabdff1aSopenharmony_ci mova m2, [%%q0] 736cabdff1aSopenharmony_ci%endif 737cabdff1aSopenharmony_ci 738cabdff1aSopenharmony_ci movu [dst0q+strideq*0-8], m3 739cabdff1aSopenharmony_ci movu [dst0q+strideq*1-8], m1 740cabdff1aSopenharmony_ci movu [dst0q+strideq*2-8], m7 741cabdff1aSopenharmony_ci movu [dst0q+stride3q -8], m0 742cabdff1aSopenharmony_ci movu [dst4q+strideq*0-8], m2 743cabdff1aSopenharmony_ci movu [dst4q+strideq*1-8], m6 744cabdff1aSopenharmony_ci movu [dst4q+strideq*2-8], m4 745cabdff1aSopenharmony_ci movu [dst4q+stride3q -8], m5 746cabdff1aSopenharmony_ci%else ; %2 == 16 747cabdff1aSopenharmony_ci SCRATCH 2, 8, %%q0 748cabdff1aSopenharmony_ci SCRATCH 6, 9, %%q1 749cabdff1aSopenharmony_ci mova m2, [%%p7] 750cabdff1aSopenharmony_ci mova m3, [%%p6] 751cabdff1aSopenharmony_ci mova m4, [%%p5] 752cabdff1aSopenharmony_ci mova m5, [%%p4] 753cabdff1aSopenharmony_ci mova m6, [%%p3] 754cabdff1aSopenharmony_ci 755cabdff1aSopenharmony_ci%if ARCH_X86_64 756cabdff1aSopenharmony_ci TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10 757cabdff1aSopenharmony_ci%else 758cabdff1aSopenharmony_ci mova [%%p1], m7 759cabdff1aSopenharmony_ci TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1 760cabdff1aSopenharmony_ci%endif 761cabdff1aSopenharmony_ci 762cabdff1aSopenharmony_ci mova [dst0q+strideq*0-16], m2 763cabdff1aSopenharmony_ci mova [dst0q+strideq*1-16], m3 764cabdff1aSopenharmony_ci mova [dst0q+strideq*2-16], m4 
; h-direction, width-16 store (continued): remaining rows produced by the
; first TRANSPOSE8x8W, written at byte offset -16 from each row pointer.
    mova    [dst0q+stride3q -16], m5
%if ARCH_X86_64
    ; x86-64 only. The x86-32 invocation of TRANSPOSE8x8W is handed this very
    ; address as an operand, so presumably the macro writes that row itself
    ; (NOTE(review): confirm against TRANSPOSE8x8W in x86util.asm).
    mova    [dst4q+strideq*0-16], m6
%endif
    mova    [dst4q+strideq*1-16], m1
    mova    [dst4q+strideq*2-16], m7
    mova    [dst4q+stride3q -16], m0

    ; Second half: bring back q0/q1 (parked by the SCRATCH pair that precedes
    ; the first transpose) and load the remaining q rows.
    UNSCRATCH 2, 8, %%q0
    UNSCRATCH 6, 9, %%q1
    mova    m0, [%%q2]
    mova    m1, [%%q3]
    mova    m3, [%%q4]
    mova    m4, [%%q5]
%if ARCH_X86_64
    ; Not loaded on x86-32: there [%%q6] is passed to TRANSPOSE8x8W as a
    ; memory operand instead of being held in m5.
    mova    m5, [%%q6]
%endif
    mova    m7, [%%q7]

%if ARCH_X86_64
    TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif

    ; Rows produced by the second transpose, written at offset 0.
    mova    [dst0q+strideq*0], m2
    mova    [dst0q+strideq*1], m6
    mova    [dst0q+strideq*2], m0
    mova    [dst0q+stride3q ], m1
%if ARCH_X86_64
    ; x86-64 only; on x86-32 this address was given to TRANSPOSE8x8W above.
    mova    [dst4q+strideq*0], m3
%endif
    mova    [dst4q+strideq*1], m4
    mova    [dst4q+strideq*2], m5
    mova    [dst4q+stride3q ], m7
%endif ; %2
%endif ; %1
    RET
%endmacro

; LOOP_FILTER_CPUSETS %1, %2, %3
; Expands LOOP_FILTER three times with the same arguments, once after each of
; INIT_XMM sse2, INIT_XMM ssse3 and INIT_XMM avx, so that one function body is
; emitted per instruction set.
;   %1, %2, %3 - forwarded unchanged to LOOP_FILTER
%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

; LOOP_FILTER_WDSETS %1, %2
; Expands LOOP_FILTER_CPUSETS for each of the three width values 4, 8 and 16.
;   %1 - passed through as the first argument (h or v at the call sites below)
;   %2 - passed through as the last argument (10 or 12 at the call sites below)
%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1,  4, %2
LOOP_FILTER_CPUSETS %1,  8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro

; Instantiate every direction/width/CPU combination for the two supported
; values of the last argument (presumably the sample bit depth -- TODO confirm
; against the LOOP_FILTER macro header).
LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12