;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM with the x86inc/x86util macro layer (cglobal, INIT_XMM/INIT_YMM,
; mN register aliases, movh/mova/movu, SBUTTERFLY, mangle, private_prefix).
;
; Every function in this file computes, per output sample,
;     clamp((sum of 8 products of 16-bit samples and 16-bit coefficients
;            + 64) >> 7, 0, max)
; where "max" is the word vector loaded into m5: [pw_1023] for the *_10 entry
; points and [pw_4095] for the *_12 entry points. Both constants are defined
; elsewhere (cextern); presumably they hold the largest 10-bit / 12-bit sample
; value in every word -- confirm against their definition.
;
; The coefficients are read as four vectors at filtery+0, +32, +64 and +96 and
; consumed by pmaddwd, so each vector supplies one pair of adjacent taps per
; 32-bit lane. NOTE(review): the table itself is built by the caller; its
; exact contents cannot be verified from this file.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Rounding bias added to each 32-bit sum before the arithmetic shift by 7.
pd_64: times 8 dd 64

cextern pw_1023
cextern pw_4095

SECTION .text

;------------------------------------------------------------------------------
; filter_h4_fn <op> [xmm_regs = 12]
;
; Emits vp9_<op>_8tap_1d_h_4_10 and vp9_<op>_8tap_1d_h_4_12, where <op> (%1)
; is "put" or "avg".
;
; cglobal arguments: dst, dstride, src, sstride, h, filtery
;   dst/dstride  destination pointer and byte stride (8 bytes stored per row)
;   src/sstride  source pointer and byte stride; each row is read from
;                src-6 bytes (3 words left) up to src+8 bytes + 8
;   h            row count; the loop body always runs at least once
;   filtery      coefficient table, see the file-level note above
;
; Register roles: m0-m4 scratch, m5 upper clamp, m6 rounding bias,
; m7-m10 coefficient vectors (x86-64 only; x86-32 reads them from memory),
; m11 zero for the lower clamp (x86-64 without SSE4).
;
; With op == avg the clamped result is combined with the existing destination
; words using pavgw before being stored.
;
; The _12 entry point only loads a different clamp into m5 and jumps to the
; .body label of the _10 function.
;------------------------------------------------------------------------------
%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; Four words are loaded at each of the eight tap positions and adjacent
    ; positions are interleaved, so one pmaddwd yields two taps for each of
    ; the four output samples.
    movh        m0, [srcq-6]
    movh        m1, [srcq-4]
    movh        m2, [srcq-2]
    movh        m3, [srcq+0]
    movh        m4, [srcq+2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    ; Only the low 8 bytes of these three loads reach punpcklwd, so movh is
    ; sufficient and avoids touching 8 more bytes past the end of the row.
    movh        m1, [srcq+4]
    movh        m3, [srcq+6]
    paddd       m0, m2
    movh        m2, [srcq+8]
    add       srcq, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    ; round, scale down and narrow to words
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    ; unsigned saturation already removes negative values
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
    ; signed pack: negative results still need the lower clamp
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg

;------------------------------------------------------------------------------
; filter_h_fn <op> [xmm_regs = 12]
;
; Emits vp9_<op>_8tap_1d_h_<px>_10 and vp9_<op>_8tap_1d_h_<px>_12 with
; px = mmsize/2 output words per row (8 for XMM, 16 for YMM).
;
; cglobal arguments: dst, dstride, src, sstride, h, filtery (as for
; filter_h4_fn). dst is read (avg) and written as one full vector using the
; aligned forms (pavgw memory operand / mova). Source rows are read with
; unaligned full-vector loads from src-6 to src+8.
;
; Even-numbered and odd-numbered output samples are accumulated separately
; (m0 and m1): a load at an even word offset puts the tap pairs of the even
; outputs into the 32-bit lanes, the load one word further those of the odd
; outputs. The two halves are re-interleaved with punpcklwd after packing.
;
; Register roles: m0/m1 accumulators, m2-m4 scratch, m5 upper clamp,
; m6 rounding bias, m7-m10 coefficients (x86-64), m11 zero (x86-64, no SSE4).
;------------------------------------------------------------------------------
%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movu        m0, [srcq-6]
    movu        m1, [srcq-4]
    movu        m2, [srcq-2]
    movu        m3, [srcq+0]
    movu        m4, [srcq+2]
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
    pmaddwd     m4, m9
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
    pmaddwd     m4, [filteryq+64]
%endif
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4
    movu        m2, [srcq+4]
    movu        m3, [srcq+6]
    movu        m4, [srcq+8]
    add       srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m9
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m2, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4
    ; round, scale down and narrow both halves to words
    paddd       m0, m6
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
    ; merge even (m0) and odd (m1) outputs back into sample order
    punpcklwd   m0, m1
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif

;------------------------------------------------------------------------------
; filter_v4_fn <op> [xmm_regs = 12]
;
; Emits vp9_<op>_8tap_1d_v_4_10 and vp9_<op>_8tap_1d_v_4_12: the vertical
; counterpart of filter_h4_fn (8 bytes loaded per source row, 8 bytes stored
; per output row).
;
; cglobal arguments: dst, dstride, src, sstride, h, filtery; src4 and sstride3
; are extra general-purpose registers used as temporaries.
;
; x86-64: all six arguments live in registers.
; x86-32: only the first four are loaded by cglobal; filtery is fetched from
;         its argument slot (r5mp) and h is decremented in place in its
;         argument slot (hd is defined as r4mp).
;
; Addressing set up in .body:
;   sstride3 = 3 * sstride
;   src4     = src + sstride      (rows +1 .. +4 of the current output row)
;   src     -= 3 * sstride        (rows -3 ..  0 of the current output row)
; Both pointers advance by one row per iteration.
;------------------------------------------------------------------------------
%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    ; interleave vertically adjacent rows so pmaddwd applies two taps at once
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+ 32]
%endif
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m3, [filteryq+ 96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    ; round, scale down and narrow to words
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

;------------------------------------------------------------------------------
; filter_v_fn <op> [xmm_regs = 13]
;
; Emits vp9_<op>_8tap_1d_v_<px>_10 and vp9_<op>_8tap_1d_v_<px>_12 with
; px = mmsize/2 output words per row (8 for XMM, 16 for YMM).
;
; Arguments, x86-32 handling and the src/src4/sstride3 addressing are the same
; as in filter_v4_fn. Source rows are read with unaligned full-vector loads;
; dst is read (avg) and written with the aligned forms.
;
; m6 is passed to SBUTTERFLY as its temporary register, so unlike the other
; macros the rounding bias cannot stay in m6: on x86-64 it is held in m11 and
; the zero vector moves to m12 (hence 13 vector registers); on x86-32 the
; bias is added straight from memory.
;
; SBUTTERFLY wd, a, b, tmp comes from x86util.asm; it is used here to turn two
; rows into their low-half and high-half word interleaves (left in a and b),
; giving one accumulator per half: m0 and m1.
;------------------------------------------------------------------------------
%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m12, m12
%endif
%if ARCH_X86_64
    mova       m11, [pd_64]
%endif
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movu        m4, [src4q]
    SBUTTERFLY  wd, 0, 1, 6
    SBUTTERFLY  wd, 2, 3, 6
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [filteryq+ 32]
    pmaddwd     m3, [filteryq+ 32]
%endif
    paddd       m0, m2
    paddd       m1, m3
    movu        m2, [src4q+sstrideq]
    movu        m3, [src4q+sstrideq*2]
    SBUTTERFLY  wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m2, m9
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m2, [filteryq+ 64]
%endif
    paddd       m0, m4
    paddd       m1, m2
    movu        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    SBUTTERFLY  wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m3, [filteryq+ 96]
    pmaddwd     m4, [filteryq+ 96]
%endif
    paddd       m0, m3
    paddd       m1, m4
    ; round, scale down and narrow both halves into one vector of words
%if ARCH_X86_64
    paddd       m0, m11
    paddd       m1, m11
%else
    paddd       m0, [pd_64]
    paddd       m1, [pd_64]
%endif
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m1
%else
    packssdw    m0, m1
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m12
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif