;******************************************************************************
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM with the x86inc/x86util macro layer (cglobal, INIT_*, mN/rN
; register abstraction). Argument loading and prologue/epilogue handling are
; delegated to cglobal/RET; nothing in this file touches the stack directly.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Rounding/scaling constants defined elsewhere in the project.
cextern pw_256
cextern pw_64

;------------------------------------------------------------------------------
; F8_SSSE3_TAPS t0, t1, t2, t3, t4, t5, t6, t7
; Emits one 8-tap filter as 4 rows of 32 bytes. Each row is one adjacent tap
; pair (t0,t1), (t2,t3), ... repeated 16 times as signed bytes, which is the
; operand layout the pmaddubsw-based kernels below consume.
;------------------------------------------------------------------------------
%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro

;------------------------------------------------------------------------------
; F8_SSE2_TAPS t0, ..., t7
; Emits one 8-tap filter as 8 rows of 8 words (16 bytes per row), each row
; holding a single tap broadcast to every word. Consumed by the pmullw-based
; kernels below, which read the rows at offsets 0, 16, ..., 112.
;------------------------------------------------------------------------------
%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro

;------------------------------------------------------------------------------
; F8_16BPP_TAPS t0, ..., t7
; Emits one 8-tap filter as 4 rows of 16 words (32 bytes per row); each row is
; one adjacent tap pair repeated 8 times as words. No kernel in this file reads
; this table; it is only emitted here.
;------------------------------------------------------------------------------
%macro F8_16BPP_TAPS 8
times 8 dw %1, %2
times 8 dw %3, %4
times 8 dw %5, %6
times 8 dw %7, %8
%endmacro

;------------------------------------------------------------------------------
; FILTER suffix
; Emits the table filters_<suffix>: three groups (smooth, regular, sharp) of 15
; tap sets each, encoded with whatever row macro F8_TAPS is currently %defined
; to. The layout comments next to each instantiation give the C-side name and
; shape (ff_filters_<suffix>[3][15][rows][row size]).
; NOTE(review): the 15 entries per group presumably correspond to the fractional
; positions 1..15 (no entry for the full-pel position) - confirm with callers.
;------------------------------------------------------------------------------
%macro FILTER 1
const filters_%1 ; smooth
                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
                    ; regular
                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
                    ; sharp
                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
%endmacro

%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
%define F8_TAPS F8_16BPP_TAPS
; int16_t ff_filters_16bpp[3][15][4][16]
FILTER 16bpp

SECTION .text

;------------------------------------------------------------------------------
; filter_sse2_h_fn put|avg
; Horizontal 8-tap filter on 8-bit pixels, mmsize/2 pixels per row (4 with
; MMX registers, 8 with XMM), one row per loop iteration, h rows in total.
;
; Args (cglobal names): dst, dstride, src, sstride, h, filtery
;   filtery points at one tap set in the F8_SSE2_TAPS layout (8 rows of words,
;   16 bytes apart).
; Registers: m5 = zero (for byte->word unpack), m6 = pw_64 (rounding term),
;   m7 = tap 0; on x86-64 with XMM registers m8..m14 cache taps 1..7, otherwise
;   the taps are used as memory operands each iteration.
; Per row: pixels at src-3 .. src+4 are widened to words, multiplied by their
;   tap (pmullw) and summed; the result is (sum + 64) >> 7, packed to bytes with
;   unsigned saturation. Only the last addition of the two partial sums is
;   saturating (paddsw). The avg variant then averages with the existing dst
;   pixels (pavgb) before storing.
;------------------------------------------------------------------------------
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
    pxor        m5, m5
    mova        m6, [pw_64]
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    ; taps 0..4
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    ; taps 5..7
    movh        m1, [srcq+2]
    movh        m3, [srcq+3]
    movh        m4, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    ; m0 accumulates taps 0,1,4,5 plus rounding; m2 accumulates taps 2,3,6,7
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_h_fn put
filter_sse2_h_fn avg

INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg

;------------------------------------------------------------------------------
; filter_h_fn put|avg
; Horizontal 8-tap filter on 8-bit pixels using pmaddubsw, mmsize/2 pixels per
; row, one row per loop iteration.
;
; Args (cglobal names): dst, dstride, src, sstride, h, filtery
;   filtery points at one tap set in the F8_SSSE3_TAPS layout (4 rows of
;   interleaved tap pairs, 32 bytes apart).
; Registers: m6 = pw_256, m7 = tap pair (0,1); on x86-64 with XMM registers
;   m8..m10 cache the remaining three pairs.
; Per row: the pixel vectors at src-3..src+4 are byte-interleaved in adjacent
;   pairs and fed to pmaddubsw with the matching tap pair; the four partial sums
;   are combined (last add saturating) and scaled with pmulhrsw by pw_256.
;   pmulhrsw computes (a*b + 0x4000) >> 15, so with b = 256 this is
;   (sum + 64) >> 7 - assuming pw_256 is a vector of 16-bit 256 values, which is
;   defined outside this file.
;------------------------------------------------------------------------------
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

%if ARCH_X86_64
;------------------------------------------------------------------------------
; filter_hx2_fn put|avg   (x86-64 only: uses m8..m13)
; Horizontal 8-tap filter on 8-bit pixels producing a full register of pixels
; per row (mmsize pixels: 16 with XMM, 32 with YMM).
;
; Args (cglobal names): dst, dstride, src, sstride, h, filtery
;   filtery uses the F8_SSSE3_TAPS layout.
; Registers: m8..m11 = the four tap pairs, m13 = pw_256, m12 = SBUTTERFLY
;   scratch.
; Per row: eight unaligned loads at src-3..src+4 are interleaved pairwise with
;   SBUTTERFLY (giving a low and a high half per pair), multiplied with
;   pmaddubsw, summed, scaled with pmulhrsw and packed back into one register.
;   dst is read (avg) and written with aligned accesses (pavgb memory operand,
;   mova), so it is expected to be mmsize-aligned.
;------------------------------------------------------------------------------
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova       m13, [pw_256]
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add       srcq, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64

;------------------------------------------------------------------------------
; filter_sse2_v_fn put|avg
; Vertical 8-tap filter on 8-bit pixels, mmsize/2 pixels per row, one output
; row per loop iteration. Arithmetic is the same as filter_sse2_h_fn (pmullw
; per tap, +64, >>7); only the addressing differs.
;
; Args (cglobal names): dst, dstride, src, sstride, h, filtery
; Extra GPRs: src4, sstride3.
;   sstride3 = 3*sstride; src is moved back by 3 rows and src4 is set to the
;   original src + 1 row, so [src + {0,1,2,3}*stride] and
;   [src4 + {0,1,2,3}*stride] cover input rows -3..+4 around the output row.
; x86-32: only the first four arguments are loaded into registers. filtery is
;   fetched from its stack slot (r5mp) and hd is %defined to the stack slot of
;   h (r4mp), so the row counter is decremented in memory.
;------------------------------------------------------------------------------
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    pxor        m5, m5
    mova        m6, [pw_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    movh        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_v_fn put
filter_sse2_v_fn avg

INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg

;------------------------------------------------------------------------------
; filter_v_fn put|avg
; Vertical 8-tap filter on 8-bit pixels using pmaddubsw, mmsize/2 pixels per
; row, one output row per loop iteration. Arithmetic matches filter_h_fn
; (interleave adjacent rows, pmaddubsw with tap pairs, pmulhrsw by pw_256);
; row addressing and the x86-32 handling of filtery/h match filter_sse2_v_fn.
;
; Args (cglobal names): dst, dstride, src, sstride, h, filtery
; Extra GPRs: src4, sstride3 (src rewound by 3 rows, src4 = original src + 1
;   row, so the eight loads cover input rows -3..+4).
;------------------------------------------------------------------------------
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just more generally
    ; unroll this to prevent multiple loads of the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

;------------------------------------------------------------------------------
; filter_vx2_fn put|avg   (x86-64 only: uses m8..m13)
; Vertical 8-tap filter on 8-bit pixels producing a full register of pixels
; per row (mmsize pixels). Arithmetic matches filter_hx2_fn; row addressing
; matches filter_v_fn (src rewound by 3 rows, src4 = original src + 1 row).
;
; Args (cglobal names): dst, dstride, src, sstride, h, filtery
; Extra GPRs: src4, sstride3.
; Registers: m8..m11 = tap pairs, m13 = pw_256, m12 = SBUTTERFLY scratch.
;   Source rows are loaded unaligned (movu); dst is accessed with aligned
;   operations (pavgb memory operand, mova), so it is expected to be
;   mmsize-aligned.
;------------------------------------------------------------------------------
%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova       m13, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64

;------------------------------------------------------------------------------
; fpel_fn type, width, off1, off2, off3, rows [, sample_bits = 0 [, xmm_regs = 4]]
; Full-pel block copy (put) or copy-and-average-with-dst (avg).
;
;   %1 type        put or avg.
;   %2 width       row width in bytes; also part of the function name
;                  (vp9_<type><width><suffix>).
;   %3..%5         address offsets of the 2nd, 3rd and 4th register-sized
;                  access relative to src/dst. They are pasted after an `s`
;                  (source) or `d` (destination) prefix, so `strideq` becomes
;                  sstrideq/dstrideq and `stride3q` becomes
;                  sstride3q/dstride3q.
;                  NOTE(review): the s16/d16/s32/d32 defines below appear to
;                  exist so that a plain byte offset (mmsize expanding to 16 or
;                  32) survives the same prefix pasting - confirm.
;   %6 rows        rows consumed per loop iteration; h is decremented by this
;                  amount and the loop exits when it reaches exactly zero, so h
;                  must be a positive multiple of it.
;   %7 sample_bits 8 -> pavgb and name suffix _8, 16 -> pavgw and suffix _16,
;                  anything else (default 0) -> no suffix and no averaging op
;                  defined (only valid for put).
;   %8 xmm_regs    vector register count declared to cglobal when the row is
;                  wider than one register (default 4; 8 is passed where
;                  m4..m7 are used).
;
; When width <= mmsize the function also sets up sstride3/dstride3 = 3*stride.
; When width/mmsize == 8, four additional registers (m4..m7) cover byte offsets
; mmsize*4 .. mmsize*7 of the same row.
; For width 4 loads and stores use movh; otherwise loads are unaligned (movu)
; while stores and the avg memory operands on dst are aligned accesses.
;------------------------------------------------------------------------------
%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %7 == 8
%define %%pavg pavgb
%define %%szsuf _8
%elif %7 == 16
%define %%pavg pavgw
%define %%szsuf _16
%else
%define %%szsuf
%endif

%if %2 <= mmsize
cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea  sstride3q, [sstrideq*3]
    lea  dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
%if %2/mmsize == 8
    %%srcfn     m4, [srcq+mmsize*4]
    %%srcfn     m5, [srcq+mmsize*5]
    %%srcfn     m6, [srcq+mmsize*6]
    %%srcfn     m7, [srcq+mmsize*7]
%endif
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    %%pavg      m0, [dstq]
    %%pavg      m1, [dstq+d%3]
    %%pavg      m2, [dstq+d%4]
%if %2 == 4
    ; The fourth dst row is loaded with movh first instead of being used as a
    ; full-width memory operand.
    ; NOTE(review): presumably to avoid reading past the end of the buffer on
    ; the last row of a 4-byte-wide block - confirm.
    %%srcfn     m4, [dstq+d%5]
    %%pavg      m3, m4
%else
    %%pavg      m3, [dstq+d%5]
%endif
%if %2/mmsize == 8
    %%pavg      m4, [dstq+mmsize*4]
    %%pavg      m5, [dstq+mmsize*5]
    %%pavg      m6, [dstq+mmsize*6]
    %%pavg      m7, [dstq+mmsize*7]
%endif
%endif
    %%dstfn  [dstq], m0
    %%dstfn  [dstq+d%3], m1
    %%dstfn  [dstq+d%4], m2
    %%dstfn  [dstq+d%5], m3
%if %2/mmsize == 8
    %%dstfn  [dstq+mmsize*4], m4
    %%dstfn  [dstq+mmsize*5], m5
    %%dstfn  [dstq+mmsize*6], m6
    %%dstfn  [dstq+mmsize*7], m7
%endif
    lea       dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro

; Identity mappings used by fpel_fn's `s`/`d` prefix pasting when an offset
; argument is a plain byte count rather than a stride register.
%define d16 16
%define s16 16
%define d32 32
%define s32 32
INIT_MMX mmx
fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
%endif
INIT_MMX mmxext
fpel_fn avg,  8, strideq, strideq*2, stride3q, 4, 16
INIT_XMM sse2
fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
%endif
%undef s16
%undef d16
%undef s32
%undef d32