; Origin: /third_party/ffmpeg/libavcodec/x86/vp9mc.asm (revision cabdff1a)
;******************************************************************************
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; x86util.asm provides the abstraction layer this file is written against
; (cglobal, INIT_MMX/INIT_XMM/INIT_YMM, the mN register aliases,
; mova/movu/movh, SBUTTERFLY, const, RET, ...).
%include "libavutil/x86/x86util.asm"

; Read-only data section, requested with an alignment of 32. The filter
; functions below read their coefficient rows with aligned loads (mova),
; presumably from the tables emitted in this section.
SECTION_RODATA 32

; Word constants defined outside this file. Below, pw_64 is used as the
; rounding term before ">> 7" and pw_256 as the pmulhrsw multiplier.
cextern pw_256
cextern pw_64

; F8_SSSE3_TAPS t0, t1, t2, t3, t4, t5, t6, t7
;
; Emits one 8-tap filter as 4 rows of 32 bytes. Each row holds one pair of
; adjacent taps (t0/t1, t2/t3, t4/t5, t6/t7) as bytes, repeated 16 times.
; This matches what the pmaddubsw-based filter functions in this file expect
; at [filteryq+0/32/64/96]: they interleave the corresponding pairs of source
; bytes and multiply-accumulate them against one row each.
%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro

; F8_SSE2_TAPS t0, t1, t2, t3, t4, t5, t6, t7
;
; Emits one 8-tap filter as 8 rows of 16 bytes. Row k is tap k stored as a
; 16-bit word and repeated 8 times, so a row can be used directly as a pmullw
; operand. This matches what the pmullw-based filter functions in this file
; expect at [filteryq+0/16/.../112].
%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro

; F8_16BPP_TAPS t0, t1, t2, t3, t4, t5, t6, t7
;
; Emits one 8-tap filter as 4 rows of 16 words (32 bytes). Each row holds one
; pair of adjacent taps (t0/t1, t2/t3, t4/t5, t6/t7) as 16-bit words, repeated
; 8 times. No code in this part of the file reads this layout; presumably it
; is consumed by the high-bit-depth functions elsewhere.
%macro F8_16BPP_TAPS 8
times 8 dw %1, %2
times 8 dw %3, %4
times 8 dw %5, %6
times 8 dw %7, %8
%endmacro

; FILTER suffix
;
; Emits the table filters_<suffix>: three filter types (smooth, regular,
; sharp, in that order) with 15 eight-tap filters each. Every filter is
; written through F8_TAPS, which the caller must define to one of the
; F8_*_TAPS layout macros before expanding FILTER; the same coefficients are
; thereby emitted in whichever memory layout is selected.
;
; The coefficients are in 7-bit fixed point: the filter functions in this
; file add 64 and shift right by 7 after accumulating.
; NOTE(review): the 15 entries per type presumably correspond to the
; fractional positions 1..15, with position 0 handled by the fpel functions
; - confirm against the C caller.
%macro FILTER 1
const filters_%1 ; smooth
                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
                    ; regular
                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
                    ; sharp
                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
%endmacro

; Emit the coefficient table once per memory layout. FILTER resolves the
; F8_TAPS name when it is expanded, so F8_TAPS is re-pointed to the wanted
; layout macro before each expansion.
%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
%define F8_TAPS F8_16BPP_TAPS
; int16_t ff_filters_16bpp[3][15][4][16]
FILTER 16bpp

SECTION .text

;------------------------------------------------------------------------------
; filter_sse2_h_fn op            (op is "put" or "avg")
;
; Defines vp9_<op>_8tap_1d_h_<px>_8: a horizontal 8-tap filter built on 16-bit
; multiplies (pmullw). One loop iteration produces one output row of
; px = mmsize/2 pixels (movh is x86inc's half-register load/store).
;
; Arguments of the generated function, in cglobal order:
;   dst, dstride  output pointer and the amount added to it after each row
;   src, sstride  input pointer and the amount added to it after each row
;   h             number of rows; the loop runs while the decremented
;                 counter is still > 0
;   filtery       eight coefficient rows spaced 16 bytes apart, tap k at
;                 [filtery + 16*k], read with aligned accesses (the layout
;                 F8_SSE2_TAPS emits)
;
; Per output pixel x:
;   sum = src[x-3]*t0 + src[x-2]*t1 + ... + src[x+4]*t7 + 64
;   out = clamp_to_u8(sum >> 7)
; The partial sums use wrapping word adds (paddw); only the last add, which
; merges the two accumulators, saturates (paddsw). The "avg" variant stores
; pavgb(out, dst) instead of out.
;------------------------------------------------------------------------------
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
    pxor        m5, m5                  ; m5 = 0, used to widen bytes to words
    mova        m6, [pw_64]             ; m6 = rounding term
    mova        m7, [filteryq+  0]      ; tap 0 always stays in a register
%if ARCH_X86_64 && mmsize > 8
    ; Enough vector registers: keep taps 1-7 resident in m8-m14.
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    ; First five taps: source bytes at offsets -3..+1, zero-extended to words.
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    ; Taps were not preloaded: multiply straight from the coefficient rows.
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1                  ; m0 = taps 0+1
    paddw       m2, m3                  ; m2 = taps 2+3
    paddw       m0, m4                  ; m0 = taps 0+1+4
    ; Remaining three taps: offsets +2..+4. src is no longer needed for this
    ; row afterwards, so it is advanced here.
    movh        m1, [srcq+2]
    movh        m3, [srcq+3]
    movh        m4, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1                  ; m0 = taps 0+1+4+5
    paddw       m3, m4                  ; m3 = taps 6+7
    paddw       m0, m6                  ; + 64 rounding
    paddw       m2, m3                  ; m2 = taps 2+3+6+7
    paddsw      m0, m2                  ; merge both halves with saturation
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]              ; current destination pixels
%endif
    packuswb    m0, m0                  ; clamp to 0..255 and narrow to bytes
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

; 4 pixels per row (MMX registers).
INIT_MMX mmxext
filter_sse2_h_fn put
filter_sse2_h_fn avg

; 8 pixels per row (XMM registers).
INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg

;------------------------------------------------------------------------------
; filter_h_fn op                 (op is "put" or "avg")
;
; Defines vp9_<op>_8tap_1d_h_<px>_8: a horizontal 8-tap filter built on
; pmaddubsw. One loop iteration produces one output row of px = mmsize/2
; pixels (movh is x86inc's half-register load/store).
;
; Arguments of the generated function, in cglobal order:
;   dst, dstride  output pointer and the amount added to it after each row
;   src, sstride  input pointer and the amount added to it after each row
;   h             number of rows; the loop runs while the decremented
;                 counter is still > 0
;   filtery       four coefficient rows spaced 32 bytes apart, each holding
;                 one interleaved pair of byte taps, read with aligned
;                 accesses (the layout F8_SSSE3_TAPS emits)
;
; Source bytes of two neighbouring taps are interleaved so that a single
; pmaddubsw yields src[a]*t[k] + src[b]*t[k+1] per output word. The four pair
; sums are combined as (pair01 + pair45) and (pair23 + pair67) with wrapping
; adds, then merged with a saturating add. pmulhrsw by pw_256 performs the
; rounding shift: (x*256 + 0x4000) >> 15 equals (x + 64) >> 7.
; The "avg" variant stores pavgb(out, dst) instead of out.
;------------------------------------------------------------------------------
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]            ; pmulhrsw multiplier for the rounding shift
    mova        m7, [filteryq+ 0]       ; taps 0/1 always stay in a register
%if ARCH_X86_64 && mmsize > 8
    ; Enough vector registers: keep the other three tap pairs in m8-m10.
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1                  ; m0 = bytes for taps 0/1, interleaved
    punpcklbw   m2, m3                  ; m2 = bytes for taps 2/3, interleaved
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add       srcq, sstrideq            ; all loads for this row are done
    punpcklbw   m4, m5                  ; m4 = bytes for taps 4/5, interleaved
    punpcklbw   m1, m3                  ; m1 = bytes for taps 6/7, interleaved
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    ; Tap pairs were not preloaded: use the coefficient rows as memory operands.
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4                  ; pair01 + pair45
    paddw       m2, m1                  ; pair23 + pair67
    paddsw      m0, m2                  ; merge with saturation
    pmulhrsw    m0, m6                  ; (sum + 64) >> 7
%ifidn %1, avg
    movh        m1, [dstq]              ; current destination pixels
%endif
    packuswb    m0, m0                  ; clamp to 0..255 and narrow to bytes
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

; 4 pixels per row (MMX registers).
INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

; 8 pixels per row (XMM registers).
INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

; The full-register-width variant uses m8-m13 and is therefore only built
; for x86-64.
%if ARCH_X86_64
;------------------------------------------------------------------------------
; filter_hx2_fn op               (op is "put" or "avg")
;
; Defines vp9_<op>_8tap_1d_h_<px>_8 with px = mmsize: a horizontal 8-tap
; filter that produces a full register of output pixels per row (16 for XMM,
; 32 for YMM).
;
; Arguments of the generated function, in cglobal order:
;   dst, dstride  output pointer and the amount added to it after each row;
;                 rows are written with mova (x86inc's aligned move)
;   src, sstride  input pointer and the amount added to it after each row;
;                 read with unaligned loads at offsets -3..+4
;   h             number of rows; the loop runs while the decremented
;                 counter is still > 0
;   filtery       four coefficient rows at +0/+32/+64/+96, each holding one
;                 interleaved pair of byte taps (the layout F8_SSSE3_TAPS
;                 emits), loaded once into m8-m11
;
; SBUTTERFLY bw interleaves two source registers byte-wise into a low and a
; high half (m12 is its scratch register), so each tap pair is processed as
; two registers of words sharing the same coefficient row. Pair sums are
; combined as (pair01 + pair45) and (pair23 + pair67) with wrapping adds and
; merged with a saturating add; pmulhrsw by pw_256 is the (x + 64) >> 7
; rounding shift. The "avg" variant averages with the existing dst row.
;------------------------------------------------------------------------------
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova       m13, [pw_256]            ; pmulhrsw multiplier for the rounding shift
    mova        m8, [filteryq+ 0]       ; taps 0/1
    mova        m9, [filteryq+32]       ; taps 2/3
    mova       m10, [filteryq+64]       ; taps 4/5
    mova       m11, [filteryq+96]       ; taps 6/7
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add       srcq, sstrideq            ; all loads for this row are done
    ; After each SBUTTERFLY the even register holds the low-half interleave
    ; and the odd register the high-half interleave of one tap pair.
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4                  ; low:  pair01 + pair45
    paddw       m1, m5                  ; high: pair01 + pair45
    paddw       m2, m6                  ; low:  pair23 + pair67
    paddw       m3, m7                  ; high: pair23 + pair67
    paddsw      m0, m2                  ; merge with saturation
    paddsw      m1, m3
    pmulhrsw    m0, m13                 ; (sum + 64) >> 7
    pmulhrsw    m1, m13
    packuswb    m0, m1                  ; clamp to 0..255, rejoin low and high halves
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

; 16 pixels per row.
INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

; 32 pixels per row, only when the toolchain can assemble AVX2.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64

;------------------------------------------------------------------------------
; filter_sse2_v_fn op            (op is "put" or "avg")
;
; Defines vp9_<op>_8tap_1d_v_<px>_8: a vertical 8-tap filter built on 16-bit
; multiplies (pmullw). One loop iteration produces one output row of
; px = mmsize/2 pixels (movh is x86inc's half-register load/store).
;
; Arguments of the generated function:
;   dst, dstride  output pointer and the amount added to it after each row
;   src, sstride  input pointer and the distance between input rows
;   h             number of rows; the loop runs while the decremented
;                 counter is still > 0
;   filtery       eight coefficient rows spaced 16 bytes apart, tap k at
;                 [filtery + 16*k], read with aligned accesses (the layout
;                 F8_SSE2_TAPS emits)
;
; On x86-64 all six arguments are loaded into registers. On x86-32 only the
; first four are; filtery is fetched from its stack slot (r5mp) and the row
; counter is decremented in place in its stack slot (hd is defined as r4mp).
;
; Two row pointers are maintained: src is moved back to row -3 and covers
; rows -3..0; src4 starts at row +1 and covers rows +1..+4. Both advance by
; one row per iteration.
;
; Per output pixel: sum = row[-3]*t0 + ... + row[+4]*t7 + 64, then
; clamp_to_u8(sum >> 7). Partial sums use wrapping adds; only the final merge
; of the two accumulators saturates. "avg" stores pavgb(out, dst).
;------------------------------------------------------------------------------
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    pxor        m5, m5                  ; m5 = 0, used to widen bytes to words
    mova        m6, [pw_64]             ; m6 = rounding term
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]     ; src4 = row +1
    sub       srcq, sstride3q           ; src  = row -3
    mova        m7, [filteryq+  0]      ; tap 0 always stays in a register
%if ARCH_X86_64 && mmsize > 8
    ; Enough vector registers: keep taps 1-7 resident in m8-m14.
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    ; First five taps: rows -3..0 through src, row +1 through src4.
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    ; Taps were not preloaded: multiply straight from the coefficient rows.
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1                  ; m0 = taps 0+1
    paddw       m2, m3                  ; m2 = taps 2+3
    paddw       m0, m4                  ; m0 = taps 0+1+4
    ; Remaining three taps: rows +2..+4 through src4.
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    movh        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1                  ; m0 = taps 0+1+4+5
    paddw       m3, m4                  ; m3 = taps 6+7
    paddw       m0, m6                  ; + 64 rounding
    paddw       m2, m3                  ; m2 = taps 2+3+6+7
    paddsw      m0, m2                  ; merge both halves with saturation
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]              ; current destination pixels
%endif
    packuswb    m0, m0                  ; clamp to 0..255 and narrow to bytes
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

; 4 pixels per row (MMX registers).
INIT_MMX mmxext
filter_sse2_v_fn put
filter_sse2_v_fn avg

; 8 pixels per row (XMM registers).
INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg

;------------------------------------------------------------------------------
; filter_v_fn op                 (op is "put" or "avg")
;
; Defines vp9_<op>_8tap_1d_v_<px>_8: a vertical 8-tap filter built on
; pmaddubsw. One loop iteration produces one output row of px = mmsize/2
; pixels (movh is x86inc's half-register load/store).
;
; Arguments of the generated function:
;   dst, dstride  output pointer and the amount added to it after each row
;   src, sstride  input pointer and the distance between input rows
;   h             number of rows; the loop runs while the decremented
;                 counter is still > 0
;   filtery       four coefficient rows spaced 32 bytes apart, each holding
;                 one interleaved pair of byte taps, read with aligned
;                 accesses (the layout F8_SSSE3_TAPS emits)
;
; On x86-64 all six arguments are loaded into registers. On x86-32 only the
; first four are; filtery is fetched from its stack slot (r5mp) and the row
; counter is decremented in place in its stack slot (hd is defined as r4mp).
;
; src is moved back to row -3 and covers rows -3..0; src4 starts at row +1
; and covers rows +1..+4. Bytes of two vertically adjacent rows are
; interleaved so one pmaddubsw yields rowA*t[k] + rowB*t[k+1] per output
; word. Pair sums are combined as (pair01 + pair45) and (pair23 + pair67)
; with wrapping adds and merged with a saturating add; pmulhrsw by pw_256 is
; the (x + 64) >> 7 rounding shift. "avg" stores pavgb(out, dst).
;------------------------------------------------------------------------------
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]            ; pmulhrsw multiplier for the rounding shift
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]     ; src4 = row +1
    sub       srcq, sstride3q           ; src  = row -3
    mova        m7, [filteryq+ 0]       ; taps 0/1 always stay in a register
%if ARCH_X86_64 && mmsize > 8
    ; Enough vector registers: keep the other three tap pairs in m8-m10.
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just more generally
    ; unroll this to prevent multiple loads of the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1                  ; m0 = rows -3/-2 interleaved (taps 0/1)
    punpcklbw   m2, m3                  ; m2 = rows -1/0  interleaved (taps 2/3)
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add       srcq, sstrideq            ; all loads done: step both pointers
    add      src4q, sstrideq
    punpcklbw   m4, m5                  ; m4 = rows +1/+2 interleaved (taps 4/5)
    punpcklbw   m1, m3                  ; m1 = rows +3/+4 interleaved (taps 6/7)
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    ; Tap pairs were not preloaded: use the coefficient rows as memory operands.
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4                  ; pair01 + pair45
    paddw       m2, m1                  ; pair23 + pair67
    paddsw      m0, m2                  ; merge with saturation
    pmulhrsw    m0, m6                  ; (sum + 64) >> 7
%ifidn %1, avg
    movh        m1, [dstq]              ; current destination pixels
%endif
    packuswb    m0, m0                  ; clamp to 0..255 and narrow to bytes
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

; 4 pixels per row (MMX registers).
INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

; 8 pixels per row (XMM registers).
INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

; The full-register-width variant uses m8-m13 and is therefore only built
; for x86-64.
%if ARCH_X86_64

;------------------------------------------------------------------------------
; filter_vx2_fn op               (op is "put" or "avg")
;
; Defines vp9_<op>_8tap_1d_v_<px>_8 with px = mmsize: a vertical 8-tap filter
; that produces a full register of output pixels per row (16 for XMM, 32 for
; YMM).
;
; Arguments of the generated function, in cglobal order:
;   dst, dstride  output pointer and the amount added to it after each row;
;                 rows are written with mova (x86inc's aligned move)
;   src, sstride  input pointer and the distance between input rows; rows
;                 are read with unaligned loads
;   h             number of rows; the loop runs while the decremented
;                 counter is still > 0
;   filtery       four coefficient rows at +0/+32/+64/+96, each holding one
;                 interleaved pair of byte taps (the layout F8_SSSE3_TAPS
;                 emits), loaded once into m8-m11
;
; src is moved back to row -3 and covers rows -3..0; src4 starts at row +1
; and covers rows +1..+4. SBUTTERFLY bw interleaves two rows byte-wise into a
; low and a high half (m12 is its scratch register). Pair sums are combined
; as (pair01 + pair45) and (pair23 + pair67) with wrapping adds and merged
; with a saturating add; pmulhrsw by pw_256 is the (x + 64) >> 7 rounding
; shift. The "avg" variant averages with the existing dst row.
;------------------------------------------------------------------------------
%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova       m13, [pw_256]            ; pmulhrsw multiplier for the rounding shift
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]     ; src4 = row +1
    sub       srcq, sstride3q           ; src  = row -3
    mova        m8, [filteryq+ 0]       ; taps 0/1
    mova        m9, [filteryq+32]       ; taps 2/3
    mova       m10, [filteryq+64]       ; taps 4/5
    mova       m11, [filteryq+96]       ; taps 6/7
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add       srcq, sstrideq            ; all loads done: step both pointers
    add      src4q, sstrideq
    ; After each SBUTTERFLY the even register holds the low-half interleave
    ; and the odd register the high-half interleave of one row pair.
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4                  ; low:  pair01 + pair45
    paddw       m1, m5                  ; high: pair01 + pair45
    paddw       m2, m6                  ; low:  pair23 + pair67
    paddw       m3, m7                  ; high: pair23 + pair67
    paddsw      m0, m2                  ; merge with saturation
    paddsw      m1, m3
    pmulhrsw    m0, m13                 ; (sum + 64) >> 7
    pmulhrsw    m1, m13
    packuswb    m0, m1                  ; clamp to 0..255, rejoin low and high halves
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

; 16 pixels per row.
INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

; 32 pixels per row, only when the toolchain can assemble AVX2.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64

;------------------------------------------------------------------------------
; fpel_fn op, width, off1, off2, off3, rows [, avgbits = 0 [, nregs = 4]]
;
; Defines vp9_<op><width>[_8|_16]: a copy ("put") or average-into-dst ("avg")
; routine with no filtering. Macro arguments:
;   op       put or avg
;   width    compared against mmsize to pick the code shape (see below); a
;            width of 4 selects movh for loads and stores, anything else
;            uses movu loads and mova (aligned) stores
;   off1-3   address offsets of the 2nd, 3rd and 4th register moved per
;            iteration. The macro pastes "s" (source side) or "d"
;            (destination side) in front of each one, so "strideq" becomes
;            sstrideq / dstrideq and "stride3q" becomes sstride3q / dstride3q
;   rows     rows consumed per iteration: both pointers advance by
;            rows * stride and h is reduced by rows
;   avgbits  8 selects pavgb and the name suffix _8, 16 selects pavgw and
;            _16; any other value gives no suffix and defines no averaging
;            instruction, so "avg" requires 8 or 16
;   nregs    vector register count passed to cglobal in the wide case
;
; Shapes:
;   width <= mmsize   the function also gets dstride3/sstride3 registers,
;                     initialised to 3 * stride
;   width >  mmsize   five registers only; no 3 * stride values are computed
;   width/mmsize == 8 four extra registers (m4-m7) are moved per iteration at
;                     byte offsets mmsize*4 .. mmsize*7
;
; The loop ends only when h reaches exactly zero (sub + jnz), so h must be a
; multiple of rows.
;
; In the "avg" path with width 4, the 4th dst value is loaded with movh into
; m4 and averaged register-to-register rather than used as a memory operand.
; NOTE(review): presumably this avoids a full-register-width read at the last
; row of the block - confirm.
;------------------------------------------------------------------------------
%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %7 == 8
%define %%pavg pavgb
%define %%szsuf _8
%elif %7 == 16
%define %%pavg pavgw
%define %%szsuf _16
%else
%define %%szsuf
%endif

%if %2 <= mmsize
cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea  sstride3q, [sstrideq*3]
    lea  dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
%if %2/mmsize == 8
    %%srcfn     m4, [srcq+mmsize*4]
    %%srcfn     m5, [srcq+mmsize*5]
    %%srcfn     m6, [srcq+mmsize*6]
    %%srcfn     m7, [srcq+mmsize*7]
%endif
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    %%pavg      m0, [dstq]
    %%pavg      m1, [dstq+d%3]
    %%pavg      m2, [dstq+d%4]
%if %2 == 4
    %%srcfn     m4, [dstq+d%5]
    %%pavg      m3, m4
%else
    %%pavg      m3, [dstq+d%5]
%endif
%if %2/mmsize == 8
    %%pavg      m4, [dstq+mmsize*4]
    %%pavg      m5, [dstq+mmsize*5]
    %%pavg      m6, [dstq+mmsize*6]
    %%pavg      m7, [dstq+mmsize*7]
%endif
%endif
    %%dstfn [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
%if %2/mmsize == 8
    %%dstfn [dstq+mmsize*4], m4
    %%dstfn [dstq+mmsize*5], m5
    %%dstfn [dstq+mmsize*6], m6
    %%dstfn [dstq+mmsize*7], m7
%endif
    lea       dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro

; fpel_fn pastes an "s" or "d" in front of each offset argument. When the
; argument is written with mmsize (expected to expand to 16 for XMM and 32
; for YMM), the pasted result is s16/d16/s32/d32 rather than a register name.
; These helpers turn those tokens back into the plain byte offset; they are
; removed again once all instantiations are done.
%define d16 16
%define s16 16
%define d32 32
%define s32 32

; --- put (no size suffix) and 8-bit avg (suffix _8) -------------------------
; MMX: 4- and 8-byte rows, four rows per iteration.
INIT_MMX mmx
fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
; XMM: 16-byte rows take four rows per iteration, 32-byte rows two, and
; 64/128-byte rows one. The 128-byte put needs eight registers per row.
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
; YMM: same scheme with 32-byte registers.
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 128, mmsize, mmsize*2,     mmsize*3, 1
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
%endif

; --- 16-bit avg (pavgw, suffix _16) -----------------------------------------
INIT_MMX mmxext
fpel_fn avg,  8,  strideq, strideq*2, stride3q, 4, 16
INIT_XMM sse2
fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
%endif

; The pasting helpers are only meaningful for the instantiations above.
%undef s16
%undef d16
%undef s32
%undef d32
681