1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* VP9 MC SIMD optimizations
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
; Read-only constants, section aligned to 32 bytes.
25cabdff1aSopenharmony_ciSECTION_RODATA 32
26cabdff1aSopenharmony_ci
; pd_64: eight dwords of 64 (32 bytes, so it can fill a full ymm register).
; The filters below add it to each 32-bit sum before the arithmetic shift
; right by 7, i.e. it is the round-to-nearest bias (64 = 1 << 6).
27cabdff1aSopenharmony_cipd_64: times 8 dd 64
28cabdff1aSopenharmony_ci
; Constants defined outside this file; used below as the upper clamp for the
; _10 and _12 entry points respectively.
; NOTE(review): presumably packed words of 1023 / 4095 (the 10-bit / 12-bit
; maximum sample value) - confirm against their definition.
29cabdff1aSopenharmony_cicextern pw_1023
30cabdff1aSopenharmony_cicextern pw_4095
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ciSECTION .text
33cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; filter_h4_fn <put|avg> [xmm_count = 12]
;
; Emits two functions, vp9_<put|avg>_8tap_1d_h_4_10 and ..._h_4_12, that run an
; 8-tap filter along each row of 16-bit samples and write 4 samples per row.
;
; Arguments (names from cglobal): dst, dstride, src, sstride, h, filtery.
;   - src/dst advance by sstride/dstride per row; the strides are added
;     directly to the pointers, so they are byte strides.
;   - h is the row count; the loop runs until the decremented counter is <= 0.
;   - filtery points at four coefficient rows at byte offsets 0/32/64/96.
;     NOTE(review): presumably each row holds one (tap 2k, tap 2k+1) word pair
;     replicated across the vector - confirm against the table definition.
;
; Per row: taps read src[-3..+4] (byte offsets -6..+8), the products are summed
; in 32 bits, rounded with pd_64, shifted right by 7, packed to words and
; clamped to [0, m5]. The "avg" flavour then averages with the existing dst
; samples (pavgw) before storing.
;
; The second macro argument is forwarded to cglobal as its fourth parameter
; (the vector register count in x86inc's cglobal convention).
;------------------------------------------------------------------------------
34cabdff1aSopenharmony_ci%macro filter_h4_fn 1-2 12
35cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    ; m5 = upper clamp for this bit depth
36cabdff1aSopenharmony_ci    mova        m5, [pw_1023]
    ; .body is shared: the _12 entry point loads its own clamp and jumps here.
37cabdff1aSopenharmony_ci.body:
    ; Without SSE4 the pack below is signed, so a zero vector is needed for the
    ; lower clamp; x86-64 has enough registers to keep it resident in m11.
38cabdff1aSopenharmony_ci%if notcpuflag(sse4) && ARCH_X86_64
39cabdff1aSopenharmony_ci    pxor       m11, m11
40cabdff1aSopenharmony_ci%endif
    ; m6 = rounding bias, m7 = first coefficient row
41cabdff1aSopenharmony_ci    mova        m6, [pd_64]
42cabdff1aSopenharmony_ci    mova        m7, [filteryq+ 0]
    ; On x86-64 the remaining three coefficient rows are cached in m8-m10;
    ; otherwise they are used as memory operands inside the loop.
43cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
44cabdff1aSopenharmony_ci    mova        m8, [filteryq+32]
45cabdff1aSopenharmony_ci    mova        m9, [filteryq+64]
46cabdff1aSopenharmony_ci    mova       m10, [filteryq+96]
47cabdff1aSopenharmony_ci%endif
48cabdff1aSopenharmony_ci.loop:
    ; Load the source window at sample offsets -3, -2, -1, 0, +1 (half-width
    ; loads: only four samples of each are needed).
49cabdff1aSopenharmony_ci    movh        m0, [srcq-6]
50cabdff1aSopenharmony_ci    movh        m1, [srcq-4]
51cabdff1aSopenharmony_ci    movh        m2, [srcq-2]
52cabdff1aSopenharmony_ci    movh        m3, [srcq+0]
53cabdff1aSopenharmony_ci    movh        m4, [srcq+2]
    ; Interleave neighbouring offsets so each dword holds the two samples that
    ; pmaddwd multiplies by one coefficient pair and sums.
54cabdff1aSopenharmony_ci    punpcklwd   m0, m1
55cabdff1aSopenharmony_ci    punpcklwd   m2, m3
56cabdff1aSopenharmony_ci    pmaddwd     m0, m7
57cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
58cabdff1aSopenharmony_ci    pmaddwd     m2, m8
59cabdff1aSopenharmony_ci%else
60cabdff1aSopenharmony_ci    pmaddwd     m2, [filteryq+32]
61cabdff1aSopenharmony_ci%endif
    ; Sample offsets +2, +3, +4. These are full-width unaligned loads, but only
    ; the low half of each is consumed by the punpcklwd below.
62cabdff1aSopenharmony_ci    movu        m1, [srcq+4]
63cabdff1aSopenharmony_ci    movu        m3, [srcq+6]
64cabdff1aSopenharmony_ci    paddd       m0, m2
65cabdff1aSopenharmony_ci    movu        m2, [srcq+8]
    ; All loads for this row are done; step src to the next row.
66cabdff1aSopenharmony_ci    add       srcq, sstrideq
67cabdff1aSopenharmony_ci    punpcklwd   m4, m1
68cabdff1aSopenharmony_ci    punpcklwd   m3, m2
69cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
70cabdff1aSopenharmony_ci    pmaddwd     m4, m9
71cabdff1aSopenharmony_ci    pmaddwd     m3, m10
72cabdff1aSopenharmony_ci%else
73cabdff1aSopenharmony_ci    pmaddwd     m4, [filteryq+64]
74cabdff1aSopenharmony_ci    pmaddwd     m3, [filteryq+96]
75cabdff1aSopenharmony_ci%endif
    ; m0 = sum of all eight taps; round and scale: (sum + 64) >> 7
76cabdff1aSopenharmony_ci    paddd       m0, m4
77cabdff1aSopenharmony_ci    paddd       m0, m3
78cabdff1aSopenharmony_ci    paddd       m0, m6
79cabdff1aSopenharmony_ci    psrad       m0, 7
    ; Narrow to 16 bits: unsigned saturation with SSE4 (negatives become 0),
    ; signed saturation otherwise (negatives are clamped further down).
80cabdff1aSopenharmony_ci%if cpuflag(sse4)
81cabdff1aSopenharmony_ci    packusdw    m0, m0
82cabdff1aSopenharmony_ci%else
83cabdff1aSopenharmony_ci    packssdw    m0, m0
84cabdff1aSopenharmony_ci%endif
    ; avg only: fetch the four existing destination samples
85cabdff1aSopenharmony_ci%ifidn %1, avg
86cabdff1aSopenharmony_ci    movh        m1, [dstq]
87cabdff1aSopenharmony_ci%endif
    ; Upper clamp to the bit-depth maximum held in m5.
88cabdff1aSopenharmony_ci    pminsw      m0, m5
    ; Lower clamp to zero for the signed-pack path; x86-32 rebuilds the zero
    ; vector in m2 on every iteration instead of keeping it in m11.
89cabdff1aSopenharmony_ci%if notcpuflag(sse4)
90cabdff1aSopenharmony_ci%if ARCH_X86_64
91cabdff1aSopenharmony_ci    pmaxsw      m0, m11
92cabdff1aSopenharmony_ci%else
93cabdff1aSopenharmony_ci    pxor        m2, m2
94cabdff1aSopenharmony_ci    pmaxsw      m0, m2
95cabdff1aSopenharmony_ci%endif
96cabdff1aSopenharmony_ci%endif
    ; avg only: rounded average of the filtered and the existing samples
97cabdff1aSopenharmony_ci%ifidn %1, avg
98cabdff1aSopenharmony_ci    pavgw       m0, m1
99cabdff1aSopenharmony_ci%endif
    ; Store four samples and move on to the next row.
100cabdff1aSopenharmony_ci    movh    [dstq], m0
101cabdff1aSopenharmony_ci    add       dstq, dstrideq
102cabdff1aSopenharmony_ci    dec         hd
103cabdff1aSopenharmony_ci    jg .loop
104cabdff1aSopenharmony_ci    RET
105cabdff1aSopenharmony_ci
; 12-bit entry point: identical except for the clamp constant, so it only loads
; m5 and tail-jumps into the body of the 10-bit function above.
106cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
107cabdff1aSopenharmony_ci    mova        m5, [pw_4095]
108cabdff1aSopenharmony_ci    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
109cabdff1aSopenharmony_ci%endmacro
110cabdff1aSopenharmony_ci
; Instantiate the 4-sample horizontal filter for SSE2 (put and avg flavours).
111cabdff1aSopenharmony_ciINIT_XMM sse2
112cabdff1aSopenharmony_cifilter_h4_fn put
113cabdff1aSopenharmony_cifilter_h4_fn avg
114cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; filter_h_fn <put|avg> [vector_register_count = 12]
;
; Full-vector-width variant of the horizontal 8-tap filter. Emits
; vp9_<put|avg>_8tap_1d_h_<px>_10 and ..._<px>_12, where px = mmsize/2 is the
; number of 16-bit samples produced per row (8 for xmm, 16 for ymm).
;
; Arguments (names from cglobal): dst, dstride, src, sstride, h, filtery; the
; strides are added directly to the pointers (byte strides). filtery supplies
; four coefficient rows at byte offsets 0/32/64/96.
;
; Unlike the 4-sample version, no interleaving is needed before pmaddwd: a load
; at an even sample offset already pairs the two samples that feed one
; coefficient pair for the even-indexed outputs, and the load one sample later
; does the same for the odd-indexed outputs. m0 accumulates the even outputs,
; m1 the odd outputs; they are interleaved back into order after packing.
;
; dst is read (avg) and written with aligned accesses (mova / pavgw with a
; memory operand).
;------------------------------------------------------------------------------
115cabdff1aSopenharmony_ci%macro filter_h_fn 1-2 12
116cabdff1aSopenharmony_ci%assign %%px mmsize/2
117cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    ; m5 = upper clamp for this bit depth
118cabdff1aSopenharmony_ci    mova        m5, [pw_1023]
    ; Shared body; the _12 entry point jumps here with its own clamp in m5.
119cabdff1aSopenharmony_ci.body:
    ; Zero vector for the lower clamp on the non-SSE4 (signed pack) path.
120cabdff1aSopenharmony_ci%if notcpuflag(sse4) && ARCH_X86_64
121cabdff1aSopenharmony_ci    pxor       m11, m11
122cabdff1aSopenharmony_ci%endif
    ; m6 = rounding bias, m7 = first coefficient row
123cabdff1aSopenharmony_ci    mova        m6, [pd_64]
124cabdff1aSopenharmony_ci    mova        m7, [filteryq+ 0]
    ; Cache the other coefficient rows in m8-m10 when registers are available.
125cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
126cabdff1aSopenharmony_ci    mova        m8, [filteryq+32]
127cabdff1aSopenharmony_ci    mova        m9, [filteryq+64]
128cabdff1aSopenharmony_ci    mova       m10, [filteryq+96]
129cabdff1aSopenharmony_ci%endif
130cabdff1aSopenharmony_ci.loop:
    ; Sample offsets -3 and -2 feed the first coefficient row (even / odd
    ; outputs), -1 and 0 the second row, +1 the third row (even outputs).
131cabdff1aSopenharmony_ci    movu        m0, [srcq-6]
132cabdff1aSopenharmony_ci    movu        m1, [srcq-4]
133cabdff1aSopenharmony_ci    movu        m2, [srcq-2]
134cabdff1aSopenharmony_ci    movu        m3, [srcq+0]
135cabdff1aSopenharmony_ci    movu        m4, [srcq+2]
136cabdff1aSopenharmony_ci    pmaddwd     m0, m7
137cabdff1aSopenharmony_ci    pmaddwd     m1, m7
138cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
139cabdff1aSopenharmony_ci    pmaddwd     m2, m8
140cabdff1aSopenharmony_ci    pmaddwd     m3, m8
141cabdff1aSopenharmony_ci    pmaddwd     m4, m9
142cabdff1aSopenharmony_ci%else
143cabdff1aSopenharmony_ci    pmaddwd     m2, [filteryq+32]
144cabdff1aSopenharmony_ci    pmaddwd     m3, [filteryq+32]
145cabdff1aSopenharmony_ci    pmaddwd     m4, [filteryq+64]
146cabdff1aSopenharmony_ci%endif
    ; m0 (even outputs) now holds three of four partial sums, m1 (odd) two.
147cabdff1aSopenharmony_ci    paddd       m0, m2
148cabdff1aSopenharmony_ci    paddd       m1, m3
149cabdff1aSopenharmony_ci    paddd       m0, m4
    ; Sample offsets +2 (third row, odd outputs), +3 and +4 (fourth row,
    ; even / odd outputs).
150cabdff1aSopenharmony_ci    movu        m2, [srcq+4]
151cabdff1aSopenharmony_ci    movu        m3, [srcq+6]
152cabdff1aSopenharmony_ci    movu        m4, [srcq+8]
    ; All loads for this row are done; step src to the next row.
153cabdff1aSopenharmony_ci    add       srcq, sstrideq
154cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
155cabdff1aSopenharmony_ci    pmaddwd     m2, m9
156cabdff1aSopenharmony_ci    pmaddwd     m3, m10
157cabdff1aSopenharmony_ci    pmaddwd     m4, m10
158cabdff1aSopenharmony_ci%else
159cabdff1aSopenharmony_ci    pmaddwd     m2, [filteryq+64]
160cabdff1aSopenharmony_ci    pmaddwd     m3, [filteryq+96]
161cabdff1aSopenharmony_ci    pmaddwd     m4, [filteryq+96]
162cabdff1aSopenharmony_ci%endif
    ; Finish both accumulators, then round and scale: (sum + 64) >> 7
163cabdff1aSopenharmony_ci    paddd       m1, m2
164cabdff1aSopenharmony_ci    paddd       m0, m3
165cabdff1aSopenharmony_ci    paddd       m1, m4
166cabdff1aSopenharmony_ci    paddd       m0, m6
167cabdff1aSopenharmony_ci    paddd       m1, m6
168cabdff1aSopenharmony_ci    psrad       m0, 7
169cabdff1aSopenharmony_ci    psrad       m1, 7
    ; Narrow each accumulator to words (packing a register with itself leaves
    ; the results in the low half of each 128-bit lane) ...
170cabdff1aSopenharmony_ci%if cpuflag(sse4)
171cabdff1aSopenharmony_ci    packusdw    m0, m0
172cabdff1aSopenharmony_ci    packusdw    m1, m1
173cabdff1aSopenharmony_ci%else
174cabdff1aSopenharmony_ci    packssdw    m0, m0
175cabdff1aSopenharmony_ci    packssdw    m1, m1
176cabdff1aSopenharmony_ci%endif
    ; ... and interleave even/odd outputs back into sample order.
177cabdff1aSopenharmony_ci    punpcklwd   m0, m1
    ; Upper clamp to the bit-depth maximum held in m5.
178cabdff1aSopenharmony_ci    pminsw      m0, m5
    ; Lower clamp to zero; only needed after the signed pack.
179cabdff1aSopenharmony_ci%if notcpuflag(sse4)
180cabdff1aSopenharmony_ci%if ARCH_X86_64
181cabdff1aSopenharmony_ci    pmaxsw      m0, m11
182cabdff1aSopenharmony_ci%else
183cabdff1aSopenharmony_ci    pxor        m2, m2
184cabdff1aSopenharmony_ci    pmaxsw      m0, m2
185cabdff1aSopenharmony_ci%endif
186cabdff1aSopenharmony_ci%endif
    ; avg only: rounded average with the existing destination samples
187cabdff1aSopenharmony_ci%ifidn %1, avg
188cabdff1aSopenharmony_ci    pavgw       m0, [dstq]
189cabdff1aSopenharmony_ci%endif
    ; Aligned store of one full vector, then advance to the next row.
190cabdff1aSopenharmony_ci    mova    [dstq], m0
191cabdff1aSopenharmony_ci    add       dstq, dstrideq
192cabdff1aSopenharmony_ci    dec         hd
193cabdff1aSopenharmony_ci    jg .loop
194cabdff1aSopenharmony_ci    RET
195cabdff1aSopenharmony_ci
; 12-bit entry point: loads the 12-bit clamp and reuses the 10-bit body.
196cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
197cabdff1aSopenharmony_ci    mova        m5, [pw_4095]
198cabdff1aSopenharmony_ci    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
199cabdff1aSopenharmony_ci%endmacro
200cabdff1aSopenharmony_ci
; Instantiate for SSE2 (8 samples per row) and, when the build provides
; HAVE_AVX2_EXTERNAL, for AVX2 (16 samples per row).
201cabdff1aSopenharmony_ciINIT_XMM sse2
202cabdff1aSopenharmony_cifilter_h_fn put
203cabdff1aSopenharmony_cifilter_h_fn avg
204cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
205cabdff1aSopenharmony_ciINIT_YMM avx2
206cabdff1aSopenharmony_cifilter_h_fn put
207cabdff1aSopenharmony_cifilter_h_fn avg
208cabdff1aSopenharmony_ci%endif
209cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; filter_v4_fn <put|avg> [xmm_count = 12]
;
; Emits vp9_<put|avg>_8tap_1d_v_4_10 and ..._v_4_12: an 8-tap filter applied
; down each column of 16-bit samples, producing 4 samples per output row.
;
; Arguments (names from cglobal): dst, dstride, src, sstride, h, filtery, plus
; two scratch pointers src4 and sstride3. The strides are added directly to the
; pointers (byte strides). filtery supplies four coefficient rows at byte
; offsets 0/32/64/96.
;
; On x86-32 only the first four arguments are loaded into registers: filtery is
; fetched by hand from the sixth argument slot (r5mp) and the row counter h is
; decremented in place in its argument slot (hd is redefined as r4mp).
;
; Row addressing: src is moved up three rows and src4 is set one row below the
; original src, so the eight taps read rows -3..0 through src and rows +1..+4
; through src4. Both pointers advance one row per iteration.
;------------------------------------------------------------------------------
210cabdff1aSopenharmony_ci%macro filter_v4_fn 1-2 12
211cabdff1aSopenharmony_ci%if ARCH_X86_64
212cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
213cabdff1aSopenharmony_ci%else
214cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
215cabdff1aSopenharmony_ci    mov   filteryq, r5mp
216cabdff1aSopenharmony_ci%define hd r4mp
217cabdff1aSopenharmony_ci%endif
    ; m5 = upper clamp for this bit depth
218cabdff1aSopenharmony_ci    mova        m5, [pw_1023]
    ; Shared body; the _12 entry point jumps here with its own clamp in m5.
219cabdff1aSopenharmony_ci.body:
    ; Zero vector for the lower clamp on the non-SSE4 (signed pack) path.
220cabdff1aSopenharmony_ci%if notcpuflag(sse4) && ARCH_X86_64
221cabdff1aSopenharmony_ci    pxor       m11, m11
222cabdff1aSopenharmony_ci%endif
    ; m6 = rounding bias
223cabdff1aSopenharmony_ci    mova        m6, [pd_64]
    ; sstride3 = 3 * sstride; src4 = src + 1 row; src = src - 3 rows
224cabdff1aSopenharmony_ci    lea  sstride3q, [sstrideq*3]
225cabdff1aSopenharmony_ci    lea      src4q, [srcq+sstrideq]
226cabdff1aSopenharmony_ci    sub       srcq, sstride3q
    ; m7 = first coefficient row; m8-m10 cache the rest on x86-64.
227cabdff1aSopenharmony_ci    mova        m7, [filteryq+  0]
228cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
229cabdff1aSopenharmony_ci    mova        m8, [filteryq+ 32]
230cabdff1aSopenharmony_ci    mova        m9, [filteryq+ 64]
231cabdff1aSopenharmony_ci    mova       m10, [filteryq+ 96]
232cabdff1aSopenharmony_ci%endif
233cabdff1aSopenharmony_ci.loop:
234cabdff1aSopenharmony_ci    ; FIXME maybe reuse loads from previous rows, or just
235cabdff1aSopenharmony_ci    ; more generally unroll this to prevent multiple loads of
236cabdff1aSopenharmony_ci    ; the same data?
    ; Rows -3..0 relative to the current output row (4 samples each).
237cabdff1aSopenharmony_ci    movh        m0, [srcq]
238cabdff1aSopenharmony_ci    movh        m1, [srcq+sstrideq]
239cabdff1aSopenharmony_ci    movh        m2, [srcq+sstrideq*2]
240cabdff1aSopenharmony_ci    movh        m3, [srcq+sstride3q]
241cabdff1aSopenharmony_ci    add       srcq, sstrideq
    ; Row +1
242cabdff1aSopenharmony_ci    movh        m4, [src4q]
    ; Interleave vertically adjacent rows so pmaddwd applies one coefficient
    ; pair per dword and sums the two products.
243cabdff1aSopenharmony_ci    punpcklwd   m0, m1
244cabdff1aSopenharmony_ci    punpcklwd   m2, m3
245cabdff1aSopenharmony_ci    pmaddwd     m0, m7
246cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
247cabdff1aSopenharmony_ci    pmaddwd     m2, m8
248cabdff1aSopenharmony_ci%else
249cabdff1aSopenharmony_ci    pmaddwd     m2, [filteryq+ 32]
250cabdff1aSopenharmony_ci%endif
    ; Rows +2, +3, +4
251cabdff1aSopenharmony_ci    movh        m1, [src4q+sstrideq]
252cabdff1aSopenharmony_ci    movh        m3, [src4q+sstrideq*2]
253cabdff1aSopenharmony_ci    paddd       m0, m2
254cabdff1aSopenharmony_ci    movh        m2, [src4q+sstride3q]
255cabdff1aSopenharmony_ci    add      src4q, sstrideq
256cabdff1aSopenharmony_ci    punpcklwd   m4, m1
257cabdff1aSopenharmony_ci    punpcklwd   m3, m2
258cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
259cabdff1aSopenharmony_ci    pmaddwd     m4, m9
260cabdff1aSopenharmony_ci    pmaddwd     m3, m10
261cabdff1aSopenharmony_ci%else
262cabdff1aSopenharmony_ci    pmaddwd     m4, [filteryq+ 64]
263cabdff1aSopenharmony_ci    pmaddwd     m3, [filteryq+ 96]
264cabdff1aSopenharmony_ci%endif
    ; m0 = sum of all eight taps; round and scale: (sum + 64) >> 7
265cabdff1aSopenharmony_ci    paddd       m0, m4
266cabdff1aSopenharmony_ci    paddd       m0, m3
267cabdff1aSopenharmony_ci    paddd       m0, m6
268cabdff1aSopenharmony_ci    psrad       m0, 7
    ; Narrow to 16 bits: unsigned saturation with SSE4, signed otherwise.
269cabdff1aSopenharmony_ci%if cpuflag(sse4)
270cabdff1aSopenharmony_ci    packusdw    m0, m0
271cabdff1aSopenharmony_ci%else
272cabdff1aSopenharmony_ci    packssdw    m0, m0
273cabdff1aSopenharmony_ci%endif
    ; avg only: fetch the four existing destination samples
274cabdff1aSopenharmony_ci%ifidn %1, avg
275cabdff1aSopenharmony_ci    movh        m1, [dstq]
276cabdff1aSopenharmony_ci%endif
    ; Upper clamp to the bit-depth maximum held in m5.
277cabdff1aSopenharmony_ci    pminsw      m0, m5
    ; Lower clamp to zero; only needed after the signed pack.
278cabdff1aSopenharmony_ci%if notcpuflag(sse4)
279cabdff1aSopenharmony_ci%if ARCH_X86_64
280cabdff1aSopenharmony_ci    pmaxsw      m0, m11
281cabdff1aSopenharmony_ci%else
282cabdff1aSopenharmony_ci    pxor        m2, m2
283cabdff1aSopenharmony_ci    pmaxsw      m0, m2
284cabdff1aSopenharmony_ci%endif
285cabdff1aSopenharmony_ci%endif
    ; avg only: rounded average of the filtered and the existing samples
286cabdff1aSopenharmony_ci%ifidn %1, avg
287cabdff1aSopenharmony_ci    pavgw       m0, m1
288cabdff1aSopenharmony_ci%endif
    ; Store four samples and move on to the next row.
289cabdff1aSopenharmony_ci    movh    [dstq], m0
290cabdff1aSopenharmony_ci    add       dstq, dstrideq
291cabdff1aSopenharmony_ci    dec         hd
292cabdff1aSopenharmony_ci    jg .loop
293cabdff1aSopenharmony_ci    RET
294cabdff1aSopenharmony_ci
; 12-bit entry point: same argument setup, 12-bit clamp, then tail-jump into
; the 10-bit body. hd is not redefined here because the loop that uses it lives
; in the function above.
295cabdff1aSopenharmony_ci%if ARCH_X86_64
296cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
297cabdff1aSopenharmony_ci%else
298cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
299cabdff1aSopenharmony_ci    mov   filteryq, r5mp
300cabdff1aSopenharmony_ci%endif
301cabdff1aSopenharmony_ci    mova        m5, [pw_4095]
302cabdff1aSopenharmony_ci    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
303cabdff1aSopenharmony_ci%endmacro
304cabdff1aSopenharmony_ci
; Instantiate the 4-sample vertical filter for SSE2 (put and avg flavours).
305cabdff1aSopenharmony_ciINIT_XMM sse2
306cabdff1aSopenharmony_cifilter_v4_fn put
307cabdff1aSopenharmony_cifilter_v4_fn avg
308cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; filter_v_fn <put|avg> [vector_register_count = 13]
;
; Full-vector-width variant of the vertical 8-tap filter. Emits
; vp9_<put|avg>_8tap_1d_v_<px>_10 and ..._<px>_12, where px = mmsize/2 is the
; number of 16-bit samples produced per row (8 for xmm, 16 for ymm).
;
; Arguments and x86-32 handling match filter_v4_fn: dst, dstride, src, sstride,
; h, filtery plus scratch src4/sstride3; on x86-32 filtery is loaded from r5mp
; and the row counter lives in its argument slot (hd = r4mp).
;
; Register use differs from the 4-sample version because SBUTTERFLY needs a
; scratch register and m6 is used for that: the rounding bias is therefore kept
; in m11 (x86-64) or read from memory (x86-32), and the zero vector for the
; lower clamp moves to m12 - hence the default register count of 13.
;
; SBUTTERFLY wd, a, b, tmp (from the included x86util.asm) interleaves the
; words of two rows: a receives the low-half interleave and b the high-half
; interleave. m0 thus accumulates the low half of the output row and m1 the
; high half, and the final pack of m0 with m1 restores sample order.
;
; dst is read (avg) and written with aligned accesses (mova / pavgw with a
; memory operand).
;------------------------------------------------------------------------------
309cabdff1aSopenharmony_ci%macro filter_v_fn 1-2 13
310cabdff1aSopenharmony_ci%assign %%px mmsize/2
311cabdff1aSopenharmony_ci%if ARCH_X86_64
312cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
313cabdff1aSopenharmony_ci%else
314cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
315cabdff1aSopenharmony_ci    mov   filteryq, r5mp
316cabdff1aSopenharmony_ci%define hd r4mp
317cabdff1aSopenharmony_ci%endif
    ; m5 = upper clamp for this bit depth
318cabdff1aSopenharmony_ci    mova        m5, [pw_1023]
    ; Shared body; the _12 entry point jumps here with its own clamp in m5.
319cabdff1aSopenharmony_ci.body:
    ; Zero vector for the lower clamp on the non-SSE4 (signed pack) path.
320cabdff1aSopenharmony_ci%if notcpuflag(sse4) && ARCH_X86_64
321cabdff1aSopenharmony_ci    pxor       m12, m12
322cabdff1aSopenharmony_ci%endif
    ; Rounding bias goes in m11 because m6 is SBUTTERFLY scratch below.
323cabdff1aSopenharmony_ci%if ARCH_X86_64
324cabdff1aSopenharmony_ci    mova       m11, [pd_64]
325cabdff1aSopenharmony_ci%endif
    ; sstride3 = 3 * sstride; src4 = src + 1 row; src = src - 3 rows
326cabdff1aSopenharmony_ci    lea  sstride3q, [sstrideq*3]
327cabdff1aSopenharmony_ci    lea      src4q, [srcq+sstrideq]
328cabdff1aSopenharmony_ci    sub       srcq, sstride3q
    ; m7 = first coefficient row; m8-m10 cache the rest on x86-64.
329cabdff1aSopenharmony_ci    mova        m7, [filteryq+  0]
330cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
331cabdff1aSopenharmony_ci    mova        m8, [filteryq+ 32]
332cabdff1aSopenharmony_ci    mova        m9, [filteryq+ 64]
333cabdff1aSopenharmony_ci    mova       m10, [filteryq+ 96]
334cabdff1aSopenharmony_ci%endif
335cabdff1aSopenharmony_ci.loop:
336cabdff1aSopenharmony_ci    ; FIXME maybe reuse loads from previous rows, or just
337cabdff1aSopenharmony_ci    ; more generally unroll this to prevent multiple loads of
338cabdff1aSopenharmony_ci    ; the same data?
    ; Rows -3..0 relative to the current output row, then row +1.
339cabdff1aSopenharmony_ci    movu        m0, [srcq]
340cabdff1aSopenharmony_ci    movu        m1, [srcq+sstrideq]
341cabdff1aSopenharmony_ci    movu        m2, [srcq+sstrideq*2]
342cabdff1aSopenharmony_ci    movu        m3, [srcq+sstride3q]
343cabdff1aSopenharmony_ci    add       srcq, sstrideq
344cabdff1aSopenharmony_ci    movu        m4, [src4q]
    ; Pair rows (-3,-2) into m0/m1 and rows (-1,0) into m2/m3 as low/high
    ; interleaves, then apply the first two coefficient rows.
345cabdff1aSopenharmony_ci    SBUTTERFLY  wd, 0, 1, 6
346cabdff1aSopenharmony_ci    SBUTTERFLY  wd, 2, 3, 6
347cabdff1aSopenharmony_ci    pmaddwd     m0, m7
348cabdff1aSopenharmony_ci    pmaddwd     m1, m7
349cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
350cabdff1aSopenharmony_ci    pmaddwd     m2, m8
351cabdff1aSopenharmony_ci    pmaddwd     m3, m8
352cabdff1aSopenharmony_ci%else
353cabdff1aSopenharmony_ci    pmaddwd     m2, [filteryq+ 32]
354cabdff1aSopenharmony_ci    pmaddwd     m3, [filteryq+ 32]
355cabdff1aSopenharmony_ci%endif
356cabdff1aSopenharmony_ci    paddd       m0, m2
357cabdff1aSopenharmony_ci    paddd       m1, m3
    ; Rows +2 and +3; rows (+1,+2) are paired into m4/m2 for the third
    ; coefficient row.
358cabdff1aSopenharmony_ci    movu        m2, [src4q+sstrideq]
359cabdff1aSopenharmony_ci    movu        m3, [src4q+sstrideq*2]
360cabdff1aSopenharmony_ci    SBUTTERFLY  wd, 4, 2, 6
361cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
362cabdff1aSopenharmony_ci    pmaddwd     m4, m9
363cabdff1aSopenharmony_ci    pmaddwd     m2, m9
364cabdff1aSopenharmony_ci%else
365cabdff1aSopenharmony_ci    pmaddwd     m4, [filteryq+ 64]
366cabdff1aSopenharmony_ci    pmaddwd     m2, [filteryq+ 64]
367cabdff1aSopenharmony_ci%endif
368cabdff1aSopenharmony_ci    paddd       m0, m4
369cabdff1aSopenharmony_ci    paddd       m1, m2
    ; Row +4; rows (+3,+4) are paired into m3/m4 for the fourth coefficient
    ; row.
370cabdff1aSopenharmony_ci    movu        m4, [src4q+sstride3q]
371cabdff1aSopenharmony_ci    add      src4q, sstrideq
372cabdff1aSopenharmony_ci    SBUTTERFLY  wd, 3, 4, 6
373cabdff1aSopenharmony_ci%if ARCH_X86_64 && mmsize > 8
374cabdff1aSopenharmony_ci    pmaddwd     m3, m10
375cabdff1aSopenharmony_ci    pmaddwd     m4, m10
376cabdff1aSopenharmony_ci%else
377cabdff1aSopenharmony_ci    pmaddwd     m3, [filteryq+ 96]
378cabdff1aSopenharmony_ci    pmaddwd     m4, [filteryq+ 96]
379cabdff1aSopenharmony_ci%endif
380cabdff1aSopenharmony_ci    paddd       m0, m3
381cabdff1aSopenharmony_ci    paddd       m1, m4
    ; Round and scale: (sum + 64) >> 7
382cabdff1aSopenharmony_ci%if ARCH_X86_64
383cabdff1aSopenharmony_ci    paddd       m0, m11
384cabdff1aSopenharmony_ci    paddd       m1, m11
385cabdff1aSopenharmony_ci%else
386cabdff1aSopenharmony_ci    paddd       m0, [pd_64]
387cabdff1aSopenharmony_ci    paddd       m1, [pd_64]
388cabdff1aSopenharmony_ci%endif
389cabdff1aSopenharmony_ci    psrad       m0, 7
390cabdff1aSopenharmony_ci    psrad       m1, 7
    ; Narrow low-half (m0) and high-half (m1) sums into one vector of words;
    ; unsigned saturation with SSE4, signed otherwise.
391cabdff1aSopenharmony_ci%if cpuflag(sse4)
392cabdff1aSopenharmony_ci    packusdw    m0, m1
393cabdff1aSopenharmony_ci%else
394cabdff1aSopenharmony_ci    packssdw    m0, m1
395cabdff1aSopenharmony_ci%endif
    ; Upper clamp to the bit-depth maximum held in m5.
396cabdff1aSopenharmony_ci    pminsw      m0, m5
    ; Lower clamp to zero; only needed after the signed pack.
397cabdff1aSopenharmony_ci%if notcpuflag(sse4)
398cabdff1aSopenharmony_ci%if ARCH_X86_64
399cabdff1aSopenharmony_ci    pmaxsw      m0, m12
400cabdff1aSopenharmony_ci%else
401cabdff1aSopenharmony_ci    pxor        m2, m2
402cabdff1aSopenharmony_ci    pmaxsw      m0, m2
403cabdff1aSopenharmony_ci%endif
404cabdff1aSopenharmony_ci%endif
    ; avg only: rounded average with the existing destination samples
405cabdff1aSopenharmony_ci%ifidn %1, avg
406cabdff1aSopenharmony_ci    pavgw       m0, [dstq]
407cabdff1aSopenharmony_ci%endif
    ; Aligned store of one full vector, then advance to the next row.
408cabdff1aSopenharmony_ci    mova    [dstq], m0
409cabdff1aSopenharmony_ci    add       dstq, dstrideq
410cabdff1aSopenharmony_ci    dec         hd
411cabdff1aSopenharmony_ci    jg .loop
412cabdff1aSopenharmony_ci    RET
413cabdff1aSopenharmony_ci
; 12-bit entry point: same argument setup, 12-bit clamp, then tail-jump into
; the 10-bit body above.
414cabdff1aSopenharmony_ci%if ARCH_X86_64
415cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
416cabdff1aSopenharmony_ci%else
417cabdff1aSopenharmony_cicglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
418cabdff1aSopenharmony_ci    mov   filteryq, r5mp
419cabdff1aSopenharmony_ci%endif
420cabdff1aSopenharmony_ci    mova        m5, [pw_4095]
421cabdff1aSopenharmony_ci    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
422cabdff1aSopenharmony_ci%endmacro
423cabdff1aSopenharmony_ci
; Instantiate for SSE2 (8 samples per row) and, when the build provides
; HAVE_AVX2_EXTERNAL, for AVX2 (16 samples per row).
424cabdff1aSopenharmony_ciINIT_XMM sse2
425cabdff1aSopenharmony_cifilter_v_fn put
426cabdff1aSopenharmony_cifilter_v_fn avg
427cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
428cabdff1aSopenharmony_ciINIT_YMM avx2
429cabdff1aSopenharmony_cifilter_v_fn put
430cabdff1aSopenharmony_cifilter_v_fn avg
431cabdff1aSopenharmony_ci%endif
432