1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* VP9 loop filter SIMD optimizations
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ciSECTION_RODATA
26cabdff1aSopenharmony_ci
; Local word constants; each label is one 16-bit value repeated 16 times
; (32 bytes).
; 511/-512 and 2047/-2048 equal the %%maxsgn/%%minsgn values that LOOP_FILTER
; selects for 10 and 12 bits per pixel respectively.
; NOTE(review): presumably they are referenced as pw_ %+ %%maxsgn and
; pw_ %+ %%minsgn further down the file - confirm against the users.
27cabdff1aSopenharmony_cipw_511: times 16 dw 511
28cabdff1aSopenharmony_cipw_2047: times 16 dw 2047
29cabdff1aSopenharmony_cipw_16384: times 16 dw 16384
30cabdff1aSopenharmony_cipw_m512: times 16 dw -512
31cabdff1aSopenharmony_cipw_m2048: times 16 dw -2048
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_cicextern pw_1
34cabdff1aSopenharmony_cicextern pw_3
35cabdff1aSopenharmony_cicextern pw_4
36cabdff1aSopenharmony_cicextern pw_8
37cabdff1aSopenharmony_cicextern pw_16
38cabdff1aSopenharmony_cicextern pw_256
39cabdff1aSopenharmony_cicextern pw_1023
40cabdff1aSopenharmony_cicextern pw_4095
41cabdff1aSopenharmony_cicextern pw_m1
42cabdff1aSopenharmony_ci
43cabdff1aSopenharmony_ciSECTION .text
44cabdff1aSopenharmony_ci
; SCRATCH src, spill, mem[, name]
;
; Park the value currently held in m<src> so that m<src> can be reused.
;   %1 = index of the register holding the value
;   %2 = index of the register to park it in (only used on x86-64)
;   %3 = address of the spill slot (only used on x86-32, written with mova)
;   %4 = optional name; when given, reg_<name> is defined so later code can
;        use the parked value as an operand on either architecture
;
; x86-64: SWAP (from x86inc) exchanges the m%1/m%2 name mapping, so no data
;         move is emitted; the value is then reachable as m%2.
; x86-32: the value is stored to [%3] and reg_<name> becomes that memory
;         operand.
45cabdff1aSopenharmony_ci%macro SCRATCH 3-4
46cabdff1aSopenharmony_ci%if ARCH_X86_64
47cabdff1aSopenharmony_ci    SWAP                %1, %2
48cabdff1aSopenharmony_ci%if %0 == 4
49cabdff1aSopenharmony_ci%define reg_%4 m%2
50cabdff1aSopenharmony_ci%endif
51cabdff1aSopenharmony_ci%else
52cabdff1aSopenharmony_ci    mova              [%3], m%1
53cabdff1aSopenharmony_ci%if %0 == 4
54cabdff1aSopenharmony_ci%define reg_%4 [%3]
55cabdff1aSopenharmony_ci%endif
56cabdff1aSopenharmony_ci%endif
57cabdff1aSopenharmony_ci%endmacro
58cabdff1aSopenharmony_ci
; UNSCRATCH dst, spill, mem[, name]
;
; Counterpart of SCRATCH: make a parked value available again as m<dst>.
;   %1 = index of the register that should hold the value afterwards
;   %2 = index of the register it was parked in (only used on x86-64)
;   %3 = address of the spill slot (only used on x86-32, read with mova)
;   %4 = optional name; when given, the reg_<name> alias is removed
;
; x86-64: SWAP exchanges the m%1/m%2 name mapping (no data move).
; x86-32: the value is reloaded from [%3] into m%1.
59cabdff1aSopenharmony_ci%macro UNSCRATCH 3-4
60cabdff1aSopenharmony_ci%if ARCH_X86_64
61cabdff1aSopenharmony_ci    SWAP                %1, %2
62cabdff1aSopenharmony_ci%else
63cabdff1aSopenharmony_ci    mova               m%1, [%3]
64cabdff1aSopenharmony_ci%endif
65cabdff1aSopenharmony_ci%if %0 == 4
66cabdff1aSopenharmony_ci%undef reg_%4
67cabdff1aSopenharmony_ci%endif
68cabdff1aSopenharmony_ci%endmacro
69cabdff1aSopenharmony_ci
; PRELOAD reg, addr[, name]
;
; Make the memory operand at addr usable under the alias reg_<name>.
;   %1 = index of the register to load into (only used on x86-64)
;   %2 = address of the operand
;   %3 = optional name for the reg_<name> alias
;
; x86-64: [%2] is loaded into m%1 and reg_<name> refers to that register.
; x86-32: no instruction is emitted; reg_<name> (if a name was given) refers
;         directly to the memory operand [%2], so m%1 is left untouched.
70cabdff1aSopenharmony_ci%macro PRELOAD 2-3
71cabdff1aSopenharmony_ci%if ARCH_X86_64
72cabdff1aSopenharmony_ci    mova               m%1, [%2]
73cabdff1aSopenharmony_ci%if %0 == 3
74cabdff1aSopenharmony_ci%define reg_%3 m%1
75cabdff1aSopenharmony_ci%endif
76cabdff1aSopenharmony_ci%elif %0 == 3
77cabdff1aSopenharmony_ci%define reg_%3 [%2]
78cabdff1aSopenharmony_ci%endif
79cabdff1aSopenharmony_ci%endmacro
80cabdff1aSopenharmony_ci
81cabdff1aSopenharmony_ci; calculate p or q portion of flat8out
;
; in:  m0     = q0 (or p0)
;      m4-m7  = q4-q7 (or p4-p7, in any order)
;      reg_F  = threshold F, set up by the caller (PRELOAD ..., F)
; out: m7     = per-word mask, all-ones in every lane where at least one of
;               the four abs() differences exceeds F, i.e. "!flat8out" for
;               this side
; m4-m6 are overwritten; m2/m3 are handed to ABS2 as temporaries and must be
; treated as clobbered. m0 and m1 are not written.
; The inline comments name the q side; the p side runs the same code.
82cabdff1aSopenharmony_ci%macro FLAT8OUT_HALF 0
83cabdff1aSopenharmony_ci    psubw               m4, m0                      ; q4-q0
84cabdff1aSopenharmony_ci    psubw               m5, m0                      ; q5-q0
85cabdff1aSopenharmony_ci    psubw               m6, m0                      ; q6-q0
86cabdff1aSopenharmony_ci    psubw               m7, m0                      ; q7-q0
87cabdff1aSopenharmony_ci    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
88cabdff1aSopenharmony_ci    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
89cabdff1aSopenharmony_ci    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
90cabdff1aSopenharmony_ci    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
91cabdff1aSopenharmony_ci    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
92cabdff1aSopenharmony_ci    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
    ; fold the four comparison results into m7
93cabdff1aSopenharmony_ci    por                 m5, m4
94cabdff1aSopenharmony_ci    por                 m7, m6
95cabdff1aSopenharmony_ci    por                 m7, m5                      ; !flat8out, q portion
96cabdff1aSopenharmony_ci%endmacro
97cabdff1aSopenharmony_ci
98cabdff1aSopenharmony_ci; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
;
; %1 = filter width (wd); the flat8in terms are only computed for wd > 4
;
; in:  m0-m3 = q0-q3 (or p0-p3)
;      reg_I, reg_H = thresholds I and H; reg_F = threshold F (wd > 4 only)
; out: m2 = abs(q3-q2) > I | abs(q2-q1) > I | abs(q1-q0) > I   ("!fm" portion)
;      m7 = abs(q1-q0) > H                                      ("hev" portion)
;      m4 = abs(q3-q0) > F | abs(q2-q0) > F | abs(q1-q0) > F   ("!flat8in"
;           portion, wd > 4 only)
; m0 and m1 are not written; m3, m5 and m6 are used as scratch.
; The inline comments name the q side; the p side runs the same code.
99cabdff1aSopenharmony_ci%macro FLAT8IN_HALF 1
100cabdff1aSopenharmony_ci%if %1 > 4
101cabdff1aSopenharmony_ci    psubw               m4, m3, m0                  ; q3-q0
102cabdff1aSopenharmony_ci    psubw               m5, m2, m0                  ; q2-q0
103cabdff1aSopenharmony_ci    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
104cabdff1aSopenharmony_ci    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
105cabdff1aSopenharmony_ci    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
106cabdff1aSopenharmony_ci%endif
    ; from here on m3 and m2 no longer hold q3/q2 but neighbour differences
107cabdff1aSopenharmony_ci    psubw               m3, m2                      ; q3-q2
108cabdff1aSopenharmony_ci    psubw               m2, m1                      ; q2-q1
109cabdff1aSopenharmony_ci    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
110cabdff1aSopenharmony_ci    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
111cabdff1aSopenharmony_ci    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
112cabdff1aSopenharmony_ci%if %1 > 4
113cabdff1aSopenharmony_ci    por                 m4, m5
114cabdff1aSopenharmony_ci%endif
115cabdff1aSopenharmony_ci    por                 m2, m3
    ; abs(q1-q0) is shared by the F (flat8in), H (hev) and I (fm) comparisons
116cabdff1aSopenharmony_ci    psubw               m3, m1, m0                  ; q1-q0
117cabdff1aSopenharmony_ci    ABS1                m3, m5                      ; abs(q1-q0)
118cabdff1aSopenharmony_ci%if %1 > 4
119cabdff1aSopenharmony_ci    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
120cabdff1aSopenharmony_ci%endif
121cabdff1aSopenharmony_ci    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
122cabdff1aSopenharmony_ci    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
123cabdff1aSopenharmony_ci%if %1 > 4
124cabdff1aSopenharmony_ci    por                 m4, m6
125cabdff1aSopenharmony_ci%endif
126cabdff1aSopenharmony_ci    por                 m2, m3
127cabdff1aSopenharmony_ci%endmacro
128cabdff1aSopenharmony_ci
129cabdff1aSopenharmony_ci; one step in filter_14/filter_6
130cabdff1aSopenharmony_ci;
131cabdff1aSopenharmony_ci; take sum $reg, downshift, apply mask and write into dst
132cabdff1aSopenharmony_ci;
133cabdff1aSopenharmony_ci; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
134cabdff1aSopenharmony_ci; step's sum $reg. This is omitted for the last row in each filter.
135cabdff1aSopenharmony_ci;
136cabdff1aSopenharmony_ci; if dont_store is set, don't write the result into memory, instead keep the
137cabdff1aSopenharmony_ci; values in register so we can write it out later
;
; parameters:
;   %1  tmp         scratch register
;   %2  reg         register holding the running sum
;   %3  mask        name suffix; the mask operand actually used is reg_<mask>
;   %4  shift       right-shift count applied to the sum
;   %5  dst         address the result is stored to (unused if dont_store == 1)
;   %6  src/sub1    unfiltered pixel of this row; also subtracted from the sum
;   %7  sub2        second term subtracted from the sum; the default ""
;                   skips the whole sum update
;   %8  add1        first term added to the sum
;   %9  add2        second term added to the sum
;   %10 dont_store  1 = leave the result in %6 (which must then be a
;                   register); anything else stores it to [%5] (default 0)
;
; The result is formed as src + (((sum >> shift) - src) & mask), so lanes
; where the mask is zero keep the unfiltered pixel.
138cabdff1aSopenharmony_ci%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
139cabdff1aSopenharmony_ci                                      ; src/sub1, sub2, add1, add2, dont_store
140cabdff1aSopenharmony_ci    psrlw               %1, %2, %4
141cabdff1aSopenharmony_ci    psubw               %1, %6                      ; abs->delta
142cabdff1aSopenharmony_ci%ifnidn %7, ""
    ; slide the sum to the next row: drop sub1 and sub2, bring in add1 and add2
143cabdff1aSopenharmony_ci    psubw               %2, %6
144cabdff1aSopenharmony_ci    psubw               %2, %7
145cabdff1aSopenharmony_ci    paddw               %2, %8
146cabdff1aSopenharmony_ci    paddw               %2, %9
147cabdff1aSopenharmony_ci%endif
148cabdff1aSopenharmony_ci    pand                %1, reg_%3                  ; apply mask
149cabdff1aSopenharmony_ci%if %10 == 1
150cabdff1aSopenharmony_ci    paddw               %6, %1                      ; delta->abs
151cabdff1aSopenharmony_ci%else
152cabdff1aSopenharmony_ci    paddw               %1, %6                      ; delta->abs
153cabdff1aSopenharmony_ci    mova              [%5], %1
154cabdff1aSopenharmony_ci%endif
155cabdff1aSopenharmony_ci%endmacro
156cabdff1aSopenharmony_ci
157cabdff1aSopenharmony_ci; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
158cabdff1aSopenharmony_ci
159cabdff1aSopenharmony_ci%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
160cabdff1aSopenharmony_ci
161cabdff1aSopenharmony_ci%if ARCH_X86_64
162cabdff1aSopenharmony_ci%if %2 == 16
163cabdff1aSopenharmony_ci%assign %%num_xmm_regs 16
164cabdff1aSopenharmony_ci%elif %2 == 8
165cabdff1aSopenharmony_ci%assign %%num_xmm_regs 15
166cabdff1aSopenharmony_ci%else ; %2 == 4
167cabdff1aSopenharmony_ci%assign %%num_xmm_regs 14
168cabdff1aSopenharmony_ci%endif ; %2
169cabdff1aSopenharmony_ci%assign %%bak_mem 0
170cabdff1aSopenharmony_ci%else ; ARCH_X86_32
171cabdff1aSopenharmony_ci%assign %%num_xmm_regs 8
172cabdff1aSopenharmony_ci%if %2 == 16
173cabdff1aSopenharmony_ci%assign %%bak_mem 7
174cabdff1aSopenharmony_ci%elif %2 == 8
175cabdff1aSopenharmony_ci%assign %%bak_mem 6
176cabdff1aSopenharmony_ci%else ; %2 == 4
177cabdff1aSopenharmony_ci%assign %%bak_mem 5
178cabdff1aSopenharmony_ci%endif ; %2
179cabdff1aSopenharmony_ci%endif ; ARCH_X86_64/32
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_ci%if %2 == 16
182cabdff1aSopenharmony_ci%ifidn %1, v
183cabdff1aSopenharmony_ci%assign %%num_gpr_regs 6
184cabdff1aSopenharmony_ci%else ; %1 == h
185cabdff1aSopenharmony_ci%assign %%num_gpr_regs 5
186cabdff1aSopenharmony_ci%endif ; %1
187cabdff1aSopenharmony_ci%assign %%wd_mem 6
188cabdff1aSopenharmony_ci%else ; %2 == 8/4
189cabdff1aSopenharmony_ci%assign %%num_gpr_regs 5
190cabdff1aSopenharmony_ci%if ARCH_X86_32 && %2 == 8
191cabdff1aSopenharmony_ci%assign %%wd_mem 2
192cabdff1aSopenharmony_ci%else ; ARCH_X86_64 || %2 == 4
193cabdff1aSopenharmony_ci%assign %%wd_mem 0
194cabdff1aSopenharmony_ci%endif ; ARCH_X86_64/32 etc.
195cabdff1aSopenharmony_ci%endif ; %2
196cabdff1aSopenharmony_ci
197cabdff1aSopenharmony_ci%ifidn %1, v
198cabdff1aSopenharmony_ci%assign %%tsp_mem 0
199cabdff1aSopenharmony_ci%elif %2 == 16 ; && %1 == h
200cabdff1aSopenharmony_ci%assign %%tsp_mem 16
201cabdff1aSopenharmony_ci%else ; %1 == h && %2 == 8/4
202cabdff1aSopenharmony_ci%assign %%tsp_mem 8
203cabdff1aSopenharmony_ci%endif ; %1/%2
204cabdff1aSopenharmony_ci
205cabdff1aSopenharmony_ci%assign %%off %%wd_mem
206cabdff1aSopenharmony_ci%assign %%tspoff %%bak_mem+%%wd_mem
207cabdff1aSopenharmony_ci%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
208cabdff1aSopenharmony_ci
209cabdff1aSopenharmony_ci%if %3 == 10
210cabdff1aSopenharmony_ci%define %%maxsgn 511
211cabdff1aSopenharmony_ci%define %%minsgn m512
212cabdff1aSopenharmony_ci%define %%maxusgn 1023
213cabdff1aSopenharmony_ci%define %%maxf 4
214cabdff1aSopenharmony_ci%else ; %3 == 12
215cabdff1aSopenharmony_ci%define %%maxsgn 2047
216cabdff1aSopenharmony_ci%define %%minsgn m2048
217cabdff1aSopenharmony_ci%define %%maxusgn 4095
218cabdff1aSopenharmony_ci%define %%maxf 16
219cabdff1aSopenharmony_ci%endif ; %3
220cabdff1aSopenharmony_ci
221cabdff1aSopenharmony_cicglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
222cabdff1aSopenharmony_ci    ; prepare E, I and H masks
223cabdff1aSopenharmony_ci    shl                 Ed, %3-8
224cabdff1aSopenharmony_ci    shl                 Id, %3-8
225cabdff1aSopenharmony_ci    shl                 Hd, %3-8
226cabdff1aSopenharmony_ci%if cpuflag(ssse3)
227cabdff1aSopenharmony_ci    mova                m0, [pw_256]
228cabdff1aSopenharmony_ci%endif
229cabdff1aSopenharmony_ci    movd                m1, Ed
230cabdff1aSopenharmony_ci    movd                m2, Id
231cabdff1aSopenharmony_ci    movd                m3, Hd
232cabdff1aSopenharmony_ci%if cpuflag(ssse3)
233cabdff1aSopenharmony_ci    pshufb              m1, m0                      ; E << (bit_depth - 8)
234cabdff1aSopenharmony_ci    pshufb              m2, m0                      ; I << (bit_depth - 8)
235cabdff1aSopenharmony_ci    pshufb              m3, m0                      ; H << (bit_depth - 8)
236cabdff1aSopenharmony_ci%else
237cabdff1aSopenharmony_ci    punpcklwd           m1, m1
238cabdff1aSopenharmony_ci    punpcklwd           m2, m2
239cabdff1aSopenharmony_ci    punpcklwd           m3, m3
240cabdff1aSopenharmony_ci    pshufd              m1, m1, q0000
241cabdff1aSopenharmony_ci    pshufd              m2, m2, q0000
242cabdff1aSopenharmony_ci    pshufd              m3, m3, q0000
243cabdff1aSopenharmony_ci%endif
244cabdff1aSopenharmony_ci    SCRATCH              1,  8, rsp+(%%off+0)*mmsize,  E
245cabdff1aSopenharmony_ci    SCRATCH              2,  9, rsp+(%%off+1)*mmsize,  I
246cabdff1aSopenharmony_ci    SCRATCH              3, 10, rsp+(%%off+2)*mmsize,  H
247cabdff1aSopenharmony_ci%if %2 > 4
248cabdff1aSopenharmony_ci    PRELOAD                 11, pw_ %+ %%maxf, F
249cabdff1aSopenharmony_ci%endif
250cabdff1aSopenharmony_ci
251cabdff1aSopenharmony_ci    ; set up variables to load data
252cabdff1aSopenharmony_ci%ifidn %1, v
253cabdff1aSopenharmony_ci    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
254cabdff1aSopenharmony_ci    lea           stride3q, [strideq*3]
255cabdff1aSopenharmony_ci    neg            strideq
256cabdff1aSopenharmony_ci%if %2 == 16
257cabdff1aSopenharmony_ci    lea              dst0q, [dst8q+strideq*8]
258cabdff1aSopenharmony_ci%else
259cabdff1aSopenharmony_ci    lea              dst4q, [dst8q+strideq*4]
260cabdff1aSopenharmony_ci%endif
261cabdff1aSopenharmony_ci    neg            strideq
262cabdff1aSopenharmony_ci%if %2 == 16
263cabdff1aSopenharmony_ci    lea             dst12q, [dst8q+strideq*4]
264cabdff1aSopenharmony_ci    lea              dst4q, [dst0q+strideq*4]
265cabdff1aSopenharmony_ci%endif
266cabdff1aSopenharmony_ci
267cabdff1aSopenharmony_ci%if %2 == 16
268cabdff1aSopenharmony_ci%define %%p7 dst0q
269cabdff1aSopenharmony_ci%define %%p6 dst0q+strideq
270cabdff1aSopenharmony_ci%define %%p5 dst0q+strideq*2
271cabdff1aSopenharmony_ci%define %%p4 dst0q+stride3q
272cabdff1aSopenharmony_ci%endif
273cabdff1aSopenharmony_ci%define %%p3 dst4q
274cabdff1aSopenharmony_ci%define %%p2 dst4q+strideq
275cabdff1aSopenharmony_ci%define %%p1 dst4q+strideq*2
276cabdff1aSopenharmony_ci%define %%p0 dst4q+stride3q
277cabdff1aSopenharmony_ci%define %%q0 dst8q
278cabdff1aSopenharmony_ci%define %%q1 dst8q+strideq
279cabdff1aSopenharmony_ci%define %%q2 dst8q+strideq*2
280cabdff1aSopenharmony_ci%define %%q3 dst8q+stride3q
281cabdff1aSopenharmony_ci%if %2 == 16
282cabdff1aSopenharmony_ci%define %%q4 dst12q
283cabdff1aSopenharmony_ci%define %%q5 dst12q+strideq
284cabdff1aSopenharmony_ci%define %%q6 dst12q+strideq*2
285cabdff1aSopenharmony_ci%define %%q7 dst12q+stride3q
286cabdff1aSopenharmony_ci%endif
287cabdff1aSopenharmony_ci%else ; %1 == h
288cabdff1aSopenharmony_ci    DEFINE_ARGS dst0, stride, stride3, dst4
289cabdff1aSopenharmony_ci    lea           stride3q, [strideq*3]
290cabdff1aSopenharmony_ci    lea              dst4q, [dst0q+strideq*4]
291cabdff1aSopenharmony_ci
292cabdff1aSopenharmony_ci%define %%p3 rsp+(%%tspoff+0)*mmsize
293cabdff1aSopenharmony_ci%define %%p2 rsp+(%%tspoff+1)*mmsize
294cabdff1aSopenharmony_ci%define %%p1 rsp+(%%tspoff+2)*mmsize
295cabdff1aSopenharmony_ci%define %%p0 rsp+(%%tspoff+3)*mmsize
296cabdff1aSopenharmony_ci%define %%q0 rsp+(%%tspoff+4)*mmsize
297cabdff1aSopenharmony_ci%define %%q1 rsp+(%%tspoff+5)*mmsize
298cabdff1aSopenharmony_ci%define %%q2 rsp+(%%tspoff+6)*mmsize
299cabdff1aSopenharmony_ci%define %%q3 rsp+(%%tspoff+7)*mmsize
300cabdff1aSopenharmony_ci
301cabdff1aSopenharmony_ci%if %2 < 16
302cabdff1aSopenharmony_ci    movu                m0, [dst0q+strideq*0-8]
303cabdff1aSopenharmony_ci    movu                m1, [dst0q+strideq*1-8]
304cabdff1aSopenharmony_ci    movu                m2, [dst0q+strideq*2-8]
305cabdff1aSopenharmony_ci    movu                m3, [dst0q+stride3q -8]
306cabdff1aSopenharmony_ci    movu                m4, [dst4q+strideq*0-8]
307cabdff1aSopenharmony_ci    movu                m5, [dst4q+strideq*1-8]
308cabdff1aSopenharmony_ci    movu                m6, [dst4q+strideq*2-8]
309cabdff1aSopenharmony_ci    movu                m7, [dst4q+stride3q -8]
310cabdff1aSopenharmony_ci
311cabdff1aSopenharmony_ci%if ARCH_X86_64
312cabdff1aSopenharmony_ci    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
313cabdff1aSopenharmony_ci%else
314cabdff1aSopenharmony_ci    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
315cabdff1aSopenharmony_ci%endif
316cabdff1aSopenharmony_ci
317cabdff1aSopenharmony_ci    mova            [%%p3], m0
318cabdff1aSopenharmony_ci    mova            [%%p2], m1
319cabdff1aSopenharmony_ci    mova            [%%p1], m2
320cabdff1aSopenharmony_ci    mova            [%%p0], m3
321cabdff1aSopenharmony_ci%if ARCH_X86_64
322cabdff1aSopenharmony_ci    mova            [%%q0], m4
323cabdff1aSopenharmony_ci%endif
324cabdff1aSopenharmony_ci    mova            [%%q1], m5
325cabdff1aSopenharmony_ci    mova            [%%q2], m6
326cabdff1aSopenharmony_ci    mova            [%%q3], m7
327cabdff1aSopenharmony_ci
328cabdff1aSopenharmony_ci    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
329cabdff1aSopenharmony_ci    ; order here accordingly
330cabdff1aSopenharmony_ci%else ; %2 == 16
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_ci%define %%p7 rsp+(%%tspoff+ 8)*mmsize
333cabdff1aSopenharmony_ci%define %%p6 rsp+(%%tspoff+ 9)*mmsize
334cabdff1aSopenharmony_ci%define %%p5 rsp+(%%tspoff+10)*mmsize
335cabdff1aSopenharmony_ci%define %%p4 rsp+(%%tspoff+11)*mmsize
336cabdff1aSopenharmony_ci%define %%q4 rsp+(%%tspoff+12)*mmsize
337cabdff1aSopenharmony_ci%define %%q5 rsp+(%%tspoff+13)*mmsize
338cabdff1aSopenharmony_ci%define %%q6 rsp+(%%tspoff+14)*mmsize
339cabdff1aSopenharmony_ci%define %%q7 rsp+(%%tspoff+15)*mmsize
340cabdff1aSopenharmony_ci
341cabdff1aSopenharmony_ci    mova                m0, [dst0q+strideq*0-16]
342cabdff1aSopenharmony_ci    mova                m1, [dst0q+strideq*1-16]
343cabdff1aSopenharmony_ci    mova                m2, [dst0q+strideq*2-16]
344cabdff1aSopenharmony_ci    mova                m3, [dst0q+stride3q -16]
345cabdff1aSopenharmony_ci    mova                m4, [dst4q+strideq*0-16]
346cabdff1aSopenharmony_ci    mova                m5, [dst4q+strideq*1-16]
347cabdff1aSopenharmony_ci%if ARCH_X86_64
348cabdff1aSopenharmony_ci    mova                m6, [dst4q+strideq*2-16]
349cabdff1aSopenharmony_ci%endif
350cabdff1aSopenharmony_ci    mova                m7, [dst4q+stride3q -16]
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_ci%if ARCH_X86_64
353cabdff1aSopenharmony_ci    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
354cabdff1aSopenharmony_ci%else
355cabdff1aSopenharmony_ci    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
356cabdff1aSopenharmony_ci%endif
357cabdff1aSopenharmony_ci
358cabdff1aSopenharmony_ci    mova            [%%p7], m0
359cabdff1aSopenharmony_ci    mova            [%%p6], m1
360cabdff1aSopenharmony_ci    mova            [%%p5], m2
361cabdff1aSopenharmony_ci    mova            [%%p4], m3
362cabdff1aSopenharmony_ci%if ARCH_X86_64
363cabdff1aSopenharmony_ci    mova            [%%p3], m4
364cabdff1aSopenharmony_ci%endif
365cabdff1aSopenharmony_ci    mova            [%%p2], m5
366cabdff1aSopenharmony_ci    mova            [%%p1], m6
367cabdff1aSopenharmony_ci    mova            [%%p0], m7
368cabdff1aSopenharmony_ci
369cabdff1aSopenharmony_ci    mova                m0, [dst0q+strideq*0]
370cabdff1aSopenharmony_ci    mova                m1, [dst0q+strideq*1]
371cabdff1aSopenharmony_ci    mova                m2, [dst0q+strideq*2]
372cabdff1aSopenharmony_ci    mova                m3, [dst0q+stride3q ]
373cabdff1aSopenharmony_ci    mova                m4, [dst4q+strideq*0]
374cabdff1aSopenharmony_ci    mova                m5, [dst4q+strideq*1]
375cabdff1aSopenharmony_ci%if ARCH_X86_64
376cabdff1aSopenharmony_ci    mova                m6, [dst4q+strideq*2]
377cabdff1aSopenharmony_ci%endif
378cabdff1aSopenharmony_ci    mova                m7, [dst4q+stride3q ]
379cabdff1aSopenharmony_ci
380cabdff1aSopenharmony_ci%if ARCH_X86_64
381cabdff1aSopenharmony_ci    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
382cabdff1aSopenharmony_ci%else
383cabdff1aSopenharmony_ci    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
384cabdff1aSopenharmony_ci%endif
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_ci    mova            [%%q0], m0
387cabdff1aSopenharmony_ci    mova            [%%q1], m1
388cabdff1aSopenharmony_ci    mova            [%%q2], m2
389cabdff1aSopenharmony_ci    mova            [%%q3], m3
390cabdff1aSopenharmony_ci%if ARCH_X86_64
391cabdff1aSopenharmony_ci    mova            [%%q4], m4
392cabdff1aSopenharmony_ci%endif
393cabdff1aSopenharmony_ci    mova            [%%q5], m5
394cabdff1aSopenharmony_ci    mova            [%%q6], m6
395cabdff1aSopenharmony_ci    mova            [%%q7], m7
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
398cabdff1aSopenharmony_ci    ; order here accordingly
399cabdff1aSopenharmony_ci%endif ; %2
400cabdff1aSopenharmony_ci%endif ; %1
401cabdff1aSopenharmony_ci
402cabdff1aSopenharmony_ci    ; load q0|q4-7 data
403cabdff1aSopenharmony_ci    mova                m0, [%%q0]
404cabdff1aSopenharmony_ci%if %2 == 16
405cabdff1aSopenharmony_ci    mova                m4, [%%q4]
406cabdff1aSopenharmony_ci    mova                m5, [%%q5]
407cabdff1aSopenharmony_ci    mova                m6, [%%q6]
408cabdff1aSopenharmony_ci    mova                m7, [%%q7]
409cabdff1aSopenharmony_ci
410cabdff1aSopenharmony_ci    ; flat8out q portion
411cabdff1aSopenharmony_ci    FLAT8OUT_HALF
412cabdff1aSopenharmony_ci    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
413cabdff1aSopenharmony_ci%endif
414cabdff1aSopenharmony_ci
415cabdff1aSopenharmony_ci    ; load q1-3 data
416cabdff1aSopenharmony_ci    mova                m1, [%%q1]
417cabdff1aSopenharmony_ci    mova                m2, [%%q2]
418cabdff1aSopenharmony_ci    mova                m3, [%%q3]
419cabdff1aSopenharmony_ci
420cabdff1aSopenharmony_ci    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
421cabdff1aSopenharmony_ci    ; r9[m15]=!flat8out[q]
422cabdff1aSopenharmony_ci    ; m12-14=free
423cabdff1aSopenharmony_ci    ; m0-3=q0-q3
424cabdff1aSopenharmony_ci    ; m4-7=free
425cabdff1aSopenharmony_ci
426cabdff1aSopenharmony_ci    ; flat8in|fm|hev q portion
427cabdff1aSopenharmony_ci    FLAT8IN_HALF        %2
428cabdff1aSopenharmony_ci    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
429cabdff1aSopenharmony_ci%if %2 > 4
430cabdff1aSopenharmony_ci    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
431cabdff1aSopenharmony_ci%endif
432cabdff1aSopenharmony_ci
433cabdff1aSopenharmony_ci    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
434cabdff1aSopenharmony_ci    ; r9[m15]=!flat8out[q]
435cabdff1aSopenharmony_ci    ; r10[m13]=hev[q]
436cabdff1aSopenharmony_ci    ; r11[m14]=!flat8in[q]
437cabdff1aSopenharmony_ci    ; m2=!fm[q]
438cabdff1aSopenharmony_ci    ; m0,1=q0-q1
439cabdff1aSopenharmony_ci    ; m2-7=free
440cabdff1aSopenharmony_ci    ; m12=free
441cabdff1aSopenharmony_ci
442cabdff1aSopenharmony_ci    ; load p0-1
443cabdff1aSopenharmony_ci    mova                m3, [%%p0]
444cabdff1aSopenharmony_ci    mova                m4, [%%p1]
445cabdff1aSopenharmony_ci
446cabdff1aSopenharmony_ci    ; fm mb_edge portion
447cabdff1aSopenharmony_ci    psubw               m5, m3, m0                  ; q0-p0
448cabdff1aSopenharmony_ci    psubw               m6, m4, m1                  ; q1-p1
449cabdff1aSopenharmony_ci%if ARCH_X86_64
450cabdff1aSopenharmony_ci    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
451cabdff1aSopenharmony_ci%else
452cabdff1aSopenharmony_ci    ABS1                m5, m7                      ; abs(q0-p0)
453cabdff1aSopenharmony_ci    ABS1                m6, m7                      ; abs(q1-p1)
454cabdff1aSopenharmony_ci%endif
455cabdff1aSopenharmony_ci    paddw               m5, m5
456cabdff1aSopenharmony_ci    psraw               m6, 1
457cabdff1aSopenharmony_ci    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
458cabdff1aSopenharmony_ci    pcmpgtw             m6, reg_E
459cabdff1aSopenharmony_ci    por                 m2, m6
460cabdff1aSopenharmony_ci    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM
461cabdff1aSopenharmony_ci
462cabdff1aSopenharmony_ci    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
463cabdff1aSopenharmony_ci    ; r9[m15]=!flat8out[q]
464cabdff1aSopenharmony_ci    ; r10[m13]=hev[q]
465cabdff1aSopenharmony_ci    ; r11[m14]=!flat8in[q]
466cabdff1aSopenharmony_ci    ; r12[m12]=!fm[q]
467cabdff1aSopenharmony_ci    ; m3-4=p0-1
468cabdff1aSopenharmony_ci    ; m0-2/5-7=free
469cabdff1aSopenharmony_ci
470cabdff1aSopenharmony_ci    ; load p4-7 data
471cabdff1aSopenharmony_ci    SWAP                 3, 0                       ; p0
472cabdff1aSopenharmony_ci    SWAP                 4, 1                       ; p1
473cabdff1aSopenharmony_ci%if %2 == 16
474cabdff1aSopenharmony_ci    mova                m7, [%%p7]
475cabdff1aSopenharmony_ci    mova                m6, [%%p6]
476cabdff1aSopenharmony_ci    mova                m5, [%%p5]
477cabdff1aSopenharmony_ci    mova                m4, [%%p4]
478cabdff1aSopenharmony_ci
479cabdff1aSopenharmony_ci    ; flat8out p portion
480cabdff1aSopenharmony_ci    FLAT8OUT_HALF
481cabdff1aSopenharmony_ci    por                 m7, reg_F8O
482cabdff1aSopenharmony_ci    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
483cabdff1aSopenharmony_ci%endif
484cabdff1aSopenharmony_ci
485cabdff1aSopenharmony_ci    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
486cabdff1aSopenharmony_ci    ; r9[m15]=!flat8out
487cabdff1aSopenharmony_ci    ; r10[m13]=hev[q]
488cabdff1aSopenharmony_ci    ; r11[m14]=!flat8in[q]
489cabdff1aSopenharmony_ci    ; r12[m12]=!fm[q]
490cabdff1aSopenharmony_ci    ; m0,1=p0-p1
491cabdff1aSopenharmony_ci    ; m2-7=free
492cabdff1aSopenharmony_ci
493cabdff1aSopenharmony_ci    ; load p2-3 data
494cabdff1aSopenharmony_ci    mova                m2, [%%p2]
495cabdff1aSopenharmony_ci    mova                m3, [%%p3]
496cabdff1aSopenharmony_ci
497cabdff1aSopenharmony_ci    ; flat8in|fm|hev p portion
498cabdff1aSopenharmony_ci    FLAT8IN_HALF        %2
499cabdff1aSopenharmony_ci    por                 m7, reg_HEV
500cabdff1aSopenharmony_ci%if %2 > 4
501cabdff1aSopenharmony_ci    por                 m4, reg_F8I
502cabdff1aSopenharmony_ci%endif
503cabdff1aSopenharmony_ci    por                 m2, reg_FM
504cabdff1aSopenharmony_ci%if %2 > 4
505cabdff1aSopenharmony_ci    por                 m4, m2                      ; !flat8|!fm
506cabdff1aSopenharmony_ci%if %2 == 16
507cabdff1aSopenharmony_ci    por                 m5, m4, reg_F8O             ; !flat16|!fm
508cabdff1aSopenharmony_ci    pandn               m2, m4                      ; filter4_mask
509cabdff1aSopenharmony_ci    pandn               m4, m5                      ; filter8_mask
510cabdff1aSopenharmony_ci    pxor                m5, [pw_m1]                 ; filter16_mask
511cabdff1aSopenharmony_ci    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
512cabdff1aSopenharmony_ci%else
513cabdff1aSopenharmony_ci    pandn               m2, m4                      ; filter4_mask
514cabdff1aSopenharmony_ci    pxor                m4, [pw_m1]                 ; filter8_mask
515cabdff1aSopenharmony_ci%endif
516cabdff1aSopenharmony_ci    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
517cabdff1aSopenharmony_ci%else
518cabdff1aSopenharmony_ci    pxor                m2, [pw_m1]                 ; filter4_mask
519cabdff1aSopenharmony_ci%endif
520cabdff1aSopenharmony_ci    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
521cabdff1aSopenharmony_ci    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M
522cabdff1aSopenharmony_ci
523cabdff1aSopenharmony_ci    ; r9[m15]=filter16_mask
524cabdff1aSopenharmony_ci    ; r10[m13]=hev
525cabdff1aSopenharmony_ci    ; r11[m14]=filter8_mask
526cabdff1aSopenharmony_ci    ; r12[m12]=filter4_mask
527cabdff1aSopenharmony_ci    ; m0,1=p0-p1
528cabdff1aSopenharmony_ci    ; m2-7=free
529cabdff1aSopenharmony_ci    ; m8-11=free
530cabdff1aSopenharmony_ci
531cabdff1aSopenharmony_ci%if %2 > 4
532cabdff1aSopenharmony_ci%if %2 == 16
533cabdff1aSopenharmony_ci    ; filter_14
534cabdff1aSopenharmony_ci    mova                m2, [%%p7]
535cabdff1aSopenharmony_ci    mova                m3, [%%p6]
536cabdff1aSopenharmony_ci    mova                m6, [%%p5]
537cabdff1aSopenharmony_ci    mova                m7, [%%p4]
538cabdff1aSopenharmony_ci    PRELOAD              8, %%p3, P3
539cabdff1aSopenharmony_ci    PRELOAD              9, %%p2, P2
540cabdff1aSopenharmony_ci%endif
541cabdff1aSopenharmony_ci    PRELOAD             10, %%q0, Q0
542cabdff1aSopenharmony_ci    PRELOAD             11, %%q1, Q1
543cabdff1aSopenharmony_ci%if %2 == 16
544cabdff1aSopenharmony_ci    psllw               m4, m2, 3
545cabdff1aSopenharmony_ci    paddw               m5, m3, m3
546cabdff1aSopenharmony_ci    paddw               m4, m6
547cabdff1aSopenharmony_ci    paddw               m5, m7
548cabdff1aSopenharmony_ci    paddw               m4, reg_P3
549cabdff1aSopenharmony_ci    paddw               m5, reg_P2
550cabdff1aSopenharmony_ci    paddw               m4, m1
551cabdff1aSopenharmony_ci    paddw               m5, m0
552cabdff1aSopenharmony_ci    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
553cabdff1aSopenharmony_ci    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
554cabdff1aSopenharmony_ci    paddw               m4, [pw_8]
555cabdff1aSopenharmony_ci    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
556cabdff1aSopenharmony_ci
557cabdff1aSopenharmony_ci    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
558cabdff1aSopenharmony_ci    ; at the end of the filter
559cabdff1aSopenharmony_ci
560cabdff1aSopenharmony_ci    mova    [rsp+0*mmsize], m3
561cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3,     m2,             m6,     reg_Q1
562cabdff1aSopenharmony_ci%endif
563cabdff1aSopenharmony_ci    mova                m3, [%%q2]
564cabdff1aSopenharmony_ci%if %2 == 16
565cabdff1aSopenharmony_ci    mova    [rsp+1*mmsize], m6
566cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6,     m2,             m7,     m3
567cabdff1aSopenharmony_ci%endif
568cabdff1aSopenharmony_ci    mova                m6, [%%q3]
569cabdff1aSopenharmony_ci%if %2 == 16
570cabdff1aSopenharmony_ci    mova    [rsp+2*mmsize], m7
571cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7,     m2,             reg_P3, m6
572cabdff1aSopenharmony_ci    mova                m7, [%%q4]
573cabdff1aSopenharmony_ci%if ARCH_X86_64
574cabdff1aSopenharmony_ci    mova    [rsp+3*mmsize], reg_P3
575cabdff1aSopenharmony_ci%else
576cabdff1aSopenharmony_ci    mova                m4, reg_P3
577cabdff1aSopenharmony_ci    mova    [rsp+3*mmsize], m4
578cabdff1aSopenharmony_ci%endif
579cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2,             reg_P2, m7
580cabdff1aSopenharmony_ci    PRELOAD              8, %%q5, Q5
581cabdff1aSopenharmony_ci%if ARCH_X86_64
582cabdff1aSopenharmony_ci    mova    [rsp+4*mmsize], reg_P2
583cabdff1aSopenharmony_ci%else
584cabdff1aSopenharmony_ci    mova                m4, reg_P2
585cabdff1aSopenharmony_ci    mova    [rsp+4*mmsize], m4
586cabdff1aSopenharmony_ci%endif
587cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2,             m1,     reg_Q5
588cabdff1aSopenharmony_ci    PRELOAD              9, %%q6, Q6
589cabdff1aSopenharmony_ci    mova    [rsp+5*mmsize], m1
590cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1,     m2,             m0,     reg_Q6
591cabdff1aSopenharmony_ci    mova                m1, [%%q7]
592cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0,     m2,             reg_Q0, m1,     1
593cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1,     ARCH_X86_64
594cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m1,     ARCH_X86_64
595cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3,     [rsp+2*mmsize], m6,     m1,     1
596cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6,     [rsp+3*mmsize], m7,     m1
597cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7,     [rsp+4*mmsize], reg_Q5, m1
598cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
599cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6
600cabdff1aSopenharmony_ci
601cabdff1aSopenharmony_ci    mova                m7, [%%p1]
602cabdff1aSopenharmony_ci%else
603cabdff1aSopenharmony_ci    SWAP                 1, 7
604cabdff1aSopenharmony_ci%endif
605cabdff1aSopenharmony_ci
606cabdff1aSopenharmony_ci    mova                m2, [%%p3]
607cabdff1aSopenharmony_ci    mova                m1, [%%p2]
608cabdff1aSopenharmony_ci
609cabdff1aSopenharmony_ci    ; reg_Q0-1 (m10-m11)
610cabdff1aSopenharmony_ci    ; m0=p0
611cabdff1aSopenharmony_ci    ; m1=p2
612cabdff1aSopenharmony_ci    ; m2=p3
613cabdff1aSopenharmony_ci    ; m3=q2
614cabdff1aSopenharmony_ci    ; m4-5=free
615cabdff1aSopenharmony_ci    ; m6=q3
616cabdff1aSopenharmony_ci    ; m7=p1
617cabdff1aSopenharmony_ci    ; m8-9 unused
618cabdff1aSopenharmony_ci
619cabdff1aSopenharmony_ci    ; filter_6
620cabdff1aSopenharmony_ci    psllw               m4, m2, 2
621cabdff1aSopenharmony_ci    paddw               m5, m1, m1
622cabdff1aSopenharmony_ci    paddw               m4, m7
623cabdff1aSopenharmony_ci    psubw               m5, m2
624cabdff1aSopenharmony_ci    paddw               m4, m0
625cabdff1aSopenharmony_ci    paddw               m5, reg_Q0
626cabdff1aSopenharmony_ci    paddw               m4, [pw_4]
627cabdff1aSopenharmony_ci    paddw               m5, m4
628cabdff1aSopenharmony_ci
629cabdff1aSopenharmony_ci%if ARCH_X86_64
630cabdff1aSopenharmony_ci    mova                m8, m1
631cabdff1aSopenharmony_ci    mova                m9, m7
632cabdff1aSopenharmony_ci%else
633cabdff1aSopenharmony_ci    mova    [rsp+0*mmsize], m1
634cabdff1aSopenharmony_ci    mova    [rsp+1*mmsize], m7
635cabdff1aSopenharmony_ci%endif
636cabdff1aSopenharmony_ci%ifidn %1, v
637cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1
638cabdff1aSopenharmony_ci%else
639cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1, 1
640cabdff1aSopenharmony_ci%endif
641cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7,     m2,             m0,     m3, 1
642cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0,     m2,             reg_Q0, m6, 1
643cabdff1aSopenharmony_ci%if ARCH_X86_64
644cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8,             reg_Q1, m6, ARCH_X86_64
645cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9,             m3,     m6, ARCH_X86_64
646cabdff1aSopenharmony_ci%else
647cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
648cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m6, ARCH_X86_64
649cabdff1aSopenharmony_ci%endif
650cabdff1aSopenharmony_ci    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3
651cabdff1aSopenharmony_ci
652cabdff1aSopenharmony_ci    UNSCRATCH            2, 10, %%q0
653cabdff1aSopenharmony_ci    UNSCRATCH            6, 11, %%q1
654cabdff1aSopenharmony_ci%else
655cabdff1aSopenharmony_ci    SWAP                 1, 7
656cabdff1aSopenharmony_ci    mova                m2, [%%q0]
657cabdff1aSopenharmony_ci    mova                m6, [%%q1]
658cabdff1aSopenharmony_ci%endif
659cabdff1aSopenharmony_ci    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV
660cabdff1aSopenharmony_ci
661cabdff1aSopenharmony_ci    ; m0=p0
662cabdff1aSopenharmony_ci    ; m1=p2
663cabdff1aSopenharmony_ci    ; m2=q0
664cabdff1aSopenharmony_ci    ; m3=hev_mask
665cabdff1aSopenharmony_ci    ; m4-5=free
666cabdff1aSopenharmony_ci    ; m6=q1
667cabdff1aSopenharmony_ci    ; m7=p1
668cabdff1aSopenharmony_ci
669cabdff1aSopenharmony_ci    ; filter_4
670cabdff1aSopenharmony_ci    psubw               m4, m7, m6              ; p1-q1
671cabdff1aSopenharmony_ci    psubw               m5, m2, m0              ; q0-p0
672cabdff1aSopenharmony_ci    pand                m4, m3
673cabdff1aSopenharmony_ci    pminsw              m4, [pw_ %+ %%maxsgn]
674cabdff1aSopenharmony_ci    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(p1-q1, 9) -> f
675cabdff1aSopenharmony_ci    paddw               m4, m5
676cabdff1aSopenharmony_ci    paddw               m5, m5
677cabdff1aSopenharmony_ci    paddw               m4, m5                  ; 3*(q0-p0)+f
678cabdff1aSopenharmony_ci    pminsw              m4, [pw_ %+ %%maxsgn]
679cabdff1aSopenharmony_ci    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(3*(q0-p0)+f, 9) -> f
680cabdff1aSopenharmony_ci    pand                m4, reg_F4M
681cabdff1aSopenharmony_ci    paddw               m5, m4, [pw_4]
682cabdff1aSopenharmony_ci    paddw               m4, [pw_3]
683cabdff1aSopenharmony_ci    pminsw              m5, [pw_ %+ %%maxsgn]
684cabdff1aSopenharmony_ci    pminsw              m4, [pw_ %+ %%maxsgn]
685cabdff1aSopenharmony_ci    psraw               m5, 3                   ; min_intp2(f+4, 9)>>3 -> f1
686cabdff1aSopenharmony_ci    psraw               m4, 3                   ; min_intp2(f+3, 9)>>3 -> f2
687cabdff1aSopenharmony_ci    psubw               m2, m5                  ; q0-f1
688cabdff1aSopenharmony_ci    paddw               m0, m4                  ; p0+f2
689cabdff1aSopenharmony_ci    pandn               m3, m5                  ; f1 & !hev (for p1/q1 adj)
690cabdff1aSopenharmony_ci    pxor                m4, m4
691cabdff1aSopenharmony_ci    mova                m5, [pw_ %+ %%maxusgn]
692cabdff1aSopenharmony_ci    pmaxsw              m2, m4
693cabdff1aSopenharmony_ci    pmaxsw              m0, m4
694cabdff1aSopenharmony_ci    pminsw              m2, m5
695cabdff1aSopenharmony_ci    pminsw              m0, m5
696cabdff1aSopenharmony_ci%if cpuflag(ssse3)
697cabdff1aSopenharmony_ci    pmulhrsw            m3, [pw_16384]          ; (f1+1)>>1
698cabdff1aSopenharmony_ci%else
699cabdff1aSopenharmony_ci    paddw               m3, [pw_1]
700cabdff1aSopenharmony_ci    psraw               m3, 1
701cabdff1aSopenharmony_ci%endif
702cabdff1aSopenharmony_ci    paddw               m7, m3                  ; p1+f
703cabdff1aSopenharmony_ci    psubw               m6, m3                  ; q1-f
704cabdff1aSopenharmony_ci    pmaxsw              m7, m4
705cabdff1aSopenharmony_ci    pmaxsw              m6, m4
706cabdff1aSopenharmony_ci    pminsw              m7, m5
707cabdff1aSopenharmony_ci    pminsw              m6, m5
708cabdff1aSopenharmony_ci
709cabdff1aSopenharmony_ci    ; store
710cabdff1aSopenharmony_ci%ifidn %1, v
711cabdff1aSopenharmony_ci    mova            [%%p1], m7
712cabdff1aSopenharmony_ci    mova            [%%p0], m0
713cabdff1aSopenharmony_ci    mova            [%%q0], m2
714cabdff1aSopenharmony_ci    mova            [%%q1], m6
715cabdff1aSopenharmony_ci%else ; %1 == h
716cabdff1aSopenharmony_ci%if %2 == 4
717cabdff1aSopenharmony_ci    TRANSPOSE4x4W        7, 0, 2, 6, 1
718cabdff1aSopenharmony_ci    movh   [dst0q+strideq*0-4], m7
719cabdff1aSopenharmony_ci    movhps [dst0q+strideq*1-4], m7
720cabdff1aSopenharmony_ci    movh   [dst0q+strideq*2-4], m0
721cabdff1aSopenharmony_ci    movhps [dst0q+stride3q -4], m0
722cabdff1aSopenharmony_ci    movh   [dst4q+strideq*0-4], m2
723cabdff1aSopenharmony_ci    movhps [dst4q+strideq*1-4], m2
724cabdff1aSopenharmony_ci    movh   [dst4q+strideq*2-4], m6
725cabdff1aSopenharmony_ci    movhps [dst4q+stride3q -4], m6
726cabdff1aSopenharmony_ci%elif %2 == 8
727cabdff1aSopenharmony_ci    mova                m3, [%%p3]
728cabdff1aSopenharmony_ci    mova                m4, [%%q2]
729cabdff1aSopenharmony_ci    mova                m5, [%%q3]
730cabdff1aSopenharmony_ci
731cabdff1aSopenharmony_ci%if ARCH_X86_64
732cabdff1aSopenharmony_ci    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
733cabdff1aSopenharmony_ci%else
734cabdff1aSopenharmony_ci    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
735cabdff1aSopenharmony_ci    mova                m2, [%%q0]
736cabdff1aSopenharmony_ci%endif
737cabdff1aSopenharmony_ci
738cabdff1aSopenharmony_ci    movu [dst0q+strideq*0-8], m3
739cabdff1aSopenharmony_ci    movu [dst0q+strideq*1-8], m1
740cabdff1aSopenharmony_ci    movu [dst0q+strideq*2-8], m7
741cabdff1aSopenharmony_ci    movu [dst0q+stride3q -8], m0
742cabdff1aSopenharmony_ci    movu [dst4q+strideq*0-8], m2
743cabdff1aSopenharmony_ci    movu [dst4q+strideq*1-8], m6
744cabdff1aSopenharmony_ci    movu [dst4q+strideq*2-8], m4
745cabdff1aSopenharmony_ci    movu [dst4q+stride3q -8], m5
746cabdff1aSopenharmony_ci%else ; %2 == 16
747cabdff1aSopenharmony_ci    SCRATCH              2, 8, %%q0
748cabdff1aSopenharmony_ci    SCRATCH              6, 9, %%q1
749cabdff1aSopenharmony_ci    mova                m2, [%%p7]
750cabdff1aSopenharmony_ci    mova                m3, [%%p6]
751cabdff1aSopenharmony_ci    mova                m4, [%%p5]
752cabdff1aSopenharmony_ci    mova                m5, [%%p4]
753cabdff1aSopenharmony_ci    mova                m6, [%%p3]
754cabdff1aSopenharmony_ci
755cabdff1aSopenharmony_ci%if ARCH_X86_64
756cabdff1aSopenharmony_ci    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
757cabdff1aSopenharmony_ci%else
758cabdff1aSopenharmony_ci    mova            [%%p1], m7
759cabdff1aSopenharmony_ci    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
760cabdff1aSopenharmony_ci%endif
761cabdff1aSopenharmony_ci
762cabdff1aSopenharmony_ci    mova [dst0q+strideq*0-16], m2
763cabdff1aSopenharmony_ci    mova [dst0q+strideq*1-16], m3
764cabdff1aSopenharmony_ci    mova [dst0q+strideq*2-16], m4
765cabdff1aSopenharmony_ci    mova [dst0q+stride3q -16], m5
766cabdff1aSopenharmony_ci%if ARCH_X86_64
767cabdff1aSopenharmony_ci    mova [dst4q+strideq*0-16], m6
768cabdff1aSopenharmony_ci%endif
769cabdff1aSopenharmony_ci    mova [dst4q+strideq*1-16], m1
770cabdff1aSopenharmony_ci    mova [dst4q+strideq*2-16], m7
771cabdff1aSopenharmony_ci    mova [dst4q+stride3q -16], m0
772cabdff1aSopenharmony_ci
773cabdff1aSopenharmony_ci    UNSCRATCH            2, 8, %%q0
774cabdff1aSopenharmony_ci    UNSCRATCH            6, 9, %%q1
775cabdff1aSopenharmony_ci    mova                m0, [%%q2]
776cabdff1aSopenharmony_ci    mova                m1, [%%q3]
777cabdff1aSopenharmony_ci    mova                m3, [%%q4]
778cabdff1aSopenharmony_ci    mova                m4, [%%q5]
779cabdff1aSopenharmony_ci%if ARCH_X86_64
780cabdff1aSopenharmony_ci    mova                m5, [%%q6]
781cabdff1aSopenharmony_ci%endif
782cabdff1aSopenharmony_ci    mova                m7, [%%q7]
783cabdff1aSopenharmony_ci
784cabdff1aSopenharmony_ci%if ARCH_X86_64
785cabdff1aSopenharmony_ci    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
786cabdff1aSopenharmony_ci%else
787cabdff1aSopenharmony_ci    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
788cabdff1aSopenharmony_ci%endif
789cabdff1aSopenharmony_ci
790cabdff1aSopenharmony_ci    mova [dst0q+strideq*0], m2
791cabdff1aSopenharmony_ci    mova [dst0q+strideq*1], m6
792cabdff1aSopenharmony_ci    mova [dst0q+strideq*2], m0
793cabdff1aSopenharmony_ci    mova [dst0q+stride3q ], m1
794cabdff1aSopenharmony_ci%if ARCH_X86_64
795cabdff1aSopenharmony_ci    mova [dst4q+strideq*0], m3
796cabdff1aSopenharmony_ci%endif
797cabdff1aSopenharmony_ci    mova [dst4q+strideq*1], m4
798cabdff1aSopenharmony_ci    mova [dst4q+strideq*2], m5
799cabdff1aSopenharmony_ci    mova [dst4q+stride3q ], m7
800cabdff1aSopenharmony_ci%endif ; %2
801cabdff1aSopenharmony_ci%endif ; %1
802cabdff1aSopenharmony_ci    RET
803cabdff1aSopenharmony_ci%endmacro
804cabdff1aSopenharmony_ci
; LOOP_FILTER_ONE_CPU cpuflag, arg1, arg2, arg3
; Runs INIT_XMM with the given CPU flag name and then instantiates
; LOOP_FILTER once with the remaining three arguments, in order.
%macro LOOP_FILTER_ONE_CPU 4
INIT_XMM %1
LOOP_FILTER %2, %3, %4
%endmacro

; LOOP_FILTER_CPUSETS dir, wd, depth
;   dir   - h or v, forwarded to LOOP_FILTER as its first argument
;   wd    - 4, 8 or 16 at the call sites in this file
;   depth - 10 or 12 at the call sites in this file (presumably the sample
;           bit depth - confirm against the LOOP_FILTER macro header)
; Emits the same LOOP_FILTER instantiation three times, once after each of
; INIT_XMM sse2, INIT_XMM ssse3 and INIT_XMM avx, in that order.
%macro LOOP_FILTER_CPUSETS 3
LOOP_FILTER_ONE_CPU sse2,  %1, %2, %3
LOOP_FILTER_ONE_CPU ssse3, %1, %2, %3
LOOP_FILTER_ONE_CPU avx,   %1, %2, %3
%endmacro
813cabdff1aSopenharmony_ci
; LOOP_FILTER_WDSETS dir, depth
; Expands LOOP_FILTER_CPUSETS three times, with 4, 8 and 16 (in that order)
; as its second argument; %1 and %2 are forwarded unchanged as the first and
; third arguments.
%macro LOOP_FILTER_WDSETS 2
%assign %%wd 4                      ; first value; doubled after each pass
%rep 3                              ; yields 4, 8, 16
LOOP_FILTER_CPUSETS %1, %%wd, %2
%assign %%wd %%wd*2
%endrep
%endmacro
819cabdff1aSopenharmony_ci
; LOOP_FILTER_DIRSETS depth
; Expands LOOP_FILTER_WDSETS for direction h and then direction v, passing
; the given value through as the second argument.
%macro LOOP_FILTER_DIRSETS 1
LOOP_FILTER_WDSETS h, %1
LOOP_FILTER_WDSETS v, %1
%endmacro

; Instantiate every variant: h then v for 10, followed by h then v for 12.
LOOP_FILTER_DIRSETS 10
LOOP_FILTER_DIRSETS 12
824