;******************************************************************************
;* x86-optimized functions for the CFHD decoder
;* Copyright (c) 2020 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Every table below is exactly 16 bytes (8 words or 4 dwords), i.e. one XMM
; register. The filters load them with mova, so each table must stay 16-byte
; aligned; because all tables are 16 bytes long this holds as long as the
; section start is aligned (presumably guaranteed by SECTION_RODATA -- confirm
; before inserting a table of a different size).

; pmaddwd weights applied to interleaved word pairs (a, b):
factor_p1_n1:  dw  1, -1,  1, -1,  1, -1,  1, -1    ;  a - b
factor_n1_p1:  dw -1,  1, -1,  1, -1,  1, -1,  1    ;  b - a
factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4    ; 11*a - 4*b (vertical edge rows)
factor_p5_p4:  dw  5,  4,  5,  4,  5,  4,  5,  4    ;  5*a + 4*b (vertical edge rows)

pd_4:    times 4 dd 4       ; rounding bias added before the >> 3
pw_1:    times 8 dw 1       ; pmaddwd weights for a + b
pw_0:    times 8 dw 0       ; lower clip bound
pw_1023: times 8 dw 1023    ; upper clip bound, cfhd_horiz_filter_clip10
pw_4095: times 8 dw 4095    ; upper clip bound, cfhd_horiz_filter_clip12

SECTION .text

;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER clip
;
; Expands to one horizontal filter function, selected by the macro argument:
;   0    -> cfhd_horiz_filter         (loops over `height` rows, no clipping)
;   1023 -> cfhd_horiz_filter_clip10  (one row, results clipped to [0, 1023])
;   4095 -> cfhd_horiz_filter_clip12  (one row, results clipped to [0, 4095])
;
; Samples are signed 16-bit words. With w = width in samples, one row of
; 2*w output samples is produced from w low and w high samples:
;   out[0]    = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]    = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2i]   = (((low[i-1] - low[i+1] + 4) >> 3) + low[i] + high[i]) >> 1
;   out[2i+1] = (((low[i+1] - low[i-1] + 4) >> 3) + low[i] - high[i]) >> 1
;   out[2w-2] = ((( 5*low[w-1] + 4*low[w-2] - low[w-3] + 4) >> 3) + high[w-1]) >> 1
;   out[2w-1] = (((11*low[w-1] - 4*low[w-2] + low[w-3] + 4) >> 3) - high[w-1]) >> 1
;
; The first and last pair are computed with scalar code, everything in
; between with SIMD, 8 input positions (16 outputs) per iteration. The scalar
; results are truncated to 16 bits on store, while the SIMD results are
; saturated by packssdw.
;
; NOTE(review): the SIMD loop always works on whole registers and runs while
; the byte offset is below the row size, so its last iteration can load and
; store past the end of the row (the tail code then rewrites the last two
; outputs). Presumably callers provide padded buffers -- confirm.
;
; NOTE(review): sizes/strides are doubled through their 32-bit register view,
; so on x86-64 they are zero-extended, i.e. treated as non-negative 32-bit
; values.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
; Five arguments are loaded; the fifth one lands in x and is overwritten
; before it is ever read.
cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1                ; samples -> bytes
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%elif %1 == 4095
; Same register layout as the clip10 variant.
cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1                ; samples -> bytes
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%else
%if ARCH_X86_64
cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
    ; Turn all sample counts into byte counts.
    shl  ostrided, 1
    shl   lwidthd, 1
    shl   hwidthd, 1
    shl    widthd, 1

    ; y counts rows from -height up to 0.
    mov        yd, heightd
    neg        yq
%else
; Only 7 GPRs are available here: ostride, lwidth and hwidth arrive in the
; registers named x, y and temp, are doubled, written back to their own
; stack argument slots and from then on accessed as memory operands, which
; frees the three registers for use as scratch.
cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
    shl        xd, 1
    shl        yd, 1
    shl     tempd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov    tempmp, tempq

    ; y counts rows from -height up to 0; height is read from its stack slot.
    mov        yd, r7m
    neg        yq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  tempm
%endif
%endif

%if ARCH_X86_64
    ; Enough XMM registers to keep the constants resident.
    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
%endif

%if %1 == 0
.looph:
%endif
    ; ---- left edge, out[0]:
    ; (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
    movsx          xq, word [lowq]
    imul           xq, 11

    movsx       tempq, word [lowq + 2]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    add         tempq, xq
    sar         tempq, 1

%if %1
    ; Clip through a vector register; CLIPW (x86util.asm) presumably clamps
    ; each word lane to [min, max], only lane 0 is used here.
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq], tempw

    ; ---- left edge, out[1]:
    ; (((5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
    movsx          xq, word [lowq]
    imul           xq, 5

    movsx       tempq, word [lowq + 2]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq + 2], tempw

    ; x = byte offset into low/high for the SIMD loop. No flags are consumed
    ; before the next cmp, so the xor zeroing idiom is safe here.
    xor            xd, xd

.loop:
    ; With i = x / 2: m4 = low[i .. i+7], m1 = low[i+2 .. i+9].
    movu           m4, [lowq + xq]
    movu           m1, [lowq + xq + 4]

    ; Interleave into word pairs (low[i], low[i+2]) for pmaddwd.
    mova           m5, m4
    punpcklwd      m4, m1
    punpckhwd      m5, m1

    mova           m6, m4
    mova           m7, m5

    ; m4/m5 = low[i]   - low[i+2] + 4   (dwords, lower/upper four positions)
    ; m6/m7 = low[i+2] - low[i]   + 4
%if ARCH_X86_64
    pmaddwd        m4, m8
    pmaddwd        m5, m8
    pmaddwd        m6, m9
    pmaddwd        m7, m9

    paddd          m4, m11
    paddd          m5, m11
    paddd          m6, m11
    paddd          m7, m11
%else
    pmaddwd        m4, [factor_p1_n1]
    pmaddwd        m5, [factor_p1_n1]
    pmaddwd        m6, [factor_n1_p1]
    pmaddwd        m7, [factor_n1_p1]

    paddd          m4, [pd_4]
    paddd          m5, [pd_4]
    paddd          m6, [pd_4]
    paddd          m7, [pd_4]
%endif

    psrad          m4, 3
    psrad          m5, 3
    psrad          m6, 3
    psrad          m7, 3

    ; Centre taps: m2 = low[i+1 ..], m3 = high[i+1 ..].
    movu           m2, [lowq + xq + 2]
    movu           m3, [highq + xq + 2]

    ; Interleave into word pairs (low[i+1], high[i+1]).
    mova           m0, m2
    punpcklwd      m2, m3
    punpckhwd      m0, m3

    mova           m1, m2
    mova           m3, m0

    ; m2/m0 = low[i+1] + high[i+1], m1/m3 = low[i+1] - high[i+1]
%if ARCH_X86_64
    pmaddwd        m2, m10
    pmaddwd        m0, m10
    pmaddwd        m1, m8
    pmaddwd        m3, m8
%else
    pmaddwd        m2, [pw_1]
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [factor_p1_n1]
    pmaddwd        m3, [factor_p1_n1]
%endif

    ; Add the correction terms and halve.
    paddd          m2, m4
    paddd          m0, m5
    paddd          m1, m6
    paddd          m3, m7

    psrad          m2, 1
    psrad          m0, 1
    psrad          m1, 1
    psrad          m3, 1

    ; Back to saturated words: m2 = 8 even outputs, m1 = 8 odd outputs.
    packssdw       m2, m0
    packssdw       m1, m3

    ; Interleave even/odd into output order: m2 = first 8, m0 = next 8.
    mova           m0, m2
    punpcklwd      m2, m1
    punpckhwd      m0, m1

%if %1
    CLIPW          m2, [pw_0], [pw_%1]
    CLIPW          m0, [pw_0], [pw_%1]
%endif

    ; Output holds two samples per input position, hence x * 2; the +4 skips
    ; out[0] and out[1], which the scalar code above already produced.
    movu  [outputq + xq * 2 + 4], m2
    movu  [outputq + xq * 2 + mmsize + 4], m0

    add            xq, mmsize
    cmp            xq, widthq
    jl .loop

    ; Step all pointers to the end of the row (output is twice as wide) so
    ; the right edge can be addressed with negative offsets.
    add          lowq, widthq
    add         highq, widthq
    add       outputq, widthq
    add       outputq, widthq

    ; ---- right edge, out[2w-2]:
    ; (((5*low[w-1] + 4*low[w-2] - low[w-3] + 4) >> 3) + high[w-1]) >> 1
    movsx          xq, word [lowq - 2]
    imul           xq, 5

    movsx       tempq, word [lowq - 4]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq - 4], tempw

    ; ---- right edge, out[2w-1]:
    ; (((11*low[w-1] - 4*low[w-2] + low[w-3] + 4) >> 3) - high[w-1]) >> 1
    movsx          xq, word [lowq - 2]
    imul           xq, 11

    movsx       tempq, word [lowq - 4]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq - 2], tempw

%if %1 == 0
    ; Rewind to the start of the row ...
    sub          lowq, widthq
    sub         highq, widthq
    sub       outputq, widthq
    sub       outputq, widthq

    ; ... and advance to the next one. Output moves by twice the byte-scaled
    ; ostride per row.
    add          lowq, lwidthq
    add         highq, hwidthq
    add       outputq, ostrideq
    add       outputq, ostrideq
    add            yq, 1
    jl .looph                           ; y runs from -height up to 0
%endif

    RET
%endmacro

; SSE2 instantiations of the horizontal filter:
;   argument 0    -> cfhd_horiz_filter         (multi-row, unclipped)
;   argument 1023 -> cfhd_horiz_filter_clip10  (single row, clip to [0, 1023])
;   argument 4095 -> cfhd_horiz_filter_clip12  (single row, clip to [0, 4095])
INIT_XMM sse2
CFHD_HORIZ_FILTER 0

INIT_XMM sse2
CFHD_HORIZ_FILTER 1023

INIT_XMM sse2
CFHD_HORIZ_FILTER 4095

;------------------------------------------------------------------------------
; cfhd_vert_filter(output, ostride, low, lwidth, high, hwidth, width, height)
;
; Vertical counterpart of the horizontal filter: combines `height` rows of
; low and high samples (signed 16-bit words) into 2*height output rows.
; Strides and width are doubled on entry to turn sample counts into byte
; counts. The image is processed in vertical strips of mmsize bytes
; (8 samples); x is the byte offset of the current strip.
;
; Per column, with low[r]/high[r]/out[r] denoting row r and h = height:
;   out[0]    = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]    = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2y]   = (((low[y-1] - low[y+1] + 4) >> 3) + low[y] + high[y]) >> 1
;   out[2y+1] = (((low[y+1] - low[y-1] + 4) >> 3) + low[y] - high[y]) >> 1
;   out[2h-2] = ((( 5*low[h-1] + 4*low[h-2] - low[h-3] + 4) >> 3) + high[h-1]) >> 1
;   out[2h-1] = (((11*low[h-1] - 4*low[h-2] + low[h-3] + 4) >> 3) - high[h-1]) >> 1
; All results are saturated to signed 16 bits by packssdw.
;
; NOTE(review): the interior row loop tests its bound only after the body has
; run for y = 1, so height is presumably always at least 3 -- confirm.
; NOTE(review): each strip loads/stores a full register, so the last strip can
; touch bytes past `width`; presumably the buffers are padded -- confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
    ; samples -> bytes
    shl        ostrided, 1
    shl         lwidthd, 1
    shl         hwidthd, 1
    shl          widthd, 1

    ; heightq = index of the last input row; it bounds the interior row loop.
    dec   heightd

    ; Keep all constants resident in registers.
    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
    mova      m12, [factor_p11_n4]
    mova      m13, [factor_p5_p4]
%else
; Only 7 GPRs are available here: ostride, lwidth and hwidth arrive in the
; registers named x, y and pos, are doubled, written back to their own stack
; argument slots and then accessed as memory operands, freeing the registers.
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
    shl        xd, 1
    shl        yd, 1
    shl      posd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov     posmp, posq

    ; height - 1 is parked in the stack slot of the width argument; the
    ; doubled width itself stays in its register.
    mov        xq, r7m
    dec        xq
    mov   widthmp, xq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  posm
%define heightq  widthm

%endif

    xor        xq, xq
.loopw:
    xor        yq, yq

    ; ---- top edge, out row 0:
    ; (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
    mov      posq, xq
    movu       m0, [lowq + posq]        ; low row 0
    add      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row 1
    mova       m2, m0
    punpcklwd  m0, m1                   ; word pairs (low[0], low[1])
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    ; Sign-extend low row 2 to dwords: place each word in the upper half of
    ; a zeroed dword, then shift right arithmetically by 16.
    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row 2
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    ; Rounding bias; on x86-64 the constant is already resident in m11.
%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]       ; high row 0, sign-extended as above
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    movu    [outputq + posq], m0        ; out row 0

    ; ---- top edge, out row 1:
    ; (((5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
    movu       m0, [lowq + posq]        ; low row 0
    add      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row 1
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row 2
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]       ; high row 0
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    add      posq, ostrideq
    movu    [outputq + posq], m0        ; out row 1

    add        yq, 1                    ; y = 1: first interior row
.looph:
    ; ---- interior rows y = 1 .. height - 2
    ; pos = (y - 1) * lwidth + x
    mov      posq, lwidthq
    imul     posq, yq
    sub      posq, lwidthq
    add      posq, xq

    movu       m4, [lowq + posq]        ; low row y-1

    add      posq, lwidthq
    add      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row y+1

    mova       m5, m4
    punpcklwd  m4, m1                   ; word pairs (low[y-1], low[y+1])
    punpckhwd  m5, m1

    mova       m6, m4
    mova       m7, m5

    ; m4/m5 = low[y-1] - low[y+1] + 4, m6/m7 = low[y+1] - low[y-1] + 4
%if ARCH_X86_64
    pmaddwd    m4, m8
    pmaddwd    m5, m8
    pmaddwd    m6, m9
    pmaddwd    m7, m9

    paddd      m4, m11
    paddd      m5, m11
    paddd      m6, m11
    paddd      m7, m11
%else
    pmaddwd    m4, [factor_p1_n1]
    pmaddwd    m5, [factor_p1_n1]
    pmaddwd    m6, [factor_n1_p1]
    pmaddwd    m7, [factor_n1_p1]

    paddd      m4, [pd_4]
    paddd      m5, [pd_4]
    paddd      m6, [pd_4]
    paddd      m7, [pd_4]
%endif

    psrad      m4, 3
    psrad      m5, 3
    psrad      m6, 3
    psrad      m7, 3

    sub      posq, lwidthq
    movu       m0, [lowq + posq]        ; low row y

    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    movu       m1, [highq + posq]       ; high row y

    mova       m2, m0
    punpcklwd  m0, m1                   ; word pairs (low[y], high[y])
    punpckhwd  m2, m1

    mova       m1, m0
    mova       m3, m2

    ; m0/m2 = low[y] + high[y], m1/m3 = low[y] - high[y]
%if ARCH_X86_64
    pmaddwd    m0, m10
    pmaddwd    m2, m10
    pmaddwd    m1, m8
    pmaddwd    m3, m8
%else
    pmaddwd    m0, [pw_1]
    pmaddwd    m2, [pw_1]
    pmaddwd    m1, [factor_p1_n1]
    pmaddwd    m3, [factor_p1_n1]
%endif

    paddd      m0, m4
    paddd      m2, m5
    paddd      m1, m6
    paddd      m3, m7

    psrad      m0, 1
    psrad      m2, 1
    psrad      m1, 1
    psrad      m3, 1

    packssdw   m0, m2                   ; out row 2y
    packssdw   m1, m3                   ; out row 2y + 1

    ; pos = 2 * y * ostride + x
    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq

    movu    [outputq + posq], m0
    add      posq, ostrideq
    movu    [outputq + posq], m1

    add        yq, 1
    cmp        yq, heightq              ; heightq holds height - 1
    jl .looph

    ; ---- bottom edge, y = height - 1, out row 2y:
    ; (((5*low[y] + 4*low[y-1] - low[y-2] + 4) >> 3) + high[y]) >> 1
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]        ; low row y
    sub      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row y-1
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row y-2, sign-extended below
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]       ; high row y
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq
    movu    [outputq + posq], m0        ; out row 2y

    ; ---- bottom edge, out row 2y + 1:
    ; (((11*low[y] - 4*low[y-1] + low[y-2] + 4) >> 3) - high[y]) >> 1
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]        ; low row y
    sub      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row y-1
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]        ; low row y-2
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]       ; high row y
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, ostrideq
    add      posq, xq
    movu    [outputq + posq], m0        ; out row 2y + 1

    ; Next strip of columns.
    add        xq, mmsize
    cmp        xq, widthq
    jl .loopw
    RET