1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* x86-optimized functions for the CFHD encoder
3cabdff1aSopenharmony_ci;* Copyright (c) 2021 Paul B Mahol
4cabdff1aSopenharmony_ci;*
5cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
8cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
9cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
10cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
11cabdff1aSopenharmony_ci;*
12cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
13cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
16cabdff1aSopenharmony_ci;*
17cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
18cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
19cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20cabdff1aSopenharmony_ci;******************************************************************************
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ciSECTION_RODATA
; Read-only coefficient tables for cfhdenc_horiz_filter / cfhdenc_vert_filter.
; Each local table below is 16 bytes (8 words or 4 dwords), i.e. one XMM load.
; The word tables are consumed by pmaddwd, which multiplies word pairs and adds
; each pair into one dword, so a table "a, b, a, b, ..." yields a*w0 + b*w1.
; NOTE(review): every line of this copy begins with a "<n>cabdff1aSopenharmony_ci"
; token that looks like a line-number/blame artifact of extraction, not
; assembler input -- confirm and strip before assembling.
25cabdff1aSopenharmony_ci
; (+1, -1) and its pair-swapped twin (-1, +1): difference of the two words of a pair.
26cabdff1aSopenharmony_cipw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1
27cabdff1aSopenharmony_cipw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1
; (5, -11) / (11, -5) weights plus pair-swapped twins. The vertical filter uses
; the swapped form for the high half of a vector, where it interleaves the two
; rows in the opposite order.
28cabdff1aSopenharmony_cipw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11
29cabdff1aSopenharmony_cipw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
30cabdff1aSopenharmony_cipw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
31cabdff1aSopenharmony_cipw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
; Four dwords of 4: the rounding bias added before the ">> 3" in both filters.
32cabdff1aSopenharmony_cipd_4:  times 4 dd  4
; Eight words of -4: weight for the two middle row pairs of the last vertical output.
33cabdff1aSopenharmony_cipw_n4: times 8 dw -4
; Constants defined outside this file; their contents are not visible here.
; Presumably vectors of words -1, +1 and +4 as the names suggest -- TODO confirm.
34cabdff1aSopenharmony_cicextern pw_m1
35cabdff1aSopenharmony_cicextern pw_1
36cabdff1aSopenharmony_cicextern pw_4
37cabdff1aSopenharmony_ci
; Everything after this point is code.
38cabdff1aSopenharmony_ciSECTION .text
39cabdff1aSopenharmony_ci
40cabdff1aSopenharmony_ci%if ARCH_X86_64
41cabdff1aSopenharmony_ciINIT_XMM sse2
;-----------------------------------------------------------------------
; cfhdenc_horiz_filter (SSE2, assembled only when ARCH_X86_64 is set)
;
; For every row, turns consecutive pairs of signed 16-bit input samples
; into one "low" sample (pair sum) and one "high" sample (weighted
; difference of the pair and its neighbours), both saturated to int16.
;
; cglobal operands: 8 arguments, 10 general-purpose registers, 11 XMM
; registers (x86inc convention).
;   input, low, high        - source row and the two destination rows
;   istride, lwidth, hwidth - per-row advance of input/low/high; doubled on
;                             entry (presumably int16 elements -> bytes,
;                             TODO confirm against the caller)
;   width                   - used as the byte length of one low/high row
;                             and as half the byte length of one input row
;   y                       - row count; negated and counted up to zero
;   x, temp                 - scratch (x doubles as the byte offset into
;                             low/high inside .loopw)
;
; Formulas below assume pw_1 / pw_m1 are vectors of +1 / -1 words, as
; their names suggest (they are defined outside this file).
;-----------------------------------------------------------------------
42cabdff1aSopenharmony_cicglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
43cabdff1aSopenharmony_ci    shl  istrideq, 1
44cabdff1aSopenharmony_ci    shl   lwidthq, 1
45cabdff1aSopenharmony_ci    shl   hwidthq, 1
    ; Loop-invariant constants: m7 = rounding bias 4 (dwords), m8 = pw_1,
    ; m9 = pw_m1, m10 = (+1, -1) word pairs.
46cabdff1aSopenharmony_ci    mova       m7, [pd_4]
47cabdff1aSopenharmony_ci    mova       m8, [pw_1]
48cabdff1aSopenharmony_ci    mova       m9, [pw_m1]
49cabdff1aSopenharmony_ci    mova       m10,[pw_p1_n1]
    ; Widen the 32-bit y and width arguments to 64 bits, then make y negative
    ; so the row loop can count up and stop when it reaches zero.
50cabdff1aSopenharmony_ci    movsxdifnidn yq, yd
51cabdff1aSopenharmony_ci    movsxdifnidn widthq, widthd
52cabdff1aSopenharmony_ci    neg        yq
53cabdff1aSopenharmony_ci.looph:
    ; ---- first output of the row, scalar ----
    ; low[0] = sat16(in[0] + in[1])
54cabdff1aSopenharmony_ci    movsx          xq, word [inputq]
55cabdff1aSopenharmony_ci
56cabdff1aSopenharmony_ci    movsx       tempq, word [inputq + 2]
57cabdff1aSopenharmony_ci    add         tempq, xq
58cabdff1aSopenharmony_ci
    ; Round-trip through an XMM register so packssdw performs the signed
    ; saturation from 32 to 16 bits.
59cabdff1aSopenharmony_ci    movd          xm0, tempd
60cabdff1aSopenharmony_ci    packssdw       m0, m0
61cabdff1aSopenharmony_ci    movd        tempd, m0
62cabdff1aSopenharmony_ci    mov   word [lowq], tempw
63cabdff1aSopenharmony_ci
    ; high[0] = sat16((5*in[0] - 11*in[1] + 4*in[2] + 4*in[3]
    ;                  - in[4] - in[5] + 4) >> 3)
64cabdff1aSopenharmony_ci    movsx          xq, word [inputq]
65cabdff1aSopenharmony_ci    imul           xq, 5
66cabdff1aSopenharmony_ci    movsx       tempq, word [inputq + 2]
67cabdff1aSopenharmony_ci    imul        tempq, -11
68cabdff1aSopenharmony_ci    add         tempq, xq
69cabdff1aSopenharmony_ci
70cabdff1aSopenharmony_ci    movsx          xq, word [inputq + 4]
71cabdff1aSopenharmony_ci    imul           xq, 4
72cabdff1aSopenharmony_ci    add         tempq, xq
73cabdff1aSopenharmony_ci
74cabdff1aSopenharmony_ci    movsx          xq, word [inputq + 6]
75cabdff1aSopenharmony_ci    imul           xq, 4
76cabdff1aSopenharmony_ci    add         tempq, xq
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_ci    movsx          xq, word [inputq + 8]
79cabdff1aSopenharmony_ci    imul           xq, -1
80cabdff1aSopenharmony_ci    add         tempq, xq
81cabdff1aSopenharmony_ci
82cabdff1aSopenharmony_ci    movsx          xq, word [inputq + 10]
83cabdff1aSopenharmony_ci    imul           xq, -1
84cabdff1aSopenharmony_ci    add         tempq, xq
85cabdff1aSopenharmony_ci
86cabdff1aSopenharmony_ci    add         tempq, 4
87cabdff1aSopenharmony_ci    sar         tempq, 3
88cabdff1aSopenharmony_ci
89cabdff1aSopenharmony_ci    movd          xm0, tempd
90cabdff1aSopenharmony_ci    packssdw       m0, m0
91cabdff1aSopenharmony_ci    movd        tempd, m0
92cabdff1aSopenharmony_ci    mov  word [highq], tempw
93cabdff1aSopenharmony_ci
    ; x = byte offset of the next output in low/high; start at the second
    ; output (2 bytes). The matching input pair sits at byte offset x*2.
94cabdff1aSopenharmony_ci    mov            xq, 2
95cabdff1aSopenharmony_ci
    ; ---- interior outputs, 8 per iteration (16 input words) ----
96cabdff1aSopenharmony_ci.loopw:
    ; low = sat16(in[i] + in[i+1]) for each pair
97cabdff1aSopenharmony_ci    movu           m0, [inputq + xq * 2]
98cabdff1aSopenharmony_ci    movu           m1, [inputq + xq * 2 + mmsize]
99cabdff1aSopenharmony_ci
100cabdff1aSopenharmony_ci    pmaddwd        m0, m8
101cabdff1aSopenharmony_ci    pmaddwd        m1, m8
102cabdff1aSopenharmony_ci
103cabdff1aSopenharmony_ci    packssdw       m0, m1
104cabdff1aSopenharmony_ci    movu    [lowq+xq], m0
105cabdff1aSopenharmony_ci
    ; Previous pair (2 words = 4 bytes back): -(in[i-2] + in[i-1])
106cabdff1aSopenharmony_ci    movu           m2, [inputq + xq * 2 - 4]
107cabdff1aSopenharmony_ci    movu           m3, [inputq + xq * 2 - 4 + mmsize]
108cabdff1aSopenharmony_ci
109cabdff1aSopenharmony_ci    pmaddwd        m2, m9
110cabdff1aSopenharmony_ci    pmaddwd        m3, m9
111cabdff1aSopenharmony_ci
    ; Next pair (2 words = 4 bytes ahead): in[i+2] + in[i+3]
112cabdff1aSopenharmony_ci    movu           m0, [inputq + xq * 2 + 4]
113cabdff1aSopenharmony_ci    movu           m1, [inputq + xq * 2 + 4 + mmsize]
114cabdff1aSopenharmony_ci
115cabdff1aSopenharmony_ci    pmaddwd        m0, m8
116cabdff1aSopenharmony_ci    pmaddwd        m1, m8
117cabdff1aSopenharmony_ci
118cabdff1aSopenharmony_ci    paddd          m0, m2
119cabdff1aSopenharmony_ci    paddd          m1, m3
120cabdff1aSopenharmony_ci
    ; Round and scale the neighbour term only: (next - prev + 4) >> 3
121cabdff1aSopenharmony_ci    paddd          m0, m7
122cabdff1aSopenharmony_ci    paddd          m1, m7
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_ci    psrad          m0, 3
125cabdff1aSopenharmony_ci    psrad          m1, 3
126cabdff1aSopenharmony_ci
    ; Current pair difference in[i] - in[i+1], added unscaled after the shift.
127cabdff1aSopenharmony_ci    movu           m5, [inputq + xq * 2 + 0]
128cabdff1aSopenharmony_ci    movu           m6, [inputq + xq * 2 + mmsize]
129cabdff1aSopenharmony_ci
130cabdff1aSopenharmony_ci    pmaddwd        m5, m10
131cabdff1aSopenharmony_ci    pmaddwd        m6, m10
132cabdff1aSopenharmony_ci
133cabdff1aSopenharmony_ci    paddd          m0, m5
134cabdff1aSopenharmony_ci    paddd          m1, m6
135cabdff1aSopenharmony_ci
136cabdff1aSopenharmony_ci    packssdw       m0, m1
137cabdff1aSopenharmony_ci    movu   [highq+xq], m0
138cabdff1aSopenharmony_ci
    ; Advance by one vector of outputs; continue while x < width (signed).
    ; NOTE(review): because x starts at 2 and steps by mmsize, the last
    ; iteration can load and store beyond width bytes of the row; presumably
    ; the buffers are padded for this -- confirm against the caller.
139cabdff1aSopenharmony_ci    add            xq, mmsize
140cabdff1aSopenharmony_ci    cmp            xq, widthq
141cabdff1aSopenharmony_ci    jl .loopw
142cabdff1aSopenharmony_ci
    ; ---- last output of the row, scalar ----
    ; Move all three pointers to the end of the row so the tail can use
    ; negative offsets; this is undone again below.
143cabdff1aSopenharmony_ci    add          lowq, widthq
144cabdff1aSopenharmony_ci    add         highq, widthq
145cabdff1aSopenharmony_ci    lea        inputq, [inputq + widthq * 2]
146cabdff1aSopenharmony_ci
    ; With e[-k] = k-th input word before the row end:
    ; low[last] = sat16(e[-2] + e[-1])
147cabdff1aSopenharmony_ci    movsx          xq, word [inputq - 4]
148cabdff1aSopenharmony_ci    movsx       tempq, word [inputq - 2]
149cabdff1aSopenharmony_ci    add         tempq, xq
150cabdff1aSopenharmony_ci
151cabdff1aSopenharmony_ci    movd          xm0, tempd
152cabdff1aSopenharmony_ci    packssdw       m0, m0
153cabdff1aSopenharmony_ci    movd        tempd, m0
154cabdff1aSopenharmony_ci    mov word [lowq-2], tempw
155cabdff1aSopenharmony_ci
    ; high[last] = sat16((11*e[-2] - 5*e[-1] - 4*e[-3] - 4*e[-4]
    ;                     + e[-5] + e[-6] + 4) >> 3)
156cabdff1aSopenharmony_ci    movsx       tempq, word [inputq - 4]
157cabdff1aSopenharmony_ci    imul        tempq, 11
158cabdff1aSopenharmony_ci    movsx          xq, word [inputq - 2]
159cabdff1aSopenharmony_ci    imul           xq, -5
160cabdff1aSopenharmony_ci    add         tempq, xq
161cabdff1aSopenharmony_ci
162cabdff1aSopenharmony_ci    movsx          xq, word [inputq - 6]
163cabdff1aSopenharmony_ci    imul           xq, -4
164cabdff1aSopenharmony_ci    add         tempq, xq
165cabdff1aSopenharmony_ci
166cabdff1aSopenharmony_ci    movsx          xq, word [inputq - 8]
167cabdff1aSopenharmony_ci    imul           xq, -4
168cabdff1aSopenharmony_ci    add         tempq, xq
169cabdff1aSopenharmony_ci
170cabdff1aSopenharmony_ci    movsx          xq, word [inputq - 10]
171cabdff1aSopenharmony_ci    add         tempq, xq
172cabdff1aSopenharmony_ci
173cabdff1aSopenharmony_ci    movsx          xq, word [inputq - 12]
174cabdff1aSopenharmony_ci    add         tempq, xq
175cabdff1aSopenharmony_ci
176cabdff1aSopenharmony_ci    add         tempq, 4
177cabdff1aSopenharmony_ci    sar         tempq, 3
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ci    movd          xm0, tempd
180cabdff1aSopenharmony_ci    packssdw       m0, m0
181cabdff1aSopenharmony_ci    movd        tempd, m0
182cabdff1aSopenharmony_ci    mov word [highq-2], tempw
183cabdff1aSopenharmony_ci
    ; Rewind the pointers to the start of the current row...
184cabdff1aSopenharmony_ci    sub        inputq, widthq
185cabdff1aSopenharmony_ci    sub        inputq, widthq
186cabdff1aSopenharmony_ci    sub         highq, widthq
187cabdff1aSopenharmony_ci    sub          lowq, widthq
188cabdff1aSopenharmony_ci
    ; ...then step each one to the next row; loop while y is still negative.
189cabdff1aSopenharmony_ci    add          lowq, lwidthq
190cabdff1aSopenharmony_ci    add         highq, hwidthq
191cabdff1aSopenharmony_ci    add        inputq, istrideq
192cabdff1aSopenharmony_ci    add            yq, 1
193cabdff1aSopenharmony_ci    jl .looph
194cabdff1aSopenharmony_ci
195cabdff1aSopenharmony_ci    RET
196cabdff1aSopenharmony_ci%endif
197cabdff1aSopenharmony_ci
198cabdff1aSopenharmony_ci%if ARCH_X86_64
199cabdff1aSopenharmony_ciINIT_XMM sse2
;-----------------------------------------------------------------------
; cfhdenc_vert_filter (SSE2, assembled only when ARCH_X86_64 is set)
;
; Same pair-sum / weighted-difference split as the horizontal filter, but
; applied down columns: each pair of input rows yields one low row and one
; high row. Columns are processed in strips of one XMM register (8 words).
;
; cglobal operands: 8 arguments, 11 general-purpose registers, 14 XMM
; registers (x86inc convention).
;   input, low, high - source and the two destination planes
;   istride          - input row advance; doubled on entry (presumably
;                      int16 elements -> bytes, TODO confirm)
;   lwidth, hwidth   - NOT doubled; they are multiplied by y, and y moves
;                      by 2 per output row, so consecutive output rows end
;                      up 2*lwidth / 2*hwidth bytes apart
;   width            - doubled on entry; x runs from 0 up to that value
;   height           - reduced by 2 on entry; bound for the inner row loop
;   x                - byte offset of the current column strip
;   y                - index of the first input row of the current pair
;   pos              - scratch byte offset
;
; Formulas below assume pw_1 / pw_m1 / pw_4 are vectors of +1 / -1 / +4
; words, as their names suggest (they are defined outside this file).
; r[k] denotes input row k of the current column strip.
;-----------------------------------------------------------------------
200cabdff1aSopenharmony_cicglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
201cabdff1aSopenharmony_ci    shl  istrideq, 1
202cabdff1aSopenharmony_ci
    ; These 32-bit operations also clear the upper halves of widthq/heightq,
    ; which are compared as 64-bit registers further down.
203cabdff1aSopenharmony_ci    shl    widthd, 1
204cabdff1aSopenharmony_ci    sub   heightd, 2
205cabdff1aSopenharmony_ci
206cabdff1aSopenharmony_ci    xor        xq, xq
207cabdff1aSopenharmony_ci
    ; Loop-invariant constants: m7 = rounding bias 4 (dwords), m8 = pw_1,
    ; m9 = pw_m1, m10 = (+1,-1), m11 = (-1,+1), m12 = pw_4, m13 = words of -4.
208cabdff1aSopenharmony_ci    mova       m7, [pd_4]
209cabdff1aSopenharmony_ci    mova       m8, [pw_1]
210cabdff1aSopenharmony_ci    mova       m9, [pw_m1]
211cabdff1aSopenharmony_ci    mova       m10,[pw_p1_n1]
212cabdff1aSopenharmony_ci    mova       m11,[pw_n1_p1]
213cabdff1aSopenharmony_ci    mova       m12,[pw_4]
214cabdff1aSopenharmony_ci    mova       m13,[pw_n4]
    ; ---- one iteration per 8-column strip ----
215cabdff1aSopenharmony_ci.loopw:
    ; The first row pair (rows 0 and 1) is handled right here; the inner loop
    ; then starts at row 2.
216cabdff1aSopenharmony_ci    mov        yq, 2
217cabdff1aSopenharmony_ci
    ; low row 0 = r[0] + r[1], saturating word add
218cabdff1aSopenharmony_ci    mov      posq, xq
219cabdff1aSopenharmony_ci    movu       m0, [inputq + posq]
220cabdff1aSopenharmony_ci    add      posq, istrideq
221cabdff1aSopenharmony_ci    movu       m1, [inputq + posq]
222cabdff1aSopenharmony_ci
223cabdff1aSopenharmony_ci    paddsw     m0, m1
224cabdff1aSopenharmony_ci
225cabdff1aSopenharmony_ci    movu    [lowq + xq], m0
226cabdff1aSopenharmony_ci
    ; high row 0: load r[0]..r[5] into m0..m5
227cabdff1aSopenharmony_ci    mov      posq, xq
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci    movu       m0, [inputq + posq]
230cabdff1aSopenharmony_ci    add      posq, istrideq
231cabdff1aSopenharmony_ci    movu       m1, [inputq + posq]
232cabdff1aSopenharmony_ci    add      posq, istrideq
233cabdff1aSopenharmony_ci    movu       m2, [inputq + posq]
234cabdff1aSopenharmony_ci    add      posq, istrideq
235cabdff1aSopenharmony_ci    movu       m3, [inputq + posq]
236cabdff1aSopenharmony_ci    add      posq, istrideq
237cabdff1aSopenharmony_ci    movu       m4, [inputq + posq]
238cabdff1aSopenharmony_ci    add      posq, istrideq
239cabdff1aSopenharmony_ci    movu       m5, [inputq + posq]
240cabdff1aSopenharmony_ci
    ; Interleave each row pair into word pairs for pmaddwd. The low half is
    ; interleaved as (even row, odd row); the high half reuses the odd-row
    ; register as destination and so comes out as (odd row, even row). That
    ; is why asymmetric weights need a pair-swapped table for the high half.
241cabdff1aSopenharmony_ci    mova       m6, m0
242cabdff1aSopenharmony_ci    punpcklwd  m0, m1
243cabdff1aSopenharmony_ci    punpckhwd  m1, m6
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_ci    mova       m6, m2
246cabdff1aSopenharmony_ci    punpcklwd  m2, m3
247cabdff1aSopenharmony_ci    punpckhwd  m3, m6
248cabdff1aSopenharmony_ci
249cabdff1aSopenharmony_ci    mova       m6, m4
250cabdff1aSopenharmony_ci    punpcklwd  m4, m5
251cabdff1aSopenharmony_ci    punpckhwd  m5, m6
252cabdff1aSopenharmony_ci
    ; 5*r[0] - 11*r[1],  4*(r[2] + r[3]),  -(r[4] + r[5])
253cabdff1aSopenharmony_ci    pmaddwd    m0, [pw_p5_n11]
254cabdff1aSopenharmony_ci    pmaddwd    m1, [pw_n11_p5]
255cabdff1aSopenharmony_ci    pmaddwd    m2, m12
256cabdff1aSopenharmony_ci    pmaddwd    m3, m12
257cabdff1aSopenharmony_ci    pmaddwd    m4, m9
258cabdff1aSopenharmony_ci    pmaddwd    m5, m9
259cabdff1aSopenharmony_ci
260cabdff1aSopenharmony_ci    paddd      m0, m2
261cabdff1aSopenharmony_ci    paddd      m1, m3
262cabdff1aSopenharmony_ci    paddd      m0, m4
263cabdff1aSopenharmony_ci    paddd      m1, m5
264cabdff1aSopenharmony_ci
    ; (sum + 4) >> 3, then pack to words with signed saturation
265cabdff1aSopenharmony_ci    paddd      m0, m7
266cabdff1aSopenharmony_ci    paddd      m1, m7
267cabdff1aSopenharmony_ci
268cabdff1aSopenharmony_ci    psrad      m0, 3
269cabdff1aSopenharmony_ci    psrad      m1, 3
270cabdff1aSopenharmony_ci    packssdw   m0, m1
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci    movu   [highq + xq], m0
273cabdff1aSopenharmony_ci
    ; ---- interior row pairs: y = 2, 4, ... while y < height - 2 ----
    ; NOTE(review): the body runs once at y = 2 before the bound is tested.
274cabdff1aSopenharmony_ci.looph:
275cabdff1aSopenharmony_ci
    ; low = r[y] + r[y+1] (saturating), stored at low + lwidth*y + x
276cabdff1aSopenharmony_ci    mov      posq, istrideq
277cabdff1aSopenharmony_ci    imul     posq, yq
278cabdff1aSopenharmony_ci    add      posq, xq
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci    movu       m0, [inputq + posq]
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci    add      posq, istrideq
283cabdff1aSopenharmony_ci    movu       m1, [inputq + posq]
284cabdff1aSopenharmony_ci
285cabdff1aSopenharmony_ci    paddsw     m0, m1
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci    mov      posq, lwidthq
288cabdff1aSopenharmony_ci    imul     posq, yq
289cabdff1aSopenharmony_ci    add      posq, xq
290cabdff1aSopenharmony_ci
291cabdff1aSopenharmony_ci    movu    [lowq + posq], m0
292cabdff1aSopenharmony_ci
    ; Temporarily step y back by 2 so the six loads cover r[y-2]..r[y+3]
    ; (m0..m5); y is restored right after the loads.
293cabdff1aSopenharmony_ci    add        yq, -2
294cabdff1aSopenharmony_ci
295cabdff1aSopenharmony_ci    mov      posq, istrideq
296cabdff1aSopenharmony_ci    imul     posq, yq
297cabdff1aSopenharmony_ci    add      posq, xq
298cabdff1aSopenharmony_ci
299cabdff1aSopenharmony_ci    movu       m0, [inputq + posq]
300cabdff1aSopenharmony_ci    add      posq, istrideq
301cabdff1aSopenharmony_ci    movu       m1, [inputq + posq]
302cabdff1aSopenharmony_ci    add      posq, istrideq
303cabdff1aSopenharmony_ci    movu       m2, [inputq + posq]
304cabdff1aSopenharmony_ci    add      posq, istrideq
305cabdff1aSopenharmony_ci    movu       m3, [inputq + posq]
306cabdff1aSopenharmony_ci    add      posq, istrideq
307cabdff1aSopenharmony_ci    movu       m4, [inputq + posq]
308cabdff1aSopenharmony_ci    add      posq, istrideq
309cabdff1aSopenharmony_ci    movu       m5, [inputq + posq]
310cabdff1aSopenharmony_ci
311cabdff1aSopenharmony_ci    add        yq, 2
312cabdff1aSopenharmony_ci
    ; Interleave row pairs: low half (even, odd), high half (odd, even).
313cabdff1aSopenharmony_ci    mova       m6, m0
314cabdff1aSopenharmony_ci    punpcklwd  m0, m1
315cabdff1aSopenharmony_ci    punpckhwd  m1, m6
316cabdff1aSopenharmony_ci
317cabdff1aSopenharmony_ci    mova       m6, m2
318cabdff1aSopenharmony_ci    punpcklwd  m2, m3
319cabdff1aSopenharmony_ci    punpckhwd  m3, m6
320cabdff1aSopenharmony_ci
321cabdff1aSopenharmony_ci    mova       m6, m4
322cabdff1aSopenharmony_ci    punpcklwd  m4, m5
323cabdff1aSopenharmony_ci    punpckhwd  m5, m6
324cabdff1aSopenharmony_ci
    ; m0/m1 = -(r[y-2] + r[y-1]);  m4/m5 = r[y+2] + r[y+3]
    ; m2/m3 = r[y] - r[y+1]  (m11 compensates for the swapped high half)
325cabdff1aSopenharmony_ci    pmaddwd    m0, m9
326cabdff1aSopenharmony_ci    pmaddwd    m1, m9
327cabdff1aSopenharmony_ci    pmaddwd    m2, m10
328cabdff1aSopenharmony_ci    pmaddwd    m3, m11
329cabdff1aSopenharmony_ci    pmaddwd    m4, m8
330cabdff1aSopenharmony_ci    pmaddwd    m5, m8
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_ci    paddd      m0, m4
333cabdff1aSopenharmony_ci    paddd      m1, m5
334cabdff1aSopenharmony_ci
335cabdff1aSopenharmony_ci    paddd      m0, m7
336cabdff1aSopenharmony_ci    paddd      m1, m7
337cabdff1aSopenharmony_ci
    ; Only the neighbour term is rounded and shifted; the current-pair
    ; difference is added at full weight afterwards:
    ; high = sat16(((neighbours + 4) >> 3) + r[y] - r[y+1])
338cabdff1aSopenharmony_ci    psrad      m0, 3
339cabdff1aSopenharmony_ci    psrad      m1, 3
340cabdff1aSopenharmony_ci    paddd      m0, m2
341cabdff1aSopenharmony_ci    paddd      m1, m3
342cabdff1aSopenharmony_ci    packssdw   m0, m1
343cabdff1aSopenharmony_ci
344cabdff1aSopenharmony_ci    mov      posq, hwidthq
345cabdff1aSopenharmony_ci    imul     posq, yq
346cabdff1aSopenharmony_ci    add      posq, xq
347cabdff1aSopenharmony_ci
348cabdff1aSopenharmony_ci    movu   [highq + posq], m0
349cabdff1aSopenharmony_ci
    ; Next row pair; heightq already holds height - 2 (signed compare).
350cabdff1aSopenharmony_ci    add        yq, 2
351cabdff1aSopenharmony_ci    cmp        yq, heightq
352cabdff1aSopenharmony_ci    jl .looph
353cabdff1aSopenharmony_ci
    ; ---- last row pair: y is whatever value ended the loop above ----
    ; low = r[y] + r[y+1] (saturating)
354cabdff1aSopenharmony_ci    mov      posq, istrideq
355cabdff1aSopenharmony_ci    imul     posq, yq
356cabdff1aSopenharmony_ci    add      posq, xq
357cabdff1aSopenharmony_ci
358cabdff1aSopenharmony_ci    movu       m0, [inputq + posq]
359cabdff1aSopenharmony_ci    add      posq, istrideq
360cabdff1aSopenharmony_ci    movu       m1, [inputq + posq]
361cabdff1aSopenharmony_ci
362cabdff1aSopenharmony_ci    paddsw     m0, m1
363cabdff1aSopenharmony_ci
364cabdff1aSopenharmony_ci    mov      posq, lwidthq
365cabdff1aSopenharmony_ci    imul     posq, yq
366cabdff1aSopenharmony_ci    add      posq, xq
367cabdff1aSopenharmony_ci
368cabdff1aSopenharmony_ci    movu    [lowq + posq], m0
369cabdff1aSopenharmony_ci
    ; Step y back by 4 so the six loads cover r[y-4]..r[y+1] (m0..m5);
    ; y is restored right after the loads.
370cabdff1aSopenharmony_ci    sub        yq, 4
371cabdff1aSopenharmony_ci
372cabdff1aSopenharmony_ci    mov      posq, istrideq
373cabdff1aSopenharmony_ci    imul     posq, yq
374cabdff1aSopenharmony_ci    add      posq, xq
375cabdff1aSopenharmony_ci
376cabdff1aSopenharmony_ci    movu       m0, [inputq + posq]
377cabdff1aSopenharmony_ci    add      posq, istrideq
378cabdff1aSopenharmony_ci    movu       m1, [inputq + posq]
379cabdff1aSopenharmony_ci    add      posq, istrideq
380cabdff1aSopenharmony_ci    movu       m2, [inputq + posq]
381cabdff1aSopenharmony_ci    add      posq, istrideq
382cabdff1aSopenharmony_ci    movu       m3, [inputq + posq]
383cabdff1aSopenharmony_ci    add      posq, istrideq
384cabdff1aSopenharmony_ci    movu       m4, [inputq + posq]
385cabdff1aSopenharmony_ci    add      posq, istrideq
386cabdff1aSopenharmony_ci    movu       m5, [inputq + posq]
387cabdff1aSopenharmony_ci
388cabdff1aSopenharmony_ci    add        yq, 4
389cabdff1aSopenharmony_ci
    ; Interleave row pairs: low half (even, odd), high half (odd, even).
390cabdff1aSopenharmony_ci    mova       m6, m0
391cabdff1aSopenharmony_ci    punpcklwd  m0, m1
392cabdff1aSopenharmony_ci    punpckhwd  m1, m6
393cabdff1aSopenharmony_ci
394cabdff1aSopenharmony_ci    mova       m6, m2
395cabdff1aSopenharmony_ci    punpcklwd  m2, m3
396cabdff1aSopenharmony_ci    punpckhwd  m3, m6
397cabdff1aSopenharmony_ci
398cabdff1aSopenharmony_ci    mova       m6, m4
399cabdff1aSopenharmony_ci    punpcklwd  m4, m5
400cabdff1aSopenharmony_ci    punpckhwd  m5, m6
401cabdff1aSopenharmony_ci
    ; r[y-4] + r[y-3],  -4*(r[y-2] + r[y-1]),  11*r[y] - 5*r[y+1]
402cabdff1aSopenharmony_ci    pmaddwd    m0, m8
403cabdff1aSopenharmony_ci    pmaddwd    m1, m8
404cabdff1aSopenharmony_ci    pmaddwd    m2, m13
405cabdff1aSopenharmony_ci    pmaddwd    m3, m13
406cabdff1aSopenharmony_ci    pmaddwd    m4, [pw_p11_n5]
407cabdff1aSopenharmony_ci    pmaddwd    m5, [pw_n5_p11]
408cabdff1aSopenharmony_ci
409cabdff1aSopenharmony_ci    paddd      m4, m2
410cabdff1aSopenharmony_ci    paddd      m5, m3
411cabdff1aSopenharmony_ci
412cabdff1aSopenharmony_ci    paddd      m4, m0
413cabdff1aSopenharmony_ci    paddd      m5, m1
414cabdff1aSopenharmony_ci
    ; (sum + 4) >> 3, then pack to words with signed saturation
415cabdff1aSopenharmony_ci    paddd      m4, m7
416cabdff1aSopenharmony_ci    paddd      m5, m7
417cabdff1aSopenharmony_ci
418cabdff1aSopenharmony_ci    psrad      m4, 3
419cabdff1aSopenharmony_ci    psrad      m5, 3
420cabdff1aSopenharmony_ci    packssdw   m4, m5
421cabdff1aSopenharmony_ci
422cabdff1aSopenharmony_ci    mov      posq, hwidthq
423cabdff1aSopenharmony_ci    imul     posq, yq
424cabdff1aSopenharmony_ci    add      posq, xq
425cabdff1aSopenharmony_ci
426cabdff1aSopenharmony_ci    movu   [highq + posq], m4
427cabdff1aSopenharmony_ci
    ; Next strip of 8 columns; continue while x < 2*width (signed compare).
428cabdff1aSopenharmony_ci    add        xq, mmsize
429cabdff1aSopenharmony_ci    cmp        xq, widthq
430cabdff1aSopenharmony_ci    jl .loopw
431cabdff1aSopenharmony_ci    RET
432cabdff1aSopenharmony_ci%endif
433