1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* mpeg4 qpel
3cabdff1aSopenharmony_ci;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4cabdff1aSopenharmony_ci;* Copyright (c) 2008 Loren Merritt
5cabdff1aSopenharmony_ci;* Copyright (c) 2013 Daniel Kang
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
20cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
21cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22cabdff1aSopenharmony_ci;******************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ciSECTION_RODATA
; Constants imported from elsewhere via cextern (presumably vectors of
; replicated 16-bit words, judging by their use with pmullw/paddw below).
;   pw_3, pw_20  - multipliers used by the low-pass filters
;   pw_15, pw_16 - rounding terms, selected through PW_ROUND
;   pb_1         - not referenced by the code visible in this file
27cabdff1aSopenharmony_cicextern pb_1
28cabdff1aSopenharmony_cicextern pw_3
29cabdff1aSopenharmony_cicextern pw_15
30cabdff1aSopenharmony_cicextern pw_16
31cabdff1aSopenharmony_cicextern pw_20
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_ci
34cabdff1aSopenharmony_ciSECTION .text
35cabdff1aSopenharmony_ci
36cabdff1aSopenharmony_ci; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; For each of h rows, stores the 8-byte truncating average of one src1 row and
; one src2 row to dst, i.e. dst[x] = (src1[x] + src2[x]) >> 1.
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5 = h
; src2 has no stride argument: it is advanced by a fixed 8 bytes per row.
; The truncating average is obtained as ~PAVGB(~a, ~b), assuming PAVGB is the
; round-up byte average (pavgb on mmxext).
; An odd h is handled by one single-row step; the main loop then produces four
; rows per iteration and exits only when the counter reaches exactly zero, so
; the row count left after the odd step must be a non-zero multiple of 4.
37cabdff1aSopenharmony_ci%macro PUT_NO_RND_PIXELS8_L2 0
38cabdff1aSopenharmony_cicglobal put_no_rnd_pixels8_l2, 6,6
39cabdff1aSopenharmony_ci    movsxdifnidn r4, r4d
40cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
    ; m6 = all ones; XOR with m6 complements every byte.
41cabdff1aSopenharmony_ci    pcmpeqb      m6, m6
    ; Even h: go straight to the 4-row loop. Odd h: do one row first.
42cabdff1aSopenharmony_ci    test        r5d, 1
43cabdff1aSopenharmony_ci    je .loop
44cabdff1aSopenharmony_ci    mova         m0, [r1]
45cabdff1aSopenharmony_ci    mova         m1, [r2]
46cabdff1aSopenharmony_ci    add          r1, r4
47cabdff1aSopenharmony_ci    add          r2, 8
48cabdff1aSopenharmony_ci    pxor         m0, m6
49cabdff1aSopenharmony_ci    pxor         m1, m6
50cabdff1aSopenharmony_ci    PAVGB        m0, m1
51cabdff1aSopenharmony_ci    pxor         m0, m6
52cabdff1aSopenharmony_ci    mova       [r0], m0
53cabdff1aSopenharmony_ci    add          r0, r3
54cabdff1aSopenharmony_ci    dec r5d
55cabdff1aSopenharmony_ci.loop:
    ; Rows 0 and 1 of this iteration: src1 rows in m0/m1, src2 rows in m2/m3.
56cabdff1aSopenharmony_ci    mova         m0, [r1]
57cabdff1aSopenharmony_ci    add          r1, r4
58cabdff1aSopenharmony_ci    mova         m1, [r1]
59cabdff1aSopenharmony_ci    add          r1, r4
60cabdff1aSopenharmony_ci    mova         m2, [r2]
61cabdff1aSopenharmony_ci    mova         m3, [r2+8]
62cabdff1aSopenharmony_ci    pxor         m0, m6
63cabdff1aSopenharmony_ci    pxor         m1, m6
64cabdff1aSopenharmony_ci    pxor         m2, m6
65cabdff1aSopenharmony_ci    pxor         m3, m6
66cabdff1aSopenharmony_ci    PAVGB        m0, m2
67cabdff1aSopenharmony_ci    PAVGB        m1, m3
68cabdff1aSopenharmony_ci    pxor         m0, m6
69cabdff1aSopenharmony_ci    pxor         m1, m6
70cabdff1aSopenharmony_ci    mova       [r0], m0
71cabdff1aSopenharmony_ci    add          r0, r3
72cabdff1aSopenharmony_ci    mova       [r0], m1
73cabdff1aSopenharmony_ci    add          r0, r3
    ; Rows 2 and 3 of this iteration (src2 offsets 16 and 24).
74cabdff1aSopenharmony_ci    mova         m0, [r1]
75cabdff1aSopenharmony_ci    add          r1, r4
76cabdff1aSopenharmony_ci    mova         m1, [r1]
77cabdff1aSopenharmony_ci    add          r1, r4
78cabdff1aSopenharmony_ci    mova         m2, [r2+16]
79cabdff1aSopenharmony_ci    mova         m3, [r2+24]
80cabdff1aSopenharmony_ci    pxor         m0, m6
81cabdff1aSopenharmony_ci    pxor         m1, m6
82cabdff1aSopenharmony_ci    pxor         m2, m6
83cabdff1aSopenharmony_ci    pxor         m3, m6
84cabdff1aSopenharmony_ci    PAVGB        m0, m2
85cabdff1aSopenharmony_ci    PAVGB        m1, m3
86cabdff1aSopenharmony_ci    pxor         m0, m6
87cabdff1aSopenharmony_ci    pxor         m1, m6
88cabdff1aSopenharmony_ci    mova       [r0], m0
89cabdff1aSopenharmony_ci    add          r0, r3
90cabdff1aSopenharmony_ci    mova       [r0], m1
91cabdff1aSopenharmony_ci    add          r0, r3
    ; Four src2 rows (4 * 8 bytes) consumed; four rows done.
92cabdff1aSopenharmony_ci    add          r2, 32
93cabdff1aSopenharmony_ci    sub         r5d, 4
94cabdff1aSopenharmony_ci    jne .loop
95cabdff1aSopenharmony_ci    REP_RET
96cabdff1aSopenharmony_ci%endmacro
97cabdff1aSopenharmony_ci
; Emit the MMXEXT version of the function.
98cabdff1aSopenharmony_ciINIT_MMX mmxext
99cabdff1aSopenharmony_ciPUT_NO_RND_PIXELS8_L2
100cabdff1aSopenharmony_ci
101cabdff1aSopenharmony_ci
102cabdff1aSopenharmony_ci; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; 16-byte-wide counterpart of the 8-wide routine: for each of h rows, stores
; dst[x] = (src1[x] + src2[x]) >> 1 for x = 0..15, as two 8-byte halves.
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5 = h
; src2 has no stride argument: it is advanced by a fixed 16 bytes per row.
; The truncating average is obtained as ~PAVGB(~a, ~b), assuming PAVGB is the
; round-up byte average (pavgb on mmxext).
; An odd h is handled by one single-row step; the main loop then produces two
; rows per iteration and exits only when the counter reaches exactly zero, so
; at least two rows must remain after the odd step.
103cabdff1aSopenharmony_ci%macro PUT_NO_RND_PIXELS16_l2 0
104cabdff1aSopenharmony_cicglobal put_no_rnd_pixels16_l2, 6,6
105cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
106cabdff1aSopenharmony_ci    movsxdifnidn r4, r4d
    ; m6 = all ones; XOR with m6 complements every byte.
107cabdff1aSopenharmony_ci    pcmpeqb      m6, m6
    ; Even h: go straight to the 2-row loop. Odd h: do one row first.
108cabdff1aSopenharmony_ci    test        r5d, 1
109cabdff1aSopenharmony_ci    je .loop
110cabdff1aSopenharmony_ci    mova         m0, [r1]
111cabdff1aSopenharmony_ci    mova         m1, [r1+8]
112cabdff1aSopenharmony_ci    mova         m2, [r2]
113cabdff1aSopenharmony_ci    mova         m3, [r2+8]
114cabdff1aSopenharmony_ci    pxor         m0, m6
115cabdff1aSopenharmony_ci    pxor         m1, m6
116cabdff1aSopenharmony_ci    pxor         m2, m6
117cabdff1aSopenharmony_ci    pxor         m3, m6
118cabdff1aSopenharmony_ci    PAVGB        m0, m2
119cabdff1aSopenharmony_ci    PAVGB        m1, m3
120cabdff1aSopenharmony_ci    pxor         m0, m6
121cabdff1aSopenharmony_ci    pxor         m1, m6
122cabdff1aSopenharmony_ci    add          r1, r4
123cabdff1aSopenharmony_ci    add          r2, 16
124cabdff1aSopenharmony_ci    mova       [r0], m0
125cabdff1aSopenharmony_ci    mova     [r0+8], m1
126cabdff1aSopenharmony_ci    add          r0, r3
127cabdff1aSopenharmony_ci    dec r5d
128cabdff1aSopenharmony_ci.loop:
    ; First row of this iteration: m0/m1 = src1 halves, m2/m3 = src2 halves.
129cabdff1aSopenharmony_ci    mova         m0, [r1]
130cabdff1aSopenharmony_ci    mova         m1, [r1+8]
131cabdff1aSopenharmony_ci    add          r1, r4
132cabdff1aSopenharmony_ci    mova         m2, [r2]
133cabdff1aSopenharmony_ci    mova         m3, [r2+8]
134cabdff1aSopenharmony_ci    pxor         m0, m6
135cabdff1aSopenharmony_ci    pxor         m1, m6
136cabdff1aSopenharmony_ci    pxor         m2, m6
137cabdff1aSopenharmony_ci    pxor         m3, m6
138cabdff1aSopenharmony_ci    PAVGB        m0, m2
139cabdff1aSopenharmony_ci    PAVGB        m1, m3
140cabdff1aSopenharmony_ci    pxor         m0, m6
141cabdff1aSopenharmony_ci    pxor         m1, m6
142cabdff1aSopenharmony_ci    mova       [r0], m0
143cabdff1aSopenharmony_ci    mova     [r0+8], m1
144cabdff1aSopenharmony_ci    add          r0, r3
    ; Second row of this iteration (src2 offsets 16 and 24).
145cabdff1aSopenharmony_ci    mova         m0, [r1]
146cabdff1aSopenharmony_ci    mova         m1, [r1+8]
147cabdff1aSopenharmony_ci    add          r1, r4
148cabdff1aSopenharmony_ci    mova         m2, [r2+16]
149cabdff1aSopenharmony_ci    mova         m3, [r2+24]
150cabdff1aSopenharmony_ci    pxor         m0, m6
151cabdff1aSopenharmony_ci    pxor         m1, m6
152cabdff1aSopenharmony_ci    pxor         m2, m6
153cabdff1aSopenharmony_ci    pxor         m3, m6
154cabdff1aSopenharmony_ci    PAVGB        m0, m2
155cabdff1aSopenharmony_ci    PAVGB        m1, m3
156cabdff1aSopenharmony_ci    pxor         m0, m6
157cabdff1aSopenharmony_ci    pxor         m1, m6
158cabdff1aSopenharmony_ci    mova       [r0], m0
159cabdff1aSopenharmony_ci    mova     [r0+8], m1
160cabdff1aSopenharmony_ci    add          r0, r3
    ; Two src2 rows (2 * 16 bytes) consumed; two rows done.
161cabdff1aSopenharmony_ci    add          r2, 32
162cabdff1aSopenharmony_ci    sub         r5d, 2
163cabdff1aSopenharmony_ci    jne .loop
164cabdff1aSopenharmony_ci    REP_RET
165cabdff1aSopenharmony_ci%endmacro
166cabdff1aSopenharmony_ci
; Emit the MMXEXT version of the function.
167cabdff1aSopenharmony_ciINIT_MMX mmxext
168cabdff1aSopenharmony_ciPUT_NO_RND_PIXELS16_l2
169cabdff1aSopenharmony_ci
170cabdff1aSopenharmony_ci%macro MPEG4_QPEL16_H_LOWPASS 1
; Defines <%1>_mpeg4_qpel16_h_lowpass: horizontal 8-tap low-pass filter that
; produces 16 output bytes per row.
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4 = row count
;   (C prototype is not visible here; presumably
;    (uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
;    - confirm against the C declaration.)
; Per output pixel i:
;   (20*(s[i]+s[i+1]) - 6*(s[i-1]+s[i+2]) + 3*(s[i-2]+s[i+3])
;      - (s[i-3]+s[i+4]) + PW_ROUND) >> 5, saturated to a byte.
; Only bytes src[0..16] of each row are loaded; taps that would fall outside
; that range are built from in-row pixels with pshufw / shifts.
; PW_ROUND and OP_MOV must be %defined before the macro is invoked.
; 16 bytes of stack are reserved; [rsp+8] holds pixels 0..3 temporarily.
; The row loop exits when r4d reaches exactly zero, so it must be >= 1.
171cabdff1aSopenharmony_cicglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
172cabdff1aSopenharmony_ci    movsxdifnidn r2, r2d
173cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
    ; m7 = 0, used to zero-extend bytes to words in the unpack instructions.
174cabdff1aSopenharmony_ci    pxor         m7, m7
175cabdff1aSopenharmony_ci.loop:
    ; ---- output pixels 0..3 ----
    ; m0 = s[0..3], m1 = s[4..7] as words; m2 keeps the packed bytes.
176cabdff1aSopenharmony_ci    mova         m0, [r1]
177cabdff1aSopenharmony_ci    mova         m1, m0
178cabdff1aSopenharmony_ci    mova         m2, m0
179cabdff1aSopenharmony_ci    punpcklbw    m0, m7
180cabdff1aSopenharmony_ci    punpckhbw    m1, m7
    ; Left-edge taps: m5 = {s0,s0,s1,s2}, m6 = {s1,s0,s0,s1}.
181cabdff1aSopenharmony_ci    pshufw       m5, m0, 0x90
182cabdff1aSopenharmony_ci    pshufw       m6, m0, 0x41
    ; After shift + unpack: m2 = s[3..6], m3 = s[2..5], m4 = s[1..4].
183cabdff1aSopenharmony_ci    mova         m3, m2
184cabdff1aSopenharmony_ci    mova         m4, m2
185cabdff1aSopenharmony_ci    psllq        m2, 8
186cabdff1aSopenharmony_ci    psllq        m3, 16
187cabdff1aSopenharmony_ci    psllq        m4, 24
188cabdff1aSopenharmony_ci    punpckhbw    m2, m7
189cabdff1aSopenharmony_ci    punpckhbw    m3, m7
190cabdff1aSopenharmony_ci    punpckhbw    m4, m7
    ; m6 = 3*(outer pair) - 6*(inner pair), via (a - 2*b) * 3.
191cabdff1aSopenharmony_ci    paddw        m5, m3
192cabdff1aSopenharmony_ci    paddw        m6, m2
193cabdff1aSopenharmony_ci    paddw        m5, m5
194cabdff1aSopenharmony_ci    psubw        m6, m5
    ; m5 = {s2,s1,s0,s0}: left half of the weight -1 pair.
195cabdff1aSopenharmony_ci    pshufw       m5, m0, 6
196cabdff1aSopenharmony_ci    pmullw       m6, [pw_3]
197cabdff1aSopenharmony_ci    paddw        m0, m4
198cabdff1aSopenharmony_ci    paddw        m5, m1
199cabdff1aSopenharmony_ci    pmullw       m0, [pw_20]
200cabdff1aSopenharmony_ci    psubw        m0, m5
201cabdff1aSopenharmony_ci    paddw        m6, [PW_ROUND]
202cabdff1aSopenharmony_ci    paddw        m0, m6
203cabdff1aSopenharmony_ci    psraw        m0, 5
    ; Park pixels 0..3 (as words) until pixels 4..7 are ready for packing.
204cabdff1aSopenharmony_ci    mova    [rsp+8], m0
    ; ---- output pixels 4..7 ----
    ; Load s[5..12]; after shift + unpack m0 = s[6..9], m5 = s[7..10].
205cabdff1aSopenharmony_ci    mova         m0, [r1+5]
206cabdff1aSopenharmony_ci    mova         m5, m0
207cabdff1aSopenharmony_ci    mova         m6, m0
208cabdff1aSopenharmony_ci    psrlq        m0, 8
209cabdff1aSopenharmony_ci    psrlq        m5, 16
210cabdff1aSopenharmony_ci    punpcklbw    m0, m7
211cabdff1aSopenharmony_ci    punpcklbw    m5, m7
212cabdff1aSopenharmony_ci    paddw        m2, m0
213cabdff1aSopenharmony_ci    paddw        m3, m5
214cabdff1aSopenharmony_ci    paddw        m2, m2
215cabdff1aSopenharmony_ci    psubw        m3, m2
    ; m2 = s[5..8], m6 = s[8..11].
216cabdff1aSopenharmony_ci    mova         m2, m6
217cabdff1aSopenharmony_ci    psrlq        m6, 24
218cabdff1aSopenharmony_ci    punpcklbw    m2, m7
219cabdff1aSopenharmony_ci    punpcklbw    m6, m7
220cabdff1aSopenharmony_ci    pmullw       m3, [pw_3]
221cabdff1aSopenharmony_ci    paddw        m1, m2
222cabdff1aSopenharmony_ci    paddw        m4, m6
223cabdff1aSopenharmony_ci    pmullw       m1, [pw_20]
224cabdff1aSopenharmony_ci    psubw        m3, m4
225cabdff1aSopenharmony_ci    paddw        m1, [PW_ROUND]
226cabdff1aSopenharmony_ci    paddw        m3, m1
227cabdff1aSopenharmony_ci    psraw        m3, 5
    ; Pack pixels 0..3 (reloaded) and 4..7 into 8 bytes and store them.
    ; m4 is handed to OP_MOV as a scratch register.
228cabdff1aSopenharmony_ci    mova         m1, [rsp+8]
229cabdff1aSopenharmony_ci    packuswb     m1, m3
230cabdff1aSopenharmony_ci    OP_MOV     [r0], m1, m4
    ; ---- output pixels 8..11 ----
    ; Load s[9..16]; after shift + unpack m1 = s[10..13], m4 = s[11..14].
231cabdff1aSopenharmony_ci    mova         m1, [r1+9]
232cabdff1aSopenharmony_ci    mova         m4, m1
233cabdff1aSopenharmony_ci    mova         m3, m1
234cabdff1aSopenharmony_ci    psrlq        m1, 8
235cabdff1aSopenharmony_ci    psrlq        m4, 16
236cabdff1aSopenharmony_ci    punpcklbw    m1, m7
237cabdff1aSopenharmony_ci    punpcklbw    m4, m7
238cabdff1aSopenharmony_ci    paddw        m5, m1
239cabdff1aSopenharmony_ci    paddw        m0, m4
240cabdff1aSopenharmony_ci    paddw        m5, m5
241cabdff1aSopenharmony_ci    psubw        m0, m5
242cabdff1aSopenharmony_ci    mova         m5, m3
243cabdff1aSopenharmony_ci    psrlq        m3, 24
244cabdff1aSopenharmony_ci    pmullw       m0, [pw_3]
245cabdff1aSopenharmony_ci    punpcklbw    m3, m7
246cabdff1aSopenharmony_ci    paddw        m2, m3
247cabdff1aSopenharmony_ci    psubw        m0, m2
    ; m2 = s[9..12], m5 = s[13..16].
248cabdff1aSopenharmony_ci    mova         m2, m5
249cabdff1aSopenharmony_ci    punpcklbw    m2, m7
250cabdff1aSopenharmony_ci    punpckhbw    m5, m7
251cabdff1aSopenharmony_ci    paddw        m6, m2
252cabdff1aSopenharmony_ci    pmullw       m6, [pw_20]
253cabdff1aSopenharmony_ci    paddw        m0, [PW_ROUND]
254cabdff1aSopenharmony_ci    paddw        m0, m6
255cabdff1aSopenharmony_ci    psraw        m0, 5
    ; ---- output pixels 12..15 ----
    ; Right-edge taps are shuffles of m5 = s[13..16]:
    ;   0xf9 -> {s14,s15,s16,s16}, 0xbe -> {s15,s16,s16,s15},
    ;   0x6f -> {s16,s16,s15,s14}.
256cabdff1aSopenharmony_ci    paddw        m3, m5
257cabdff1aSopenharmony_ci    pshufw       m6, m5, 0xf9
258cabdff1aSopenharmony_ci    paddw        m6, m4
259cabdff1aSopenharmony_ci    pshufw       m4, m5, 0xbe
260cabdff1aSopenharmony_ci    pshufw       m5, m5, 0x6f
261cabdff1aSopenharmony_ci    paddw        m4, m1
262cabdff1aSopenharmony_ci    paddw        m5, m2
263cabdff1aSopenharmony_ci    paddw        m6, m6
264cabdff1aSopenharmony_ci    psubw        m4, m6
265cabdff1aSopenharmony_ci    pmullw       m3, [pw_20]
266cabdff1aSopenharmony_ci    pmullw       m4, [pw_3]
267cabdff1aSopenharmony_ci    psubw        m3, m5
268cabdff1aSopenharmony_ci    paddw        m4, [PW_ROUND]
269cabdff1aSopenharmony_ci    paddw        m4, m3
270cabdff1aSopenharmony_ci    psraw        m4, 5
    ; Pack pixels 8..11 and 12..15 and store the second 8 bytes of the row.
271cabdff1aSopenharmony_ci    packuswb     m0, m4
272cabdff1aSopenharmony_ci    OP_MOV   [r0+8], m0, m4
    ; Advance to the next source / destination row.
273cabdff1aSopenharmony_ci    add          r1, r3
274cabdff1aSopenharmony_ci    add          r0, r2
275cabdff1aSopenharmony_ci    dec r4d
276cabdff1aSopenharmony_ci    jne .loop
277cabdff1aSopenharmony_ci    REP_RET
278cabdff1aSopenharmony_ci%endmacro
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci%macro PUT_OP 2-3
    ; "put" store: write register %2 to destination %1.
    ; The optional %3 (scratch register) is accepted only so that PUT_OP and
    ; AVG_OP can be invoked with the same argument list; it is not used here.
281cabdff1aSopenharmony_ci    mova %1, %2
282cabdff1aSopenharmony_ci%endmacro
283cabdff1aSopenharmony_ci
284cabdff1aSopenharmony_ci%macro AVG_OP 2-3
    ; "avg" store: byte-average register %2 with the current contents of
    ; destination %1 (pavgb) and write the result back to %1.
    ; %3 is a scratch register and is required in practice although the
    ; parameter range declares it optional. Clobbers %2 and %3.
285cabdff1aSopenharmony_ci    mova  %3, %1
286cabdff1aSopenharmony_ci    pavgb %2, %3
287cabdff1aSopenharmony_ci    mova  %1, %2
288cabdff1aSopenharmony_ci%endmacro
289cabdff1aSopenharmony_ci
290cabdff1aSopenharmony_ciINIT_MMX mmxext
; Instantiate the 16-wide horizontal filter three times. Each variant is
; selected by redefining PW_ROUND (rounding term added before the >> 5) and
; OP_MOV (store macro) before invoking the function macro:
;   put        : PW_ROUND = pw_16, plain store
;   avg        : PW_ROUND = pw_16, averaged with the existing destination
;   put_no_rnd : PW_ROUND = pw_15, plain store
291cabdff1aSopenharmony_ci%define PW_ROUND pw_16
292cabdff1aSopenharmony_ci%define OP_MOV PUT_OP
293cabdff1aSopenharmony_ciMPEG4_QPEL16_H_LOWPASS put
294cabdff1aSopenharmony_ci%define PW_ROUND pw_16
295cabdff1aSopenharmony_ci%define OP_MOV AVG_OP
296cabdff1aSopenharmony_ciMPEG4_QPEL16_H_LOWPASS avg
297cabdff1aSopenharmony_ci%define PW_ROUND pw_15
298cabdff1aSopenharmony_ci%define OP_MOV PUT_OP
299cabdff1aSopenharmony_ciMPEG4_QPEL16_H_LOWPASS put_no_rnd
300cabdff1aSopenharmony_ci
301cabdff1aSopenharmony_ci
302cabdff1aSopenharmony_ci
303cabdff1aSopenharmony_ci%macro MPEG4_QPEL8_H_LOWPASS 1
; Defines <%1>_mpeg4_qpel8_h_lowpass: horizontal 8-tap low-pass filter that
; produces 8 output bytes per row.
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4 = row count
;   (C prototype is not visible here; presumably
;    (uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
;    - confirm against the C declaration.)
; Per output pixel i:
;   (20*(s[i]+s[i+1]) - 6*(s[i-1]+s[i+2]) + 3*(s[i-2]+s[i+3])
;      - (s[i-3]+s[i+4]) + PW_ROUND) >> 5, saturated to a byte.
; Each row is loaded from [r1] (8 bytes) and [r1+5] (movh; presumably a 4-byte
; load for MMX, i.e. s[5..8]); taps outside the loaded range are built from
; in-row pixels with pshufw / shifts.
; PW_ROUND and OP_MOV must be %defined before the macro is invoked.
; The cglobal line reserves 8 bytes of stack, but the body below does not
; reference rsp.
; The row loop exits when r4d reaches exactly zero, so it must be >= 1.
304cabdff1aSopenharmony_cicglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
305cabdff1aSopenharmony_ci    movsxdifnidn r2, r2d
306cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
    ; m7 = 0, used to zero-extend bytes to words in the unpack instructions.
307cabdff1aSopenharmony_ci    pxor         m7, m7
308cabdff1aSopenharmony_ci.loop:
    ; ---- output pixels 0..3 ----
    ; m0 = s[0..3], m1 = s[4..7] as words; m2 keeps the packed bytes.
309cabdff1aSopenharmony_ci    mova         m0, [r1]
310cabdff1aSopenharmony_ci    mova         m1, m0
311cabdff1aSopenharmony_ci    mova         m2, m0
312cabdff1aSopenharmony_ci    punpcklbw    m0, m7
313cabdff1aSopenharmony_ci    punpckhbw    m1, m7
    ; Left-edge taps: m5 = {s0,s0,s1,s2}, m6 = {s1,s0,s0,s1}.
314cabdff1aSopenharmony_ci    pshufw       m5, m0, 0x90
315cabdff1aSopenharmony_ci    pshufw       m6, m0, 0x41
    ; After shift + unpack: m2 = s[3..6], m3 = s[2..5], m4 = s[1..4].
316cabdff1aSopenharmony_ci    mova         m3, m2
317cabdff1aSopenharmony_ci    mova         m4, m2
318cabdff1aSopenharmony_ci    psllq        m2, 8
319cabdff1aSopenharmony_ci    psllq        m3, 16
320cabdff1aSopenharmony_ci    psllq        m4, 24
321cabdff1aSopenharmony_ci    punpckhbw    m2, m7
322cabdff1aSopenharmony_ci    punpckhbw    m3, m7
323cabdff1aSopenharmony_ci    punpckhbw    m4, m7
    ; m6 = 3*(outer pair) - 6*(inner pair), via (a - 2*b) * 3.
324cabdff1aSopenharmony_ci    paddw        m5, m3
325cabdff1aSopenharmony_ci    paddw        m6, m2
326cabdff1aSopenharmony_ci    paddw        m5, m5
327cabdff1aSopenharmony_ci    psubw        m6, m5
    ; m5 = {s2,s1,s0,s0}: left half of the weight -1 pair.
328cabdff1aSopenharmony_ci    pshufw       m5, m0, 0x6
329cabdff1aSopenharmony_ci    pmullw       m6, [pw_3]
330cabdff1aSopenharmony_ci    paddw        m0, m4
331cabdff1aSopenharmony_ci    paddw        m5, m1
332cabdff1aSopenharmony_ci    pmullw       m0, [pw_20]
333cabdff1aSopenharmony_ci    psubw        m0, m5
334cabdff1aSopenharmony_ci    paddw        m6, [PW_ROUND]
335cabdff1aSopenharmony_ci    paddw        m0, m6
336cabdff1aSopenharmony_ci    psraw        m0, 5
    ; ---- output pixels 4..7 ----
    ; m5 = s[5..8] as words; right-edge taps are shuffles of it:
    ;   0xf9 -> {s6,s7,s8,s8}, 0xbe -> {s7,s8,s8,s7}, 0x6f -> {s8,s8,s7,s6}.
337cabdff1aSopenharmony_ci    movh         m5, [r1+5]
338cabdff1aSopenharmony_ci    punpcklbw    m5, m7
339cabdff1aSopenharmony_ci    pshufw       m6, m5, 0xf9
340cabdff1aSopenharmony_ci    paddw        m1, m5
341cabdff1aSopenharmony_ci    paddw        m2, m6
342cabdff1aSopenharmony_ci    pshufw       m6, m5, 0xbe
343cabdff1aSopenharmony_ci    pshufw       m5, m5, 0x6f
344cabdff1aSopenharmony_ci    paddw        m3, m6
345cabdff1aSopenharmony_ci    paddw        m4, m5
346cabdff1aSopenharmony_ci    paddw        m2, m2
347cabdff1aSopenharmony_ci    psubw        m3, m2
348cabdff1aSopenharmony_ci    pmullw       m1, [pw_20]
349cabdff1aSopenharmony_ci    pmullw       m3, [pw_3]
350cabdff1aSopenharmony_ci    psubw        m3, m4
351cabdff1aSopenharmony_ci    paddw        m1, [PW_ROUND]
352cabdff1aSopenharmony_ci    paddw        m3, m1
353cabdff1aSopenharmony_ci    psraw        m3, 5
    ; Pack pixels 0..3 and 4..7 into 8 bytes and store; m4 is OP_MOV scratch.
354cabdff1aSopenharmony_ci    packuswb     m0, m3
355cabdff1aSopenharmony_ci    OP_MOV     [r0], m0, m4
    ; Advance to the next source / destination row.
356cabdff1aSopenharmony_ci    add          r1, r3
357cabdff1aSopenharmony_ci    add          r0, r2
358cabdff1aSopenharmony_ci    dec r4d
359cabdff1aSopenharmony_ci    jne .loop
360cabdff1aSopenharmony_ci    REP_RET
361cabdff1aSopenharmony_ci%endmacro
362cabdff1aSopenharmony_ci
363cabdff1aSopenharmony_ciINIT_MMX mmxext
; Instantiate the 8-wide horizontal filter three times. Each variant is
; selected by redefining PW_ROUND (rounding term added before the >> 5) and
; OP_MOV (store macro) before invoking the function macro:
;   put        : PW_ROUND = pw_16, plain store
;   avg        : PW_ROUND = pw_16, averaged with the existing destination
;   put_no_rnd : PW_ROUND = pw_15, plain store
364cabdff1aSopenharmony_ci%define PW_ROUND pw_16
365cabdff1aSopenharmony_ci%define OP_MOV PUT_OP
366cabdff1aSopenharmony_ciMPEG4_QPEL8_H_LOWPASS put
367cabdff1aSopenharmony_ci%define PW_ROUND pw_16
368cabdff1aSopenharmony_ci%define OP_MOV AVG_OP
369cabdff1aSopenharmony_ciMPEG4_QPEL8_H_LOWPASS avg
370cabdff1aSopenharmony_ci%define PW_ROUND pw_15
371cabdff1aSopenharmony_ci%define OP_MOV PUT_OP
372cabdff1aSopenharmony_ciMPEG4_QPEL8_H_LOWPASS put_no_rnd
373cabdff1aSopenharmony_ci
374cabdff1aSopenharmony_ci
375cabdff1aSopenharmony_ci
376cabdff1aSopenharmony_ci%macro QPEL_V_LOW 5
    ; One output row of the vertical 8-tap filter, four 16-bit lanes wide.
    ; On entry m0..m3 hold four consecutive rows of the sliding window.
    ; Memory operands:
    ;   %1 - row paired with %4 (weight -1)
    ;   %2 - row paired with m3 (weight +3)
    ;   %3 - row paired with m2 (weight -6)
    ;   %4 - row entering the window (weight -1); it is loaded into m0
    ;   %5 - destination operand handed to OP_MOV
    ; Result:
    ;   (20*(m0+m1) - 6*(m2+%3) + 3*(m3+%2) - (%1+%4) + PW_ROUND) >> 5,
    ; packed to bytes with unsigned saturation.
    ; Clobbers m4, m5, m6; m7 is handed to OP_MOV as scratch.
377cabdff1aSopenharmony_ci    paddw      m0, m1
378cabdff1aSopenharmony_ci    mova       m4, [pw_20]
379cabdff1aSopenharmony_ci    pmullw     m4, m0
    ; m0 is free now: reuse it for the incoming row %4.
380cabdff1aSopenharmony_ci    mova       m0, %4
381cabdff1aSopenharmony_ci    mova       m5, %1
382cabdff1aSopenharmony_ci    paddw      m5, m0
383cabdff1aSopenharmony_ci    psubw      m4, m5
384cabdff1aSopenharmony_ci    mova       m5, %2
385cabdff1aSopenharmony_ci    mova       m6, %3
386cabdff1aSopenharmony_ci    paddw      m5, m3
387cabdff1aSopenharmony_ci    paddw      m6, m2
    ; ((%2+m3) - 2*(%3+m2)) * 3  ==  3*(%2+m3) - 6*(%3+m2)
388cabdff1aSopenharmony_ci    paddw      m6, m6
389cabdff1aSopenharmony_ci    psubw      m5, m6
390cabdff1aSopenharmony_ci    pmullw     m5, [pw_3]
391cabdff1aSopenharmony_ci    paddw      m4, [PW_ROUND]
392cabdff1aSopenharmony_ci    paddw      m5, m4
393cabdff1aSopenharmony_ci    psraw      m5, 5
394cabdff1aSopenharmony_ci    packuswb   m5, m5
395cabdff1aSopenharmony_ci    OP_MOV     %5, m5, m7
    ; Rotate the register names (x86inc SWAP) so that m0..m2 name the former
    ; m1..m3 and m3 names the register just loaded from %4: the window slides
    ; down one row without any data moves.
396cabdff1aSopenharmony_ci    SWAP 0,1,2,3
397cabdff1aSopenharmony_ci%endmacro
398cabdff1aSopenharmony_ci
399cabdff1aSopenharmony_ci%macro MPEG4_QPEL16_V_LOWPASS 1
; Defines <%1>_mpeg4_qpel16_v_lowpass: vertical 8-tap low-pass filter over a
; 16-pixel-wide block, reading 17 source rows and writing 16 output rows.
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride
;   r4 = loop counter, r5 = pointer into the stack buffer
;   (C prototype is not visible here - confirm against the C declaration.)
; Pass 1 (.looph): zero-extend every source row to words and store it into a
;   544-byte stack buffer organised as four planes of 0x88 bytes (17 rows * 8
;   bytes), one plane per 4-pixel column group.
; Pass 2 (.loopv): for each plane, emit 16 rows with QPEL_V_LOW. The first and
;   last invocations pass repeated / descending row offsets, i.e. rows near the
;   top and bottom are reused for taps that fall outside the 17 stored rows.
; PW_ROUND and OP_MOV must be %defined before the macro is invoked.
400cabdff1aSopenharmony_cicglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
401cabdff1aSopenharmony_ci    movsxdifnidn r2, r2d
402cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
403cabdff1aSopenharmony_ci
    ; Pass 1: 17 source rows.
404cabdff1aSopenharmony_ci    mov         r4d, 17
405cabdff1aSopenharmony_ci    mov          r5, rsp
406cabdff1aSopenharmony_ci    pxor         m7, m7
407cabdff1aSopenharmony_ci.looph:
    ; m0..m3 = pixels 0-3, 4-7, 8-11, 12-15 of this row as words.
408cabdff1aSopenharmony_ci    mova         m0, [r1]
409cabdff1aSopenharmony_ci    mova         m1, [r1]
410cabdff1aSopenharmony_ci    mova         m2, [r1+8]
411cabdff1aSopenharmony_ci    mova         m3, [r1+8]
412cabdff1aSopenharmony_ci    punpcklbw    m0, m7
413cabdff1aSopenharmony_ci    punpckhbw    m1, m7
414cabdff1aSopenharmony_ci    punpcklbw    m2, m7
415cabdff1aSopenharmony_ci    punpckhbw    m3, m7
    ; Plane offsets 0x00, 0x88, 0x110, 0x198 = 0..3 * (17 rows * 8 bytes).
416cabdff1aSopenharmony_ci    mova       [r5], m0
417cabdff1aSopenharmony_ci    mova  [r5+0x88], m1
418cabdff1aSopenharmony_ci    mova [r5+0x110], m2
419cabdff1aSopenharmony_ci    mova [r5+0x198], m3
420cabdff1aSopenharmony_ci    add          r5, 8
421cabdff1aSopenharmony_ci    add          r1, r3
422cabdff1aSopenharmony_ci    dec r4d
423cabdff1aSopenharmony_ci    jne .looph
424cabdff1aSopenharmony_ci
425cabdff1aSopenharmony_ci
    ; Pass 2 setup. r0 is advanced by 14 strides inside one .loopv iteration
    ; (seven "lea r0, [r0+r2*2]"), so adding r1 = 4 - 14*dstStride afterwards
    ; moves r0 back to the first row, 4 bytes further right.
426cabdff1aSopenharmony_ci    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
427cabdff1aSopenharmony_ci    mov         r4d, 4
428cabdff1aSopenharmony_ci    mov          r1, 4
429cabdff1aSopenharmony_ci    neg          r2
430cabdff1aSopenharmony_ci    lea          r1, [r1+r2*8]
431cabdff1aSopenharmony_ci    lea          r1, [r1+r2*4]
432cabdff1aSopenharmony_ci    lea          r1, [r1+r2*2]
433cabdff1aSopenharmony_ci    neg          r2
434cabdff1aSopenharmony_ci    mov          r5, rsp
435cabdff1aSopenharmony_ci.loopv:
436cabdff1aSopenharmony_ci    pxor         m7, m7
    ; Prime the sliding window with rows 0..3 of the current plane.
437cabdff1aSopenharmony_ci    mova         m0, [r5+ 0x0]
438cabdff1aSopenharmony_ci    mova         m1, [r5+ 0x8]
439cabdff1aSopenharmony_ci    mova         m2, [r5+0x10]
440cabdff1aSopenharmony_ci    mova         m3, [r5+0x18]
    ; Output rows 0..15; the 4th operand is the row entering the window.
441cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
442cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
443cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
444cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
445cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
446cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
447cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
448cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
449cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
450cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
451cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
452cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
453cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
454cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
455cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
456cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
457cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
458cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
    ; From here on the incoming-row operand stops at 0x80 (row 16, the last
    ; stored row) and then steps back: 0x80, 0x80, 0x78, 0x70.
459cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
460cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
461cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
462cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
463cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
464cabdff1aSopenharmony_ci
    ; Next plane (next 4-pixel column group) and matching dst column.
465cabdff1aSopenharmony_ci    add    r5, 0x88
466cabdff1aSopenharmony_ci    add    r0, r1
467cabdff1aSopenharmony_ci    dec r4d
468cabdff1aSopenharmony_ci    jne .loopv
469cabdff1aSopenharmony_ci    REP_RET
470cabdff1aSopenharmony_ci%endmacro
471cabdff1aSopenharmony_ci
472cabdff1aSopenharmony_ci%macro PUT_OPH 2-3
    ; "put" store using movh (presumably the half-register move, 4 bytes for
    ; MMX): write register %2 to destination %1.
    ; The optional %3 (scratch register) is accepted only so that PUT_OPH and
    ; AVG_OPH can be invoked with the same argument list; it is not used here.
473cabdff1aSopenharmony_ci    movh %1, %2
474cabdff1aSopenharmony_ci%endmacro
475cabdff1aSopenharmony_ci
476cabdff1aSopenharmony_ci%macro AVG_OPH 2-3
    ; "avg" store using movh (presumably the half-register move, 4 bytes for
    ; MMX): byte-average register %2 with the current contents of destination
    ; %1 (pavgb) and write the result back to %1.
    ; %3 is a scratch register and is required in practice although the
    ; parameter range declares it optional. Clobbers %2 and %3.
477cabdff1aSopenharmony_ci    movh  %3, %1
478cabdff1aSopenharmony_ci    pavgb %2, %3
479cabdff1aSopenharmony_ci    movh  %1, %2
480cabdff1aSopenharmony_ci%endmacro
481cabdff1aSopenharmony_ci
482cabdff1aSopenharmony_ciINIT_MMX mmxext
; Instantiate the 16-wide vertical filter three times. Each variant is
; selected by redefining PW_ROUND (rounding term added before the >> 5) and
; OP_MOV (store macro, movh-based here) before invoking the function macro:
;   put        : PW_ROUND = pw_16, plain store
;   avg        : PW_ROUND = pw_16, averaged with the existing destination
;   put_no_rnd : PW_ROUND = pw_15, plain store
483cabdff1aSopenharmony_ci%define PW_ROUND pw_16
484cabdff1aSopenharmony_ci%define OP_MOV PUT_OPH
485cabdff1aSopenharmony_ciMPEG4_QPEL16_V_LOWPASS put
486cabdff1aSopenharmony_ci%define PW_ROUND pw_16
487cabdff1aSopenharmony_ci%define OP_MOV AVG_OPH
488cabdff1aSopenharmony_ciMPEG4_QPEL16_V_LOWPASS avg
489cabdff1aSopenharmony_ci%define PW_ROUND pw_15
490cabdff1aSopenharmony_ci%define OP_MOV PUT_OPH
491cabdff1aSopenharmony_ciMPEG4_QPEL16_V_LOWPASS put_no_rnd
492cabdff1aSopenharmony_ci
493cabdff1aSopenharmony_ci
494cabdff1aSopenharmony_ci
495cabdff1aSopenharmony_ci%macro MPEG4_QPEL8_V_LOWPASS 1
; Defines <%1>_mpeg4_qpel8_v_lowpass: vertical 8-tap low-pass filter over an
; 8-pixel-wide block, reading 9 source rows and writing 8 output rows.
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride
;   r4 = loop counter, r5 = pointer into the stack buffer
;   (C prototype is not visible here - confirm against the C declaration.)
; Pass 1 (.looph): zero-extend every source row to words and store it into
;   the stack buffer as two planes of 0x48 bytes (9 rows * 8 bytes), one plane
;   per 4-pixel column group. 288 bytes are reserved, of which the code below
;   writes the first 2 * 0x48 = 144.
; Pass 2 (.loopv): for each plane, emit 8 rows with QPEL_V_LOW. The first and
;   last invocations pass repeated / descending row offsets, i.e. rows near the
;   top and bottom are reused for taps that fall outside the 9 stored rows.
; PW_ROUND and OP_MOV must be %defined before the macro is invoked.
496cabdff1aSopenharmony_cicglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
497cabdff1aSopenharmony_ci    movsxdifnidn r2, r2d
498cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
499cabdff1aSopenharmony_ci
    ; Pass 1: 9 source rows.
500cabdff1aSopenharmony_ci    mov         r4d, 9
501cabdff1aSopenharmony_ci    mov          r5, rsp
502cabdff1aSopenharmony_ci    pxor         m7, m7
503cabdff1aSopenharmony_ci.looph:
    ; m0 = pixels 0-3, m1 = pixels 4-7 of this row as words.
504cabdff1aSopenharmony_ci    mova         m0, [r1]
505cabdff1aSopenharmony_ci    mova         m1, [r1]
506cabdff1aSopenharmony_ci    punpcklbw    m0, m7
507cabdff1aSopenharmony_ci    punpckhbw    m1, m7
508cabdff1aSopenharmony_ci    mova       [r5], m0
509cabdff1aSopenharmony_ci    mova  [r5+0x48], m1
510cabdff1aSopenharmony_ci    add          r5, 8
511cabdff1aSopenharmony_ci    add          r1, r3
512cabdff1aSopenharmony_ci    dec r4d
513cabdff1aSopenharmony_ci    jne .looph
514cabdff1aSopenharmony_ci
515cabdff1aSopenharmony_ci
    ; Pass 2 setup. r0 is advanced by 6 strides inside one .loopv iteration
    ; (three "lea r0, [r0+r2*2]"), so adding r1 = 4 - 6*dstStride afterwards
    ; moves r0 back to the first row, 4 bytes further right.
516cabdff1aSopenharmony_ci    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
517cabdff1aSopenharmony_ci    mov         r4d, 2
518cabdff1aSopenharmony_ci    mov          r1, 4
519cabdff1aSopenharmony_ci    neg          r2
520cabdff1aSopenharmony_ci    lea          r1, [r1+r2*4]
521cabdff1aSopenharmony_ci    lea          r1, [r1+r2*2]
522cabdff1aSopenharmony_ci    neg          r2
523cabdff1aSopenharmony_ci    mov          r5, rsp
524cabdff1aSopenharmony_ci.loopv:
525cabdff1aSopenharmony_ci    pxor         m7, m7
    ; Prime the sliding window with rows 0..3 of the current plane.
526cabdff1aSopenharmony_ci    mova         m0, [r5+ 0x0]
527cabdff1aSopenharmony_ci    mova         m1, [r5+ 0x8]
528cabdff1aSopenharmony_ci    mova         m2, [r5+0x10]
529cabdff1aSopenharmony_ci    mova         m3, [r5+0x18]
    ; Output rows 0..7; the 4th operand is the row entering the window. It
    ; stops at 0x40 (row 8, the last stored row) and then steps back:
    ; 0x40, 0x40, 0x38, 0x30.
530cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
531cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
532cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
533cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
534cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
535cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
536cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
537cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
538cabdff1aSopenharmony_ci    lea    r0, [r0+r2*2]
539cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
540cabdff1aSopenharmony_ci    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
541cabdff1aSopenharmony_ci
    ; Next plane (next 4-pixel column group) and matching dst column.
542cabdff1aSopenharmony_ci    add    r5, 0x48
543cabdff1aSopenharmony_ci    add    r0, r1
544cabdff1aSopenharmony_ci    dec r4d
545cabdff1aSopenharmony_ci    jne .loopv
546cabdff1aSopenharmony_ci    REP_RET
547cabdff1aSopenharmony_ci%endmacro
548cabdff1aSopenharmony_ci
549cabdff1aSopenharmony_ciINIT_MMX mmxext
; Instantiate the 8-wide vertical filter three times. Each variant is
; selected by redefining PW_ROUND (rounding term added before the >> 5) and
; OP_MOV (store macro, movh-based here) before invoking the function macro:
;   put        : PW_ROUND = pw_16, plain store
;   avg        : PW_ROUND = pw_16, averaged with the existing destination
;   put_no_rnd : PW_ROUND = pw_15, plain store
550cabdff1aSopenharmony_ci%define PW_ROUND pw_16
551cabdff1aSopenharmony_ci%define OP_MOV PUT_OPH
552cabdff1aSopenharmony_ciMPEG4_QPEL8_V_LOWPASS put
553cabdff1aSopenharmony_ci%define PW_ROUND pw_16
554cabdff1aSopenharmony_ci%define OP_MOV AVG_OPH
555cabdff1aSopenharmony_ciMPEG4_QPEL8_V_LOWPASS avg
556cabdff1aSopenharmony_ci%define PW_ROUND pw_15
557cabdff1aSopenharmony_ci%define OP_MOV PUT_OPH
558cabdff1aSopenharmony_ciMPEG4_QPEL8_V_LOWPASS put_no_rnd
559