;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
25cabdff1aSopenharmony_ci
; Syntax: NASM with the x86inc/x86util abstraction layer (cglobal, mN/rN
; register names, mova/movh, INIT_MMX/INIT_XMM) pulled in by this include.
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Constants defined elsewhere in the project and only imported here.
; pw_16 is added as the rounding term and pw_5 is the pmullw multiplier in
; the 6-tap filters of this file.
cextern pw_16
cextern pw_5
; NOTE(review): pb_0 is not referenced in the part of the file reviewed here.
cextern pb_0

SECTION .text
35cabdff1aSopenharmony_ci
36cabdff1aSopenharmony_ci
; op_avgh val, mem, scratch
; "avg" store for a half register:
;   scratch = movh-load of mem
;   val     = pavgb(val, scratch)   (per-byte rounded average)
;   mem     = movh-store of val
; Clobbers: val and scratch.
%macro op_avgh 3
    movh   %3, %2
    pavgb  %1, %3
    movh   %2, %1
%endmacro
42cabdff1aSopenharmony_ci
; op_avg val, mem [, ignored]
; "avg" store for a full register: val = pavgb(val, mem), then mem = val
; via mova. No scratch register is needed because pavgb reads mem directly;
; the optional third argument is accepted and ignored so call sites can pass
; the same three operands to every op_* macro.
; Clobbers: val.
%macro op_avg 2-3
    pavgb  %1, %2
    mova   %2, %1
%endmacro
47cabdff1aSopenharmony_ci
; op_puth val, mem [, ignored]
; "put" store for a half register: writes val to mem with movh.
; The optional third argument is accepted and ignored so call sites can pass
; the same three operands to every op_* macro.
%macro op_puth 2-3
    movh   %2, %1
%endmacro
51cabdff1aSopenharmony_ci
; op_put val, mem [, ignored]
; "put" store for a full register: writes val to mem with mova.
; The optional third argument is accepted and ignored so call sites can pass
; the same three operands to every op_* macro.
%macro op_put 2-3
    mova   %2, %1
%endmacro
55cabdff1aSopenharmony_ci
; QPEL4_H_LOWPASS_OP put|avg
; Emits <op>_h264_qpel4_h_lowpass(dst, src, dstStride, srcStride).
; Horizontal 6-tap lowpass over 4 rows (loop counter r4d = 4). Per pixel x:
;   v      = 20*(src[x]+src[x+1]) - 5*(src[x-1]+src[x+2]) + (src[x-2]+src[x+3])
;   result = saturate_u8((v + 16) >> 5)
; The result is written with op_puth or averaged into dst with op_avgh.
; The row width is one movh load/store (the x86inc half-register move;
; presumably 32 bits = 4 pixels in MMX mode, confirm against x86inc.asm).
; Registers: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4 = rows left
;            m7 = 0 (byte->word unpack), m4 = pw_5, m5 = pw_16,
;            m6 = scratch for the store op (only read by the avg variant).
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    pxor          m7, m7
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r4d, 4
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0            ; m1 = src[x-1] + src[x+2]
    paddw         m2, m3            ; m2 = src[x]   + src[x+1]
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3            ; m0 = src[x-2] + src[x+3]
    psllw         m2, 2
    psubw         m2, m1
    pmullw        m2, m4            ; m2 = 5*(4*centre - inner) = 20*centre - 5*inner
    paddw         m0, m5            ; + 16 rounding term
    paddw         m0, m2
    psraw         m0, 5
    packuswb      m0, m0            ; saturate words to unsigned bytes
    op_%1h        m0, [r0], m6
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for MMXEXT.
INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
98cabdff1aSopenharmony_ci
; QPEL8_H_LOWPASS_OP put|avg
; Emits <op>_h264_qpel8_h_lowpass(dst, src, dstStride, srcStride) for MMX-sized
; registers. Horizontal 6-tap lowpass, 8 rows (r4d = 8). Per pixel x:
;   v      = 20*(src[x]+src[x+1]) - 5*(src[x-1]+src[x+2]) + (src[x-2]+src[x+3])
;   result = saturate_u8((v + 16) >> 5)
; Each full-register load is split into a low and a high group of words
; (punpcklbw / punpckhbw); m0 accumulates the low group, m1 the high group.
; Registers: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4 = rows left
;            m7 = 0, m6 = pw_5, m4 = scratch argument for the store op.
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2
    paddw         m1, m3
    psllw         m0, 2             ; m0/m1 = 4*(src[x] + src[x+1]), low/high group
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7            ; m3 = high words of [r1-1]; kept intact for the outer tap
    punpcklbw     m4, m7            ; m4 = low words of [r1+2]; kept intact for the outer tap
    punpckhbw     m5, m7
    paddw         m2, m4            ; low group:  src[x-1] + src[x+2]
    paddw         m5, m3            ; high group: src[x-1] + src[x+2] (sum goes to m5, m3 survives)
    psubw         m0, m2
    psubw         m1, m5
    pmullw        m0, m6            ; m0/m1 = 20*centre - 5*inner
    pmullw        m1, m6
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3            ; low group outer taps:  [r1-2] words + saved m3
    paddw         m4, m5            ; high group outer taps: saved m4 + [r1+7] words
    mova          m5, [pw_16]
    paddw         m2, m5
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m1            ; merge both groups, saturating to unsigned bytes
    op_%1         m0, [r0], m4
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for MMXEXT.
INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
158cabdff1aSopenharmony_ci
; QPEL8_H_LOWPASS_OP_XMM put|avg
; Emits <op>_h264_qpel8_h_lowpass(dst, src, dstStride, srcStride) using
; palignr. Horizontal 6-tap lowpass, 8 rows (r4d = 8). Per pixel x:
;   v      = 20*(src[x]+src[x+1]) - 5*(src[x-1]+src[x+2]) + (src[x-2]+src[x+3])
;   result = saturate_u8((v + 16) >> 5)
; One unaligned load starting at src-2 is widened to words (m0 = low words,
; m1 = high words); the five shifted views of that window are then formed
; with palignr at byte offsets 2..10 (one word per 2 bytes).
; Registers: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4 = rows left
;            m7 = 0, m6 = pw_5, m4 = scratch for the store op.
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    movu          m1, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m7
    punpcklbw     m0, m7            ; m0 = words starting at src[x-2]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2         ; m4 = window shifted by 1 word: src[x-1]
    palignr       m3, m0, 4         ; m3 = src[x]
    palignr       m2, m0, 6         ; m2 = src[x+1]
    palignr       m1, m0, 8         ; m1 = src[x+2]
    palignr       m5, m0, 10        ; m5 = src[x+3]
    paddw         m0, m5            ; outer:  src[x-2] + src[x+3]
    paddw         m2, m3            ; centre: src[x]   + src[x+1]
    paddw         m1, m4            ; inner:  src[x-1] + src[x+2]
    psllw         m2, 2
    psubw         m2, m1
    paddw         m0, [pw_16]
    pmullw        m2, m6            ; 20*centre - 5*inner
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for SSSE3 (palignr).
INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
201cabdff1aSopenharmony_ci
202cabdff1aSopenharmony_ci
; QPEL4_H_LOWPASS_L2_OP put|avg
; Emits <op>_h264_qpel4_h_lowpass_l2(dst, src, src2, dstStride, src2Stride).
; Same horizontal 6-tap lowpass as the plain h_lowpass, 4 rows (r5d = 4):
;   v = 20*(src[x]+src[x+1]) - 5*(src[x-1]+src[x+2]) + (src[x-2]+src[x+3])
;   f = saturate_u8((v + 16) >> 5)
; followed by a pavgb with the matching pixels of src2 before the store op.
; Note on strides: r3 (dstStride) advances BOTH dst and src, while r4 only
; advances src2.
; Registers: r0 = dst, r1 = src, r2 = src2, r3 = dst/src stride,
;            r4 = src2 stride, r5 = rows left,
;            m7 = 0, m4 = pw_5, m5 = pw_16, m6 = scratch for the store op.
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    pxor          m7, m7
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r5d, 4
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0            ; inner:  src[x-1] + src[x+2]
    paddw         m2, m3            ; centre: src[x]   + src[x+1]
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3            ; outer:  src[x-2] + src[x+3]
    psllw         m2, 2
    psubw         m2, m1
    pmullw        m2, m4            ; 20*centre - 5*inner
    paddw         m0, m5
    paddw         m0, m2
    movh          m3, [r2]          ; second source row
    psraw         m0, 5
    packuswb      m0, m0
    pavgb         m0, m3            ; average the filtered row with src2
    op_%1h        m0, [r0], m6
    add           r0, r3
    add           r1, r3            ; src shares dstStride
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for MMXEXT.
INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
248cabdff1aSopenharmony_ci
249cabdff1aSopenharmony_ci
; QPEL8_H_LOWPASS_L2_OP put|avg
; Emits <op>_h264_qpel8_h_lowpass_l2(dst, src, src2, dstStride, src2Stride)
; for MMX-sized registers. Horizontal 6-tap lowpass, 8 rows (r5d = 8):
;   v = 20*(src[x]+src[x+1]) - 5*(src[x-1]+src[x+2]) + (src[x-2]+src[x+3])
;   f = saturate_u8((v + 16) >> 5)
; followed by a pavgb with the matching row of src2 before the store op.
; Each load is split into a low and a high group of words; m0 accumulates the
; low group and m1 the high group.
; Note on strides: r3 (dstStride) advances BOTH dst and src, while r4 only
; advances src2.
; Registers: r0 = dst, r1 = src, r2 = src2, r3 = dst/src stride,
;            r4 = src2 stride, r5 = rows left, m7 = 0, m6 = pw_5.
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2
    paddw         m1, m3
    psllw         m0, 2             ; m0/m1 = 4*(src[x] + src[x+1]), low/high group
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7            ; m3 = high words of [r1-1]; kept intact for the outer tap
    punpcklbw     m4, m7            ; m4 = low words of [r1+2]; kept intact for the outer tap
    punpckhbw     m5, m7
    paddw         m2, m4            ; low group:  src[x-1] + src[x+2]
    paddw         m5, m3            ; high group: src[x-1] + src[x+2] (m3 survives)
    psubw         m0, m2
    psubw         m1, m5
    pmullw        m0, m6            ; m0/m1 = 20*centre - 5*inner
    pmullw        m1, m6
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3            ; low group outer taps
    paddw         m4, m5            ; high group outer taps
    mova          m5, [pw_16]
    paddw         m2, m5
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    mova          m4, [r2]          ; second source row
    packuswb      m0, m1
    pavgb         m0, m4            ; average the filtered row with src2
    op_%1         m0, [r0], m4
    add           r0, r3
    add           r1, r3            ; src shares dstStride
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for MMXEXT.
INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
312cabdff1aSopenharmony_ci
313cabdff1aSopenharmony_ci
; QPEL8_H_LOWPASS_L2_OP_XMM put|avg
; Emits <op>_h264_qpel8_h_lowpass_l2(dst, src, src2, dstStride, src2Stride)
; using palignr. Horizontal 6-tap lowpass, 8 rows (r5d = 8):
;   v = 20*(src[x]+src[x+1]) - 5*(src[x-1]+src[x+2]) + (src[x-2]+src[x+3])
;   f = saturate_u8((v + 16) >> 5)
; followed by a pavgb with the matching row of src2 before the store op.
; One unaligned load starting at src-2 is widened to words and the shifted
; views of that window are formed with palignr (2 bytes per word).
; Note on strides: r3 (dstStride) advances BOTH dst and src, while r4 only
; advances src2.
; Registers: r0 = dst, r1 = src, r2 = src2, r3 = dst/src stride,
;            r4 = src2 stride, r5 = rows left,
;            m7 = 0, m6 = pw_5, m4 = scratch for the store op.
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    lddqu         m1, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m7
    punpcklbw     m0, m7            ; m0 = words starting at src[x-2]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2         ; src[x-1]
    palignr       m3, m0, 4         ; src[x]
    palignr       m2, m0, 6         ; src[x+1]
    palignr       m1, m0, 8         ; src[x+2]
    palignr       m5, m0, 10        ; src[x+3]
    paddw         m0, m5            ; outer
    paddw         m2, m3            ; centre
    paddw         m1, m4            ; inner
    psllw         m2, 2
    movh          m3, [r2]          ; second source row (m3 is free again here)
    psubw         m2, m1
    paddw         m0, [pw_16]
    pmullw        m2, m6            ; 20*centre - 5*inner
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2
    pavgb         m2, m3            ; average the filtered row with src2
    op_%1h        m2, [r0], m4
    add           r1, r3            ; src shares dstStride
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for SSSE3 (palignr).
INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
359cabdff1aSopenharmony_ci
360cabdff1aSopenharmony_ci
; FILT_V put|avg
; Produces ONE output row of the vertical 6-tap lowpass and slides the row
; window down by one.
;
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride  (r0, r1, r2, r3).
;
; On entry: m0..m4 hold five consecutive source rows widened to words
;           (oldest in m0), r1 points at the next (sixth) row, m7 = 0.
; Computes: m6 = saturate_u8((20*(m2+m3) - 5*(m1+m4) + (m0+m5) + 16) >> 5)
;           with m5 = the row loaded from [r1], then stores m6 through
;           op_puth / op_avgh. m0 is passed as the store scratch: it belongs
;           to the oldest row, which is no longer needed.
; On exit:  r1 += r3, r0 += r2, and SWAP renames the registers so the next
;           invocation again sees its five newest rows in m0..m4
;           (x86inc SWAP changes names only; no data is moved).
; Clobbers: m5, m6 and the register that was m0.
%macro FILT_V 1
    mova      m6, m2
    movh      m5, [r1]
    paddw     m6, m3
    psllw     m6, 2
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, [pw_16]
    add       r1, r3
    paddw     m0, m5
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6
    op_%1h    m6, [r0], m0 ; 1
    add       r0, r2
    SWAP       0, 1, 2, 3, 4, 5
%endmacro
382cabdff1aSopenharmony_ci
; QPEL4_V_LOWPASS_OP put|avg
; Emits <op>_h264_qpel4_v_lowpass(dst, src, dstStride, srcStride).
; Vertical 6-tap lowpass producing 4 output rows (four FILT_V expansions).
; src is first moved up by two rows, then rows -2..+2 are preloaded into
; m0..m4 as words; each FILT_V loads one further row and emits one output row.
; Registers: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, m7 = 0.
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3
    sub           r1, r3            ; r1 = src - 2*srcStride
    pxor          m7, m7
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3            ; r1 -> sixth row, as FILT_V expects
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    RET
%endmacro

; Instantiate the put and avg variants for MMXEXT.
INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci
415cabdff1aSopenharmony_ci
; QPEL8OR16_V_LOWPASS_OP put|avg
; Vertical 6-tap lowpass producing 8 output rows, or 16 when h (r4d) == 16.
; Any other h value yields 8 rows (only the comparison with 16 is made).
; Two entry-point flavours, chosen at assembly time:
;   * sse2:      <op>_h264_qpel8or16_v_lowpass; moves src up two rows itself.
;   * otherwise: <op>_h264_qpel8or16_v_lowpass_op; src is used as passed
;     (presumably the caller has already applied the -2 row offset;
;     TODO confirm against the C wrapper).
; Rows are preloaded into m0..m4 as words; each FILT_V expansion loads one
; more row and emits one output row.
; Registers: r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4 = h,
;            m7 = 0.
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3
    sub           r1, r3            ; r1 = src - 2*srcStride
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
%endif
    pxor          m7, m7
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3            ; r1 -> sixth row, as FILT_V expects
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    cmp          r4d, 16            ; 8 rows done; continue only for h == 16
    jne         .end
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
.end:
    REP_RET
%endmacro

; Instantiate the put and avg variants for SSE2.
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
467cabdff1aSopenharmony_ci
468cabdff1aSopenharmony_ci
; FILT_HV offset
; Vertical stage of the two-pass (hv) filter: produces ONE row of 16-bit
; intermediates and slides the row window down by one.
;
; All functions that use this are required to have args:
; src, tmp, srcStride  (r0, r1, r2)
;
; On entry: m0..m4 hold five consecutive source rows widened to words
;           (oldest in m0), r0 points at the next (sixth) row, m7 = 0.
; Computes: m6 = 20*(m2+m3) - 5*(m1+m4) + (m0+m5) + 16, with m5 = the row
;           loaded from [r0]. Unlike the single-pass vertical filter there is
;           no shift and no pack: the full word result is written to
;           [r1 + offset].
; On exit:  r0 += r2, and SWAP renames the registers so the next invocation
;           again sees its five newest rows in m0..m4 (x86inc SWAP changes
;           names only; no data is moved). r1 is not modified; callers step
;           through tmp via the offset argument.
; Clobbers: m5, m6 and the register that was m0.
%macro FILT_HV 1 ; offset
    mova           m6, m2
    movh           m5, [r0]
    paddw          m6, m3
    psllw          m6, 2
    paddw          m0, [pw_16]
    psubw          m6, m1
    psubw          m6, m4
    punpcklbw      m5, m7
    pmullw         m6, [pw_5]
    paddw          m0, m5
    add            r0, r2
    paddw          m6, m0
    mova      [r1+%1], m6
    SWAP            0, 1, 2, 3, 4, 5
%endmacro
487cabdff1aSopenharmony_ci
; QPEL4_HV1_LOWPASS_OP put|avg
; Emits the two passes of the 4x4 centre (hv) filter.
;
; <op>_h264_qpel4_hv_lowpass_v(src, tmp, srcStride)
;   Vertical pass: preloads five rows starting at src, then four FILT_HV
;   expansions write four rows of 16-bit intermediates to tmp with a row
;   pitch of 24 bytes. The body does not depend on put/avg; only the symbol
;   name does.
;   NOTE(review): each call stores one register of words per row, while the
;   horizontal pass reads words 0..8 of every row, so the caller presumably
;   invokes this several times on adjacent column groups. Confirm in the C
;   wrapper.
;
; <op>_h264_qpel4_hv_lowpass_h(tmp, dst, dstStride)
;   Horizontal pass over the 16-bit intermediates t[], 4 rows (r3d = 4):
;     a = t[x] + t[x+5],  b = t[x+1] + t[x+4],  c = t[x+2] + t[x+3]
;     v = ((((a - b) >> 2) - b + c) >> 2) + c     ; ~ (a - 5*b + 20*c) / 16
;     result = saturate_u8(v >> 6)
;   The filter is evaluated with staged shifts instead of a multiply; the
;   +16 added per intermediate by FILT_HV supplies the rounding term.
;   m7 is the scratch register for the store op (only read by avg).
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn  r2, r2d
    pxor          m7, m7
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2            ; r0 -> sixth row, as FILT_HV expects
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV       0*24
    FILT_HV       1*24
    FILT_HV       2*24
    FILT_HV       3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn  r2, r2d
    mov          r3d, 4
.loop:
    mova          m0, [r0]
    paddw         m0, [r0+10]       ; a = t[x]   + t[x+5]
    mova          m1, [r0+2]
    paddw         m1, [r0+8]        ; b = t[x+1] + t[x+4]
    mova          m2, [r0+4]
    paddw         m2, [r0+6]        ; c = t[x+2] + t[x+3]
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1
    paddsw        m0, m2
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6
    packuswb      m0, m0
    op_%1h        m0, [r1], m7
    add           r0, 24            ; next tmp row (24-byte pitch)
    add           r1, r2
    dec          r3d
    jnz        .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for MMXEXT.
INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
540cabdff1aSopenharmony_ci
; QPEL8OR16_HV1_LOWPASS_OP put
; Emits <op>_h264_qpel8or16_hv1_lowpass_op(src, tmp, srcStride, size).
; Vertical pass of the two-pass (hv) filter: preloads five rows starting at
; src, then each FILT_HV expansion loads one more row and writes one row of
; 16-bit intermediates to tmp with a row pitch of 48 bytes.
; Emits 8 rows, or 16 when size (r3d) == 16; any other value yields 8 rows
; because only the comparison with 16 is made.
; Registers: r0 = src, r1 = tmp, r2 = srcStride, r3 = size, m7 = 0.
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn  r2, r2d
    pxor          m7, m7
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2            ; r0 -> sixth row, as FILT_HV expects
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV     0*48
    FILT_HV     1*48
    FILT_HV     2*48
    FILT_HV     3*48
    FILT_HV     4*48
    FILT_HV     5*48
    FILT_HV     6*48
    FILT_HV     7*48
    cmp          r3d, 16            ; 8 rows done; continue only for size == 16
    jne         .end
    FILT_HV     8*48
    FILT_HV     9*48
    FILT_HV    10*48
    FILT_HV    11*48
    FILT_HV    12*48
    FILT_HV    13*48
    FILT_HV    14*48
    FILT_HV    15*48
.end:
    REP_RET
%endmacro

; Only the put variant is instantiated: this pass writes to tmp, never to dst.
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
582cabdff1aSopenharmony_ci
583cabdff1aSopenharmony_ci
584cabdff1aSopenharmony_ci
; QPEL8OR16_HV2_LOWPASS_OP put|avg
; Emits <op>_h264_qpel8or16_hv2_lowpass_op(dst, tmp, dstStride, unused, h).
; Horizontal pass of the two-pass (hv) filter over 16-bit intermediates t[]
; (48-byte row pitch in tmp), h rows, 8 output pixels per row handled as two
; groups (m0 = first group, m3 = second group). Per pixel x:
;   a = t[x] + t[x+5],  b = t[x+1] + t[x+4],  c = t[x+2] + t[x+3]
;   v = ((((a - b) >> 2) - b + c) >> 2) + c     ; ~ (a - 5*b + 20*c) / 16
;   result = saturate_u8(v >> 6)
; Registers: r0 = dst, r1 = tmp, r2 = dstStride, r3 = not read,
;            r4 = h (rows left), m7 = scratch argument for the store op.
; The loop is a do-while (dec / jne): h must be non-zero on entry.
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn  r2, r2d
.loop:
    mova          m0, [r1]
    mova          m3, [r1+8]
    mova          m1, [r1+2]
    mova          m4, [r1+10]
    paddw         m0, m4            ; group 0: a = t[x]   + t[x+5]
    paddw         m1, m3            ; group 0: b = t[x+1] + t[x+4]
    paddw         m3, [r1+18]       ; group 1: a
    paddw         m4, [r1+16]       ; group 1: b
    mova          m2, [r1+4]
    mova          m5, [r1+12]
    paddw         m2, [r1+6]        ; group 0: c = t[x+2] + t[x+3]
    paddw         m5, [r1+14]       ; group 1: c
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1
    psubw         m3, m4
    paddsw        m0, m2
    paddsw        m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6
    psraw         m3, 6
    packuswb      m0, m3            ; merge both groups, saturating to unsigned bytes
    op_%1         m0, [r0], m7
    add           r1, 48            ; next tmp row (48-byte pitch)
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

; Instantiate the put and avg variants for MMXEXT.
INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
628cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; QPEL8OR16_HV2_LOWPASS_OP_XMM <put|avg>
;
; Emits %1_h264_qpel8or16_hv2_lowpass(dst, tmp, dstStride, tmpStride, size).
;
; Each row of 16-bit intermediates t[] read from tmp (r1) is filtered
; horizontally. With
;     a = t[x] + t[x+5],  b = t[x+1] + t[x+4],  c = t[x+2] + t[x+3]
; every output pixel is
;     (((((a - b) >> 2) - b + c) >> 2) + c) >> 6
; which is (a - 5*b + 20*c) / 1024 evaluated with truncating intermediate
; shifts. The result is packed to bytes with unsigned saturation and handed to
; op_%1 / op_%1h together with the destination row at dst (r0).
;
; size == 16 selects the 16-pixel-wide path (.op16); any other value takes the
; 8-pixel-wide path (.loop8). In both paths size (r4d) is also the row counter.
;
; tmp advances by a fixed 48 bytes per row; tmpStride (r3) is widened but not
; referenced afterwards.
;
; NOTE(review): op_%1 / op_%1h are defined outside this section; their third
; operand is presumably a scratch register -- confirm against their definition.
; NOTE(review): mova is x86inc's aligned move, so tmp is presumably 16-byte
; aligned -- confirm against the caller.
;-----------------------------------------------------------------------------
629cabdff1aSopenharmony_ci%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
630cabdff1aSopenharmony_cicglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
631cabdff1aSopenharmony_ci    movsxdifnidn  r2, r2d
632cabdff1aSopenharmony_ci    movsxdifnidn  r3, r3d
633cabdff1aSopenharmony_ci    cmp          r4d, 16
634cabdff1aSopenharmony_ci    je         .op16
    ; ---- 8 pixels per row ---------------------------------------------------
635cabdff1aSopenharmony_ci.loop8:
636cabdff1aSopenharmony_ci    mova          m1, [r1+16]               ; t[8..15]
637cabdff1aSopenharmony_ci    mova          m0, [r1]                  ; t[0..7]
638cabdff1aSopenharmony_ci    mova          m2, m1
639cabdff1aSopenharmony_ci    mova          m3, m1
640cabdff1aSopenharmony_ci    mova          m4, m1
641cabdff1aSopenharmony_ci    mova          m5, m1
    ; palignr by 2*k bytes extracts the window t[k..k+7] from the m1:m0 pair
642cabdff1aSopenharmony_ci    palignr       m5, m0, 10                ; t[5..12]
643cabdff1aSopenharmony_ci    palignr       m4, m0, 8                 ; t[4..11]
644cabdff1aSopenharmony_ci    palignr       m3, m0, 6                 ; t[3..10]
645cabdff1aSopenharmony_ci    palignr       m2, m0, 4                 ; t[2..9]
646cabdff1aSopenharmony_ci    palignr       m1, m0, 2                 ; t[1..8]
647cabdff1aSopenharmony_ci    paddw         m0, m5                    ; a = t[x]   + t[x+5]
648cabdff1aSopenharmony_ci    paddw         m1, m4                    ; b = t[x+1] + t[x+4]
649cabdff1aSopenharmony_ci    paddw         m2, m3                    ; c = t[x+2] + t[x+3]
650cabdff1aSopenharmony_ci    psubw         m0, m1
651cabdff1aSopenharmony_ci    psraw         m0, 2
652cabdff1aSopenharmony_ci    psubw         m0, m1
653cabdff1aSopenharmony_ci    paddw         m0, m2
654cabdff1aSopenharmony_ci    psraw         m0, 2
655cabdff1aSopenharmony_ci    paddw         m0, m2
656cabdff1aSopenharmony_ci    psraw         m0, 6
657cabdff1aSopenharmony_ci    packuswb      m0, m0                    ; 8 result bytes, duplicated in both halves
658cabdff1aSopenharmony_ci    op_%1h        m0, [r0], m7
659cabdff1aSopenharmony_ci    add           r1, 48                    ; next tmp row (fixed 48-byte pitch)
660cabdff1aSopenharmony_ci    add           r0, r2                    ; next dst row
661cabdff1aSopenharmony_ci    dec          r4d
662cabdff1aSopenharmony_ci    jne       .loop8
663cabdff1aSopenharmony_ci    jmp        .done
    ; ---- 16 pixels per row --------------------------------------------------
    ; Pixels 8..15 are computed in m0/m1/m2 from t[8..23], pixels 0..7 in
    ; m3/m4/m5 from t[0..15]; the two halves are interleaved instruction-wise.
664cabdff1aSopenharmony_ci.op16:
665cabdff1aSopenharmony_ci    mova          m4, [r1+32]               ; t[16..23]
666cabdff1aSopenharmony_ci    mova          m5, [r1+16]               ; t[8..15]
667cabdff1aSopenharmony_ci    mova          m7, [r1]                  ; t[0..7]
668cabdff1aSopenharmony_ci    mova          m3, m4
669cabdff1aSopenharmony_ci    mova          m2, m4
670cabdff1aSopenharmony_ci    mova          m1, m4
671cabdff1aSopenharmony_ci    mova          m0, m4
672cabdff1aSopenharmony_ci    palignr       m0, m5, 10                ; t[13..20]
673cabdff1aSopenharmony_ci    palignr       m1, m5, 8                 ; t[12..19]
674cabdff1aSopenharmony_ci    palignr       m2, m5, 6                 ; t[11..18]
675cabdff1aSopenharmony_ci    palignr       m3, m5, 4                 ; t[10..17]
676cabdff1aSopenharmony_ci    palignr       m4, m5, 2                 ; t[9..16]
677cabdff1aSopenharmony_ci    paddw         m0, m5                    ; a (pixels 8..15)
678cabdff1aSopenharmony_ci    paddw         m1, m4                    ; b (pixels 8..15)
679cabdff1aSopenharmony_ci    paddw         m2, m3                    ; c (pixels 8..15)
680cabdff1aSopenharmony_ci    mova          m6, m5
681cabdff1aSopenharmony_ci    mova          m4, m5
682cabdff1aSopenharmony_ci    mova          m3, m5
683cabdff1aSopenharmony_ci    palignr       m4, m7, 8                 ; t[4..11]
684cabdff1aSopenharmony_ci    palignr       m6, m7, 2                 ; t[1..8]
685cabdff1aSopenharmony_ci    palignr       m3, m7, 10                ; t[5..12]
686cabdff1aSopenharmony_ci    paddw         m4, m6                    ; b (pixels 0..7)
687cabdff1aSopenharmony_ci    mova          m6, m5
688cabdff1aSopenharmony_ci    palignr       m5, m7, 6                 ; t[3..10]
689cabdff1aSopenharmony_ci    palignr       m6, m7, 4                 ; t[2..9]
690cabdff1aSopenharmony_ci    paddw         m3, m7                    ; a (pixels 0..7); last data use of m7
691cabdff1aSopenharmony_ci    paddw         m5, m6                    ; c (pixels 0..7)
692cabdff1aSopenharmony_ci    psubw         m0, m1
693cabdff1aSopenharmony_ci    psubw         m3, m4
694cabdff1aSopenharmony_ci    psraw         m0, 2
695cabdff1aSopenharmony_ci    psraw         m3, 2
696cabdff1aSopenharmony_ci    psubw         m0, m1
697cabdff1aSopenharmony_ci    psubw         m3, m4
698cabdff1aSopenharmony_ci    paddw         m0, m2
699cabdff1aSopenharmony_ci    paddw         m3, m5
700cabdff1aSopenharmony_ci    psraw         m0, 2
701cabdff1aSopenharmony_ci    psraw         m3, 2
702cabdff1aSopenharmony_ci    paddw         m0, m2
703cabdff1aSopenharmony_ci    paddw         m3, m5
704cabdff1aSopenharmony_ci    psraw         m0, 6
705cabdff1aSopenharmony_ci    psraw         m3, 6
706cabdff1aSopenharmony_ci    packuswb      m3, m0                    ; low 8 bytes = pixels 0..7, high 8 = pixels 8..15
707cabdff1aSopenharmony_ci    op_%1         m3, [r0], m7
708cabdff1aSopenharmony_ci    add           r1, 48                    ; next tmp row (fixed 48-byte pitch)
709cabdff1aSopenharmony_ci    add           r0, r2                    ; next dst row
710cabdff1aSopenharmony_ci    dec          r4d
711cabdff1aSopenharmony_ci    jne        .op16
712cabdff1aSopenharmony_ci.done:
713cabdff1aSopenharmony_ci    REP_RET
714cabdff1aSopenharmony_ci%endmacro
715cabdff1aSopenharmony_ci
; SSSE3 instantiations (palignr requires SSSE3): put and avg flavours.
716cabdff1aSopenharmony_ciINIT_XMM ssse3
717cabdff1aSopenharmony_ciQPEL8OR16_HV2_LOWPASS_OP_XMM put
718cabdff1aSopenharmony_ciQPEL8OR16_HV2_LOWPASS_OP_XMM avg
719cabdff1aSopenharmony_ci
720cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; PIXELS4_L2_SHIFT5 <put|avg>
;
; Emits %1_pixels4_l2_shift5(dst, src16, src8, dstStride, src8Stride, h).
;
; For each of four rows: load 16-bit samples from src16 (r1), arithmetic-shift
; them right by 5, pack to bytes with unsigned saturation, take the rounding
; byte average (pavgb) with the matching src8 (r2) row, and hand the result to
; op_%1h for the dst (r0) row.
;
; The four rows are fully unrolled; src16 rows sit at byte offsets 0, 24, 48
; and 72 from r1. The h argument (r5) is never read here.
;
; NOTE(review): op_%1h is defined outside this section; presumably it writes
; only the low half of the register and uses its third operand as scratch --
; confirm against its definition.
; NOTE(review): with the MMX instantiation below, pavgb's memory operand is a
; full 8-byte read from src8 -- confirm that callers tolerate this.
;-----------------------------------------------------------------------------
721cabdff1aSopenharmony_ci%macro PIXELS4_L2_SHIFT5 1
722cabdff1aSopenharmony_cicglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
723cabdff1aSopenharmony_ci    movsxdifnidn  r3, r3d
724cabdff1aSopenharmony_ci    movsxdifnidn  r4, r4d
    ; rows 0 and 1
725cabdff1aSopenharmony_ci    mova          m0, [r1]
726cabdff1aSopenharmony_ci    mova          m1, [r1+24]
727cabdff1aSopenharmony_ci    psraw         m0, 5
728cabdff1aSopenharmony_ci    psraw         m1, 5
729cabdff1aSopenharmony_ci    packuswb      m0, m0
730cabdff1aSopenharmony_ci    packuswb      m1, m1
731cabdff1aSopenharmony_ci    pavgb         m0, [r2]
732cabdff1aSopenharmony_ci    pavgb         m1, [r2+r4]
733cabdff1aSopenharmony_ci    op_%1h        m0, [r0], m4
734cabdff1aSopenharmony_ci    op_%1h        m1, [r0+r3], m5
    ; step src8 and dst down two rows; src16 is addressed by fixed offsets
735cabdff1aSopenharmony_ci    lea           r2, [r2+r4*2]
736cabdff1aSopenharmony_ci    lea           r0, [r0+r3*2]
    ; rows 2 and 3
737cabdff1aSopenharmony_ci    mova          m0, [r1+48]
738cabdff1aSopenharmony_ci    mova          m1, [r1+72]
739cabdff1aSopenharmony_ci    psraw         m0, 5
740cabdff1aSopenharmony_ci    psraw         m1, 5
741cabdff1aSopenharmony_ci    packuswb      m0, m0
742cabdff1aSopenharmony_ci    packuswb      m1, m1
743cabdff1aSopenharmony_ci    pavgb         m0, [r2]
744cabdff1aSopenharmony_ci    pavgb         m1, [r2+r4]
745cabdff1aSopenharmony_ci    op_%1h        m0, [r0], m4
746cabdff1aSopenharmony_ci    op_%1h        m1, [r0+r3], m5
747cabdff1aSopenharmony_ci    RET
748cabdff1aSopenharmony_ci%endmacro
749cabdff1aSopenharmony_ci
; MMX (mmxext) instantiations: put and avg flavours.
750cabdff1aSopenharmony_ciINIT_MMX mmxext
751cabdff1aSopenharmony_ciPIXELS4_L2_SHIFT5 put
752cabdff1aSopenharmony_ciPIXELS4_L2_SHIFT5 avg
754cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; PIXELS8_L2_SHIFT5 <put|avg>
;
; Emits %1_pixels8_l2_shift5(dst, src16, src8, dstStride, src8Stride, h).
;
; Two rows per iteration: the 16-bit samples of each src16 (r1) row are
; arithmetic-shifted right by 5, packed to bytes with unsigned saturation,
; averaged with rounding (pavgb) against the matching src8 (r2) row, and handed
; to op_%1 for the dst (r0) row.
;
; src16 rows are a fixed 48 bytes apart. The loop subtracts 2 from h (r5d) and
; exits only when it reaches exactly zero, so h is expected to be a positive
; even number.
;
; NOTE(review): op_%1 is defined outside this section; its third operand is
; presumably a scratch register -- confirm against its definition.
;-----------------------------------------------------------------------------
755cabdff1aSopenharmony_ci%macro PIXELS8_L2_SHIFT5 1
756cabdff1aSopenharmony_cicglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
757cabdff1aSopenharmony_ci    movsxdifnidn  r3, r3d
758cabdff1aSopenharmony_ci    movsxdifnidn  r4, r4d
759cabdff1aSopenharmony_ci.loop:
760cabdff1aSopenharmony_ci    mova          m0, [r1]                  ; row 0, first register's worth
761cabdff1aSopenharmony_ci    mova          m1, [r1+8]                ; row 0, next 8 bytes
762cabdff1aSopenharmony_ci    mova          m2, [r1+48]               ; row 1, first register's worth
763cabdff1aSopenharmony_ci    mova          m3, [r1+48+8]             ; row 1, next 8 bytes
764cabdff1aSopenharmony_ci    psraw         m0, 5
765cabdff1aSopenharmony_ci    psraw         m1, 5
766cabdff1aSopenharmony_ci    psraw         m2, 5
767cabdff1aSopenharmony_ci    psraw         m3, 5
768cabdff1aSopenharmony_ci    packuswb      m0, m1                    ; row 0 as saturated bytes
769cabdff1aSopenharmony_ci    packuswb      m2, m3                    ; row 1 as saturated bytes
770cabdff1aSopenharmony_ci    pavgb         m0, [r2]
771cabdff1aSopenharmony_ci    pavgb         m2, [r2+r4]
772cabdff1aSopenharmony_ci    op_%1         m0, [r0], m4
773cabdff1aSopenharmony_ci    op_%1         m2, [r0+r3], m5
774cabdff1aSopenharmony_ci    lea           r2, [r2+2*r4]             ; src8  += 2 rows
775cabdff1aSopenharmony_ci    add           r1, 48*2                  ; src16 += 2 rows (48-byte pitch)
776cabdff1aSopenharmony_ci    lea           r0, [r0+2*r3]             ; dst   += 2 rows
777cabdff1aSopenharmony_ci    sub          r5d, 2
778cabdff1aSopenharmony_ci    jne        .loop
779cabdff1aSopenharmony_ci    REP_RET
780cabdff1aSopenharmony_ci%endmacro
781cabdff1aSopenharmony_ci
; MMX (mmxext) instantiations: put and avg flavours.
782cabdff1aSopenharmony_ciINIT_MMX mmxext
783cabdff1aSopenharmony_ciPIXELS8_L2_SHIFT5 put
784cabdff1aSopenharmony_ciPIXELS8_L2_SHIFT5 avg
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; QPEL16_H_LOWPASS_L2_OP <put|avg>            (x86-64 only: uses m8..m15)
;
; Emits %1_h264_qpel16_h_lowpass_l2(dst, src, src2, dstStride, src2Stride).
;
; Processes 16 rows of 16 pixels. With s[] the src bytes zero-extended to
; 16 bits and
;     a = s[x-2] + s[x+3],  b = s[x-1] + s[x+2],  c = s[x] + s[x+1]
; each pixel is (a - 5*b + 20*c + 16) >> 5, i.e. the 6-tap filter
; (1, -5, 20, 20, -5, 1) with rounding. The result is packed to bytes with
; unsigned saturation, averaged with rounding (pavgb) against the src2 row,
; and handed to op_%1 for the dst row.
;
; Every row reads src bytes [-2 .. 21] via two unaligned 16-byte loads.
; Both src (r1) and dst (r0) advance by dstStride (r3); src2 (r2) advances by
; src2Stride (r4). r5d is a local row counter, not an argument.
;
; NOTE(review): pw_5 / pw_16 are presumably packed words of 5 and 16 -- confirm
; against their definitions. op_%1 is defined outside this section; its third
; operand is presumably a scratch register.
;-----------------------------------------------------------------------------
787cabdff1aSopenharmony_ci%if ARCH_X86_64
788cabdff1aSopenharmony_ci%macro QPEL16_H_LOWPASS_L2_OP 1
789cabdff1aSopenharmony_cicglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
790cabdff1aSopenharmony_ci    movsxdifnidn  r3, r3d
791cabdff1aSopenharmony_ci    movsxdifnidn  r4, r4d
792cabdff1aSopenharmony_ci    mov          r5d, 16                    ; row counter
793cabdff1aSopenharmony_ci    pxor         m15, m15                   ; zero, for byte -> word unpacking
794cabdff1aSopenharmony_ci    mova         m14, [pw_5]
795cabdff1aSopenharmony_ci    mova         m13, [pw_16]
796cabdff1aSopenharmony_ci.loop:
797cabdff1aSopenharmony_ci    lddqu         m1, [r1+6]                ; src bytes [6..21]
798cabdff1aSopenharmony_ci    lddqu         m7, [r1-2]                ; src bytes [-2..13]
799cabdff1aSopenharmony_ci    mova          m0, m1
800cabdff1aSopenharmony_ci    punpckhbw     m1, m15                   ; m1 = s[14..21]
801cabdff1aSopenharmony_ci    punpcklbw     m0, m15                   ; m0 = s[6..13]
802cabdff1aSopenharmony_ci    punpcklbw     m7, m15                   ; m7 = s[-2..5]
803cabdff1aSopenharmony_ci    mova          m2, m1
804cabdff1aSopenharmony_ci    mova          m6, m0
805cabdff1aSopenharmony_ci    mova          m3, m1
806cabdff1aSopenharmony_ci    mova          m8, m0
807cabdff1aSopenharmony_ci    mova          m4, m1
808cabdff1aSopenharmony_ci    mova          m9, m0
809cabdff1aSopenharmony_ci    mova         m12, m0
810cabdff1aSopenharmony_ci    mova         m11, m1
    ; Registers fed from m1:m0 serve pixels 8..15, those fed from m0:m7
    ; serve pixels 0..7.
811cabdff1aSopenharmony_ci    palignr      m11, m0, 10                ; s[11..18]
812cabdff1aSopenharmony_ci    palignr      m12, m7, 10                ; s[3..10]
813cabdff1aSopenharmony_ci    palignr       m4, m0, 2                 ; s[7..14]
814cabdff1aSopenharmony_ci    palignr       m9, m7, 2                 ; s[-1..6]
815cabdff1aSopenharmony_ci    palignr       m3, m0, 4                 ; s[8..15]
816cabdff1aSopenharmony_ci    palignr       m8, m7, 4                 ; s[0..7]
817cabdff1aSopenharmony_ci    palignr       m2, m0, 6                 ; s[9..16]
818cabdff1aSopenharmony_ci    palignr       m6, m7, 6                 ; s[1..8]
819cabdff1aSopenharmony_ci    paddw        m11, m0                    ; a (pixels 8..15)
820cabdff1aSopenharmony_ci    palignr       m1, m0, 8                 ; s[10..17]
821cabdff1aSopenharmony_ci    palignr       m0, m7, 8                 ; s[2..9]
822cabdff1aSopenharmony_ci    paddw         m7, m12                   ; a (pixels 0..7)
823cabdff1aSopenharmony_ci    paddw         m2, m3                    ; c (pixels 8..15)
824cabdff1aSopenharmony_ci    paddw         m6, m8                    ; c (pixels 0..7)
825cabdff1aSopenharmony_ci    paddw         m1, m4                    ; b (pixels 8..15)
826cabdff1aSopenharmony_ci    paddw         m0, m9                    ; b (pixels 0..7)
827cabdff1aSopenharmony_ci    psllw         m2, 2                     ; 4c
828cabdff1aSopenharmony_ci    psllw         m6, 2
829cabdff1aSopenharmony_ci    psubw         m2, m1                    ; 4c - b
830cabdff1aSopenharmony_ci    psubw         m6, m0
831cabdff1aSopenharmony_ci    paddw        m11, m13                   ; a + rounding constant
832cabdff1aSopenharmony_ci    paddw         m7, m13
833cabdff1aSopenharmony_ci    pmullw        m2, m14                   ; 20c - 5b
834cabdff1aSopenharmony_ci    pmullw        m6, m14
835cabdff1aSopenharmony_ci    lddqu         m3, [r2]                  ; 16 bytes of the src2 row
836cabdff1aSopenharmony_ci    paddw         m2, m11                   ; a - 5b + 20c + round; last data use of m11
837cabdff1aSopenharmony_ci    paddw         m6, m7
838cabdff1aSopenharmony_ci    psraw         m2, 5
839cabdff1aSopenharmony_ci    psraw         m6, 5
840cabdff1aSopenharmony_ci    packuswb      m6, m2                    ; low 8 bytes = pixels 0..7, high 8 = pixels 8..15
841cabdff1aSopenharmony_ci    pavgb         m6, m3                    ; rounding average with src2
842cabdff1aSopenharmony_ci    op_%1         m6, [r0], m11
843cabdff1aSopenharmony_ci    add           r1, r3                    ; src advances by dstStride
844cabdff1aSopenharmony_ci    add           r0, r3
845cabdff1aSopenharmony_ci    add           r2, r4
846cabdff1aSopenharmony_ci    dec          r5d
847cabdff1aSopenharmony_ci    jg         .loop
848cabdff1aSopenharmony_ci    REP_RET
849cabdff1aSopenharmony_ci%endmacro
850cabdff1aSopenharmony_ci
; SSSE3 instantiations (palignr requires SSSE3): put and avg flavours.
851cabdff1aSopenharmony_ciINIT_XMM ssse3
852cabdff1aSopenharmony_ciQPEL16_H_LOWPASS_L2_OP put
853cabdff1aSopenharmony_ciQPEL16_H_LOWPASS_L2_OP avg
854cabdff1aSopenharmony_ci%endif
855