1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (C) 2011 x264 project
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
9cabdff1aSopenharmony_ci;*
10cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
11cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
12cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
13cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
14cabdff1aSopenharmony_ci;*
15cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
16cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
19cabdff1aSopenharmony_ci;*
20cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
21cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
22cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23cabdff1aSopenharmony_ci;******************************************************************************
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
26cabdff1aSopenharmony_ci
; Read-only constants for the 10-bit qpel routines.
27cabdff1aSopenharmony_ciSECTION_RODATA 32
28cabdff1aSopenharmony_ci
; Constants defined outside this file.
29cabdff1aSopenharmony_cicextern pd_65535
30cabdff1aSopenharmony_cicextern pw_1023
; Upper clip bound passed to CLIPW: 1023 is the largest 10-bit sample value.
31cabdff1aSopenharmony_ci%define pw_pixel_max pw_1023
32cabdff1aSopenharmony_cicextern pw_16
33cabdff1aSopenharmony_cicextern pw_1
34cabdff1aSopenharmony_cicextern pb_0
35cabdff1aSopenharmony_ci
; Bias words/dwords built from multiples of the maximum sample value 1023.
; pad20 is subtracted from the unrounded vertical sums in put_hv*_10 and
; depad is the dword constant H_LOOP adds after its pmaddwd steps.
; NOTE(review): depad looks like pad20 scaled by the tap sum 32 plus a
; rounding term of 512 -- confirm against the full H_LOOP body.
36cabdff1aSopenharmony_cipad10: times 8 dw 10*1023
37cabdff1aSopenharmony_cipad20: times 8 dw 20*1023
38cabdff1aSopenharmony_cipad30: times 8 dw 30*1023
39cabdff1aSopenharmony_cidepad: times 4 dd 32*20*1023 + 512
40cabdff1aSopenharmony_cidepad2: times 8 dw 20*1023 + 16*1022 + 16
41cabdff1aSopenharmony_ciunpad: times 8 dw 16*1022/32 ; needs to be mod 16
42cabdff1aSopenharmony_ci
; Six-tap kernel (1, -5, 20, 20, -5, 1) split into word pairs; H_LOOP uses
; them as pmaddwd multiplicands (s1/s2/s3).
43cabdff1aSopenharmony_citap1: times 4 dw  1, -5
44cabdff1aSopenharmony_citap2: times 4 dw 20, 20
45cabdff1aSopenharmony_citap3: times 4 dw -5,  1
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ciSECTION .text
48cabdff1aSopenharmony_ci
49cabdff1aSopenharmony_ci
; AVG_MOV dst, reg
; Store operation selected as OP_MOV by the "avg" variants (the "put"
; variants use plain mova). Averages register %2 word-wise with the current
; contents of %1 (pavgw) and writes the result back to %1.
; %2 is overwritten with the averaged value.
50cabdff1aSopenharmony_ci%macro AVG_MOV 2
51cabdff1aSopenharmony_ci    pavgw %2, %1
52cabdff1aSopenharmony_ci    mova  %1, %2
53cabdff1aSopenharmony_ci%endmacro
54cabdff1aSopenharmony_ci
; ADDW acc, mem, tmp
; Word-wise %1 += %2, where %2 is a memory operand.
; With 8-byte registers the memory operand is added directly and %3 is not
; touched. With wider registers %2 is first loaded into the scratch register
; %3 with an unaligned load (callers pass addresses such as [r1+6]) and the
; register is added instead; %3 is clobbered in that case.
55cabdff1aSopenharmony_ci%macro ADDW 3
56cabdff1aSopenharmony_ci%if mmsize == 8
57cabdff1aSopenharmony_ci    paddw %1, %2
58cabdff1aSopenharmony_ci%else
59cabdff1aSopenharmony_ci    movu  %3, %2
60cabdff1aSopenharmony_ci    paddw %1, %3
61cabdff1aSopenharmony_ci%endif
62cabdff1aSopenharmony_ci%endmacro
63cabdff1aSopenharmony_ci
; FILT_H a, b, c, round
; Six-tap filter core operating on pre-summed tap pairs:
;   %1 = outer pair sum (a), %2 = middle pair sum (b), %3 = inner pair sum (c),
;   %4 = rounding constant added to a before anything else.
; On exit %1 holds the (a - 5*b + 20*c)/16 value described by the inline
; comments, computed with adds and two arithmetic shifts by 2 instead of
; multiplies. %2 and %3 are only read.
; The inline comments write "a" for the sum a + round.
; FILT_V, MC20 and MC10 follow this macro with "psraw x, 1".
64cabdff1aSopenharmony_ci%macro FILT_H 4
65cabdff1aSopenharmony_ci    paddw  %1, %4
66cabdff1aSopenharmony_ci    psubw  %1, %2  ; a-b
67cabdff1aSopenharmony_ci    psraw  %1, 2   ; (a-b)/4
68cabdff1aSopenharmony_ci    psubw  %1, %2  ; (a-b)/4-b
69cabdff1aSopenharmony_ci    paddw  %1, %3  ; (a-b)/4-b+c
70cabdff1aSopenharmony_ci    psraw  %1, 2   ; ((a-b)/4-b+c)/4
71cabdff1aSopenharmony_ci    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
72cabdff1aSopenharmony_ci%endmacro
73cabdff1aSopenharmony_ci
; PRELOAD_V
; Loads the five rows surrounding the current source row for the vertical
; filter. In: r1 = source pointer, r2 = stride in bytes.
; Out: m0..m4 = rows at r1-2*r2, r1-r2, r1, r1+r2, r1+2*r2 (relative to the
;      incoming r1), r3 = 3*r2, and r1 advanced by 3*r2 so that it points at
;      the next row to be loaded.
74cabdff1aSopenharmony_ci%macro PRELOAD_V 0
75cabdff1aSopenharmony_ci    lea      r3, [r2*3]
76cabdff1aSopenharmony_ci    sub      r1, r3
77cabdff1aSopenharmony_ci    movu     m0, [r1+r2]
78cabdff1aSopenharmony_ci    movu     m1, [r1+r2*2]
79cabdff1aSopenharmony_ci    add      r1, r3
80cabdff1aSopenharmony_ci    movu     m2, [r1]
81cabdff1aSopenharmony_ci    movu     m3, [r1+r2]
82cabdff1aSopenharmony_ci    movu     m4, [r1+r2*2]
83cabdff1aSopenharmony_ci    add      r1, r3
84cabdff1aSopenharmony_ci%endmacro
85cabdff1aSopenharmony_ci
; FILT_V r0, r1, r2, r3, r4, new, tmp1, tmp2
; One output row of the vertical six-tap filter.
;   %1..%5 = the five rows already in registers (oldest first),
;   %6     = receives the sixth row, loaded from [r1],
;   %7, %8 = scratch (clobbered with the middle and inner pair sums).
; The pair sums %1+%6, %2+%5 and %3+%4 are fed to FILT_H with pw_16 as the
; rounding constant, the result is shifted right once more and then passed
; to CLIPW with pb_0 and pw_pixel_max as bounds. The filtered row ends up
; in %1; %2..%5 are left unchanged. r1 is not advanced here.
86cabdff1aSopenharmony_ci%macro FILT_V 8
87cabdff1aSopenharmony_ci    movu     %6, [r1]
88cabdff1aSopenharmony_ci    paddw    %1, %6
89cabdff1aSopenharmony_ci    mova     %7, %2
90cabdff1aSopenharmony_ci    paddw    %7, %5
91cabdff1aSopenharmony_ci    mova     %8, %3
92cabdff1aSopenharmony_ci    paddw    %8, %4
93cabdff1aSopenharmony_ci    FILT_H   %1, %7, %8, [pw_16]
94cabdff1aSopenharmony_ci    psraw    %1, 1
95cabdff1aSopenharmony_ci    CLIPW    %1, [pb_0], [pw_pixel_max]
96cabdff1aSopenharmony_ci%endmacro
97cabdff1aSopenharmony_ci
; MC generator
; Instantiates the function-generating macro %1 four times:
;   put, size 4 (MMX/mmxext)   put, size 8 (XMM/sse2)
;   avg, size 4 (MMX/mmxext)   avg, size 8 (XMM/sse2)
; OP_MOV is defined as mova for the put variants and AVG_MOV for the avg
; variants before the corresponding instantiations.
98cabdff1aSopenharmony_ci%macro MC 1
99cabdff1aSopenharmony_ci%define OP_MOV mova
100cabdff1aSopenharmony_ciINIT_MMX mmxext
101cabdff1aSopenharmony_ci%1 put, 4
102cabdff1aSopenharmony_ciINIT_XMM sse2
103cabdff1aSopenharmony_ci%1 put, 8
104cabdff1aSopenharmony_ci
105cabdff1aSopenharmony_ci%define OP_MOV AVG_MOV
106cabdff1aSopenharmony_ciINIT_MMX mmxext
107cabdff1aSopenharmony_ci%1 avg, 4
108cabdff1aSopenharmony_ciINIT_XMM sse2
109cabdff1aSopenharmony_ci%1 avg, 8
110cabdff1aSopenharmony_ci%endmacro
111cabdff1aSopenharmony_ci
; MCAxA_OP put/avg, mc-name, size, 2*size, cglobal-arg4, cglobal-arg5, cglobal-arg6
; Emits the double-size function %1_h264_qpel%4_%2_10 by running the
; size-%3 stub (stub_%1_h264_qpel%3_%2_10) on the four quadrants of the
; block: top-left, top-right (+%3*2 bytes, i.e. %3 two-byte samples),
; bottom-left (+%3 rows) and bottom-right. r0 and r1 are the two pointers
; that get offset; r2 is used as the row stride and is expected to be
; unchanged after each stub call.
;
; x86-32: the original r0/r1 are re-read from their argument slots
;         (r0m/r1m) before each quadrant.
; x86-64: the original r0/r1 are kept in two extra GPRs, r<%6> and r<%6+1>
;         (hence "%6 + 2" registers are requested).
; On UNIX64 the last quadrant is not called: execution falls through into
; the code that follows the macro, so the invoker must place the size-%3
; function directly after it (cglobal_mc does so).
; Side effect: redefines the assembler variable p1 on x86-64.
112cabdff1aSopenharmony_ci%macro MCAxA_OP 7
113cabdff1aSopenharmony_ci%if ARCH_X86_32
114cabdff1aSopenharmony_cicglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    ; top-left quadrant
115cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    ; top-right quadrant
116cabdff1aSopenharmony_ci    mov  r0, r0m
117cabdff1aSopenharmony_ci    mov  r1, r1m
118cabdff1aSopenharmony_ci    add  r0, %3*2
119cabdff1aSopenharmony_ci    add  r1, %3*2
120cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    ; bottom-left quadrant
121cabdff1aSopenharmony_ci    mov  r0, r0m
122cabdff1aSopenharmony_ci    mov  r1, r1m
123cabdff1aSopenharmony_ci    lea  r0, [r0+r2*%3]
124cabdff1aSopenharmony_ci    lea  r1, [r1+r2*%3]
125cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    ; bottom-right quadrant
126cabdff1aSopenharmony_ci    mov  r0, r0m
127cabdff1aSopenharmony_ci    mov  r1, r1m
128cabdff1aSopenharmony_ci    lea  r0, [r0+r2*%3+%3*2]
129cabdff1aSopenharmony_ci    lea  r1, [r1+r2*%3+%3*2]
130cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
131cabdff1aSopenharmony_ci    RET
132cabdff1aSopenharmony_ci%else ; ARCH_X86_64
133cabdff1aSopenharmony_cicglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    ; keep the original pointers in the two extra registers
134cabdff1aSopenharmony_ci    mov r%6, r0
135cabdff1aSopenharmony_ci%assign p1 %6+1
136cabdff1aSopenharmony_ci    mov r %+ p1, r1
    ; top-left quadrant
137cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    ; top-right quadrant
138cabdff1aSopenharmony_ci    lea  r0, [r%6+%3*2]
139cabdff1aSopenharmony_ci    lea  r1, [r %+ p1+%3*2]
140cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    ; bottom-left quadrant
141cabdff1aSopenharmony_ci    lea  r0, [r%6+r2*%3]
142cabdff1aSopenharmony_ci    lea  r1, [r %+ p1+r2*%3]
143cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    ; bottom-right quadrant: called here, or reached by fall-through on UNIX64
144cabdff1aSopenharmony_ci    lea  r0, [r%6+r2*%3+%3*2]
145cabdff1aSopenharmony_ci    lea  r1, [r %+ p1+r2*%3+%3*2]
146cabdff1aSopenharmony_ci%if UNIX64 == 0 ; fall through to function
147cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
148cabdff1aSopenharmony_ci    RET
149cabdff1aSopenharmony_ci%endif
150cabdff1aSopenharmony_ci%endif
151cabdff1aSopenharmony_ci%endmacro
152cabdff1aSopenharmony_ci
153cabdff1aSopenharmony_ci;put/avg, mc name, size (4/8), then the three arguments forwarded to cglobal
; cglobal_mc declares one qpel function and opens its "stub" body.
; Emitted, in this order:
;   1. when the current instruction set has sse2: the double-size wrapper
;      from MCAxA_OP (size %3*2),
;   2. the public size-%3 function %1_h264_qpel%3_%2_10. On non-UNIX64
;      targets it is a thin wrapper that calls the stub and returns through
;      RET; on UNIX64 it has no code of its own and runs straight into the
;      stub,
;   3. the label stub_%1_h264_qpel%3_%2_10<SUFFIX>.
; The instructions written after the macro invocation form the stub body;
; because the stub is entered by call (or by fall-through), it has to leave
; with a plain return instruction.
; Side effect: overwrites the assembler variable i with %3*2.
154cabdff1aSopenharmony_ci%macro cglobal_mc 6
155cabdff1aSopenharmony_ci%assign i %3*2
156cabdff1aSopenharmony_ci%if cpuflag(sse2)
157cabdff1aSopenharmony_ciMCAxA_OP %1, %2, %3, i, %4,%5,%6
158cabdff1aSopenharmony_ci%endif
159cabdff1aSopenharmony_ci
160cabdff1aSopenharmony_cicglobal %1_h264_qpel%3_%2_10, %4,%5,%6
161cabdff1aSopenharmony_ci%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
162cabdff1aSopenharmony_ci    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
163cabdff1aSopenharmony_ci    RET
164cabdff1aSopenharmony_ci%endif
165cabdff1aSopenharmony_ci
166cabdff1aSopenharmony_cistub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
167cabdff1aSopenharmony_ci%endmacro
168cabdff1aSopenharmony_ci
169cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
170cabdff1aSopenharmony_ci; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
171cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; COPY4
; Transfers four rows of one register width each from r1 to r0 through
; OP_MOV (a plain store for put, an average with the destination for avg).
; In: r0 = destination, r1 = source, r2 = stride, r3 = 3*stride (set up by
; the caller). Clobbers m0; the pointers are not advanced.
; Source rows are read with unaligned loads; the destination is accessed
; with mova.
172cabdff1aSopenharmony_ci%macro COPY4 0
173cabdff1aSopenharmony_ci    movu          m0, [r1     ]
174cabdff1aSopenharmony_ci    OP_MOV [r0     ], m0
175cabdff1aSopenharmony_ci    movu          m0, [r1+r2  ]
176cabdff1aSopenharmony_ci    OP_MOV [r0+r2  ], m0
177cabdff1aSopenharmony_ci    movu          m0, [r1+r2*2]
178cabdff1aSopenharmony_ci    OP_MOV [r0+r2*2], m0
179cabdff1aSopenharmony_ci    movu          m0, [r1+r3  ]
180cabdff1aSopenharmony_ci    OP_MOV [r0+r3  ], m0
181cabdff1aSopenharmony_ci%endmacro
182cabdff1aSopenharmony_ci
; MC00 put/avg
; Full-pel copy (or average, depending on OP_MOV) for three block sizes.
; In every function r0 = destination, r1 = source, r2 = stride in bytes.
;   size 4  (mmxext): declared through cglobal_mc, so the body is a stub and
;                     ends in a plain ret; one COPY4 covers the four rows.
;   size 8  (sse2):   two COPY4 groups of four rows each.
;   size 16 (sse2):   loop of 8 iterations, each handling two rows of two
;                     16-byte registers (32 bytes per row).
183cabdff1aSopenharmony_ci%macro MC00 1
184cabdff1aSopenharmony_ciINIT_MMX mmxext
185cabdff1aSopenharmony_cicglobal_mc %1, mc00, 4, 3,4,0
186cabdff1aSopenharmony_ci    lea           r3, [r2*3]
187cabdff1aSopenharmony_ci    COPY4
188cabdff1aSopenharmony_ci    ret
189cabdff1aSopenharmony_ci
190cabdff1aSopenharmony_ciINIT_XMM sse2
191cabdff1aSopenharmony_cicglobal %1_h264_qpel8_mc00_10, 3,4
192cabdff1aSopenharmony_ci    lea  r3, [r2*3]
193cabdff1aSopenharmony_ci    COPY4
    ; move both pointers down four rows for the second half
194cabdff1aSopenharmony_ci    lea  r0, [r0+r2*4]
195cabdff1aSopenharmony_ci    lea  r1, [r1+r2*4]
196cabdff1aSopenharmony_ci    COPY4
197cabdff1aSopenharmony_ci    RET
198cabdff1aSopenharmony_ci
199cabdff1aSopenharmony_cicglobal %1_h264_qpel16_mc00_10, 3,4
    ; r3d counts row pairs
200cabdff1aSopenharmony_ci    mov r3d, 8
201cabdff1aSopenharmony_ci.loop:
202cabdff1aSopenharmony_ci    movu           m0, [r1      ]
203cabdff1aSopenharmony_ci    movu           m1, [r1   +16]
204cabdff1aSopenharmony_ci    OP_MOV [r0      ], m0
205cabdff1aSopenharmony_ci    OP_MOV [r0   +16], m1
206cabdff1aSopenharmony_ci    movu           m0, [r1+r2   ]
207cabdff1aSopenharmony_ci    movu           m1, [r1+r2+16]
208cabdff1aSopenharmony_ci    OP_MOV [r0+r2   ], m0
209cabdff1aSopenharmony_ci    OP_MOV [r0+r2+16], m1
210cabdff1aSopenharmony_ci    lea            r0, [r0+r2*2]
211cabdff1aSopenharmony_ci    lea            r1, [r1+r2*2]
212cabdff1aSopenharmony_ci    dec r3d
213cabdff1aSopenharmony_ci    jg .loop
214cabdff1aSopenharmony_ci    REP_RET
215cabdff1aSopenharmony_ci%endmacro
216cabdff1aSopenharmony_ci
; Instantiate the put variants (plain store) and the avg variants
; (average with the destination).
217cabdff1aSopenharmony_ci%define OP_MOV mova
218cabdff1aSopenharmony_ciMC00 put
219cabdff1aSopenharmony_ci
220cabdff1aSopenharmony_ci%define OP_MOV AVG_MOV
221cabdff1aSopenharmony_ciMC00 avg
222cabdff1aSopenharmony_ci
223cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
224cabdff1aSopenharmony_ci; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
225cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC_CACHE generator
; Like MC, but additionally instantiates cache64-flagged variants of the
; size-8 function. For each of put (OP_MOV = mova) and avg (OP_MOV =
; AVG_MOV) the macro %1 is expanded for:
;   size 4, MMX/mmxext
;   size 8, XMM/sse2  with cache64
;   size 8, XMM/ssse3 with cache64
;   size 8, XMM/sse2
226cabdff1aSopenharmony_ci%macro MC_CACHE 1
227cabdff1aSopenharmony_ci%define OP_MOV mova
228cabdff1aSopenharmony_ciINIT_MMX mmxext
229cabdff1aSopenharmony_ci%1 put, 4
230cabdff1aSopenharmony_ciINIT_XMM sse2, cache64
231cabdff1aSopenharmony_ci%1 put, 8
232cabdff1aSopenharmony_ciINIT_XMM ssse3, cache64
233cabdff1aSopenharmony_ci%1 put, 8
234cabdff1aSopenharmony_ciINIT_XMM sse2
235cabdff1aSopenharmony_ci%1 put, 8
236cabdff1aSopenharmony_ci
237cabdff1aSopenharmony_ci%define OP_MOV AVG_MOV
238cabdff1aSopenharmony_ciINIT_MMX mmxext
239cabdff1aSopenharmony_ci%1 avg, 4
240cabdff1aSopenharmony_ciINIT_XMM sse2, cache64
241cabdff1aSopenharmony_ci%1 avg, 8
242cabdff1aSopenharmony_ciINIT_XMM ssse3, cache64
243cabdff1aSopenharmony_ci%1 avg, 8
244cabdff1aSopenharmony_ciINIT_XMM sse2
245cabdff1aSopenharmony_ci%1 avg, 8
246cabdff1aSopenharmony_ci%endmacro
247cabdff1aSopenharmony_ci
; MC20 put/avg, size
; Horizontal half-sample filter. Stub body: r0 = destination, r1 = source,
; r2 = stride in bytes, r3d = remaining rows (starts at %2).
; Register use: m1 = pw_pixel_max (upper clip bound), m0 = zero (lower
; bound, also scratch in the 16-byte path), p16 = rounding constant for
; FILT_H (kept in m8 when more than eight vector registers exist, otherwise
; a memory operand).
; Per row the three tap-pair sums are built from samples at byte offsets
;   m2 = [r1-4] + [r1+6]   (outer pair)
;   m3 = [r1-2] + [r1+4]   (middle pair)
;   m4 = [r1+0] + [r1+2]   (inner pair)
; then FILT_H, a further shift right by one, a clip to [0, pixel max] and
; OP_MOV to [r0].
; NOTE(review): "%0" is the number of arguments given to this macro, which
; is 2 for every invocation made through MC_CACHE, so the "%if %0 == 4"
; branch is never assembled and the PALIGNR-based paths are always used --
; confirm whether a test on the size argument was intended.
248cabdff1aSopenharmony_ci%macro MC20 2
249cabdff1aSopenharmony_cicglobal_mc %1, mc20, %2, 3,4,9
250cabdff1aSopenharmony_ci    mov     r3d, %2
251cabdff1aSopenharmony_ci    mova     m1, [pw_pixel_max]
252cabdff1aSopenharmony_ci%if num_mmregs > 8
253cabdff1aSopenharmony_ci    mova     m8, [pw_16]
254cabdff1aSopenharmony_ci    %define p16 m8
255cabdff1aSopenharmony_ci%else
256cabdff1aSopenharmony_ci    %define p16 [pw_16]
257cabdff1aSopenharmony_ci%endif
258cabdff1aSopenharmony_ci.nextrow:
259cabdff1aSopenharmony_ci%if %0 == 4
    ; six separate loads, paired up with ADDW
260cabdff1aSopenharmony_ci    movu     m2, [r1-4]
261cabdff1aSopenharmony_ci    movu     m3, [r1-2]
262cabdff1aSopenharmony_ci    movu     m4, [r1+0]
263cabdff1aSopenharmony_ci    ADDW     m2, [r1+6], m5
264cabdff1aSopenharmony_ci    ADDW     m3, [r1+4], m5
265cabdff1aSopenharmony_ci    ADDW     m4, [r1+2], m5
266cabdff1aSopenharmony_ci%else ; movu is slow on these processors
267cabdff1aSopenharmony_ci%if mmsize==16
    ; two loads; the shifted rows in between are rebuilt with PALIGNR
268cabdff1aSopenharmony_ci    movu     m2, [r1-4]
269cabdff1aSopenharmony_ci    movu     m0, [r1+6]
270cabdff1aSopenharmony_ci    mova     m6, m0
271cabdff1aSopenharmony_ci    psrldq   m0, 6
272cabdff1aSopenharmony_ci
    ; m6 = outer pair, m3 = middle pair, m4 = inner pair
273cabdff1aSopenharmony_ci    paddw    m6, m2
274cabdff1aSopenharmony_ci    PALIGNR  m3, m0, m2, 2, m5
275cabdff1aSopenharmony_ci    PALIGNR  m7, m0, m2, 8, m5
276cabdff1aSopenharmony_ci    paddw    m3, m7
277cabdff1aSopenharmony_ci    PALIGNR  m4, m0, m2, 4, m5
278cabdff1aSopenharmony_ci    PALIGNR  m7, m0, m2, 6, m5
279cabdff1aSopenharmony_ci    paddw    m4, m7
    ; rename so that the outer pair sum is called m2 from here on
280cabdff1aSopenharmony_ci    SWAP      2, 6
281cabdff1aSopenharmony_ci%else
    ; 8-byte registers: two loads plus PALIGNR, outer pair added from memory
282cabdff1aSopenharmony_ci    movu     m2, [r1-4]
283cabdff1aSopenharmony_ci    movu     m6, [r1+4]
284cabdff1aSopenharmony_ci    PALIGNR  m3, m6, m2, 2, m5
285cabdff1aSopenharmony_ci    paddw    m3, m6
286cabdff1aSopenharmony_ci    PALIGNR  m4, m6, m2, 4, m5
287cabdff1aSopenharmony_ci    PALIGNR  m7, m6, m2, 6, m5
288cabdff1aSopenharmony_ci    paddw    m4, m7
289cabdff1aSopenharmony_ci    paddw    m2, [r1+6]
290cabdff1aSopenharmony_ci%endif
291cabdff1aSopenharmony_ci%endif
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci    FILT_H   m2, m3, m4, p16
294cabdff1aSopenharmony_ci    psraw    m2, 1
295cabdff1aSopenharmony_ci    pxor     m0, m0
296cabdff1aSopenharmony_ci    CLIPW    m2, m0, m1
297cabdff1aSopenharmony_ci    OP_MOV [r0], m2
298cabdff1aSopenharmony_ci    add      r0, r2
299cabdff1aSopenharmony_ci    add      r1, r2
300cabdff1aSopenharmony_ci    dec     r3d
301cabdff1aSopenharmony_ci    jg .nextrow
302cabdff1aSopenharmony_ci    rep ret
303cabdff1aSopenharmony_ci%endmacro
304cabdff1aSopenharmony_ci
305cabdff1aSopenharmony_ciMC_CACHE MC20
306cabdff1aSopenharmony_ci
307cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
308cabdff1aSopenharmony_ci; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
309cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC30 put/avg, size
; Same computation as mc10, but the sample row that gets averaged with the
; horizontal filter output starts 2 bytes (one sample) to the right of the
; source: r4 = r1+2, then control is transferred to the .body label of the
; matching mc10 stub, which is emitted by the MC10 macro.
310cabdff1aSopenharmony_ci%macro MC30 2
311cabdff1aSopenharmony_cicglobal_mc %1, mc30, %2, 3,5,9
312cabdff1aSopenharmony_ci    lea r4, [r1+2]
313cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
314cabdff1aSopenharmony_ci%endmacro
315cabdff1aSopenharmony_ci
316cabdff1aSopenharmony_ciMC_CACHE MC30
317cabdff1aSopenharmony_ci
318cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
319cabdff1aSopenharmony_ci; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
320cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC10 put/avg, size
; Horizontal quarter-sample position: the horizontal half-sample filter
; output is averaged with an unfiltered sample row read through r4.
; Stub body: r0 = destination, r1 = source for the filter, r2 = stride,
; r3d = remaining rows, r4 = row to average with (r1 itself when entered at
; the top; the mc30 stub enters at .body with r4 = r1+2).
; Register use: m1 = pw_pixel_max, m0 = zero / scratch, p16 = rounding
; constant (m8 when more than eight vector registers exist).
; The tap-pair sums are formed as
;   m2 = [r1-4] + [r1+6],  m3 = [r1-2] + [r1+4],  m4 = [r1+0] + [r1+2].
; NOTE(review): "%0" is the number of arguments given to this macro, which
; is 2 for every invocation made through MC_CACHE, so the "%if %0 == 4"
; branch is never assembled -- confirm whether a test on the size argument
; was intended.
321cabdff1aSopenharmony_ci%macro MC10 2
322cabdff1aSopenharmony_cicglobal_mc %1, mc10, %2, 3,5,9
323cabdff1aSopenharmony_ci    mov      r4, r1
324cabdff1aSopenharmony_ci.body:
325cabdff1aSopenharmony_ci    mov     r3d, %2
326cabdff1aSopenharmony_ci    mova     m1, [pw_pixel_max]
327cabdff1aSopenharmony_ci%if num_mmregs > 8
328cabdff1aSopenharmony_ci    mova     m8, [pw_16]
329cabdff1aSopenharmony_ci    %define p16 m8
330cabdff1aSopenharmony_ci%else
331cabdff1aSopenharmony_ci    %define p16 [pw_16]
332cabdff1aSopenharmony_ci%endif
333cabdff1aSopenharmony_ci.nextrow:
334cabdff1aSopenharmony_ci%if %0 == 4
    ; six separate loads, paired up with ADDW
335cabdff1aSopenharmony_ci    movu     m2, [r1-4]
336cabdff1aSopenharmony_ci    movu     m3, [r1-2]
337cabdff1aSopenharmony_ci    movu     m4, [r1+0]
338cabdff1aSopenharmony_ci    ADDW     m2, [r1+6], m5
339cabdff1aSopenharmony_ci    ADDW     m3, [r1+4], m5
340cabdff1aSopenharmony_ci    ADDW     m4, [r1+2], m5
341cabdff1aSopenharmony_ci%else ; movu is slow on these processors
342cabdff1aSopenharmony_ci%if mmsize==16
    ; two loads; the shifted rows in between are rebuilt with PALIGNR
343cabdff1aSopenharmony_ci    movu     m2, [r1-4]
344cabdff1aSopenharmony_ci    movu     m0, [r1+6]
345cabdff1aSopenharmony_ci    mova     m6, m0
346cabdff1aSopenharmony_ci    psrldq   m0, 6
347cabdff1aSopenharmony_ci
    ; m6 = outer pair, m3 = middle pair, m4 = inner pair
348cabdff1aSopenharmony_ci    paddw    m6, m2
349cabdff1aSopenharmony_ci    PALIGNR  m3, m0, m2, 2, m5
350cabdff1aSopenharmony_ci    PALIGNR  m7, m0, m2, 8, m5
351cabdff1aSopenharmony_ci    paddw    m3, m7
352cabdff1aSopenharmony_ci    PALIGNR  m4, m0, m2, 4, m5
353cabdff1aSopenharmony_ci    PALIGNR  m7, m0, m2, 6, m5
354cabdff1aSopenharmony_ci    paddw    m4, m7
    ; rename so that the outer pair sum is called m2 from here on
355cabdff1aSopenharmony_ci    SWAP      2, 6
356cabdff1aSopenharmony_ci%else
    ; 8-byte registers: two loads plus PALIGNR, outer pair added from memory
357cabdff1aSopenharmony_ci    movu     m2, [r1-4]
358cabdff1aSopenharmony_ci    movu     m6, [r1+4]
359cabdff1aSopenharmony_ci    PALIGNR  m3, m6, m2, 2, m5
360cabdff1aSopenharmony_ci    paddw    m3, m6
361cabdff1aSopenharmony_ci    PALIGNR  m4, m6, m2, 4, m5
362cabdff1aSopenharmony_ci    PALIGNR  m7, m6, m2, 6, m5
363cabdff1aSopenharmony_ci    paddw    m4, m7
364cabdff1aSopenharmony_ci    paddw    m2, [r1+6]
365cabdff1aSopenharmony_ci%endif
366cabdff1aSopenharmony_ci%endif
367cabdff1aSopenharmony_ci
368cabdff1aSopenharmony_ci    FILT_H   m2, m3, m4, p16
369cabdff1aSopenharmony_ci    psraw    m2, 1
370cabdff1aSopenharmony_ci    pxor     m0, m0
371cabdff1aSopenharmony_ci    CLIPW    m2, m0, m1
    ; average the filtered row with the unfiltered row at r4
372cabdff1aSopenharmony_ci    movu     m3, [r4]
373cabdff1aSopenharmony_ci    pavgw    m2, m3
374cabdff1aSopenharmony_ci    OP_MOV [r0], m2
375cabdff1aSopenharmony_ci    add      r0, r2
376cabdff1aSopenharmony_ci    add      r1, r2
377cabdff1aSopenharmony_ci    add      r4, r2
378cabdff1aSopenharmony_ci    dec     r3d
379cabdff1aSopenharmony_ci    jg .nextrow
380cabdff1aSopenharmony_ci    rep ret
381cabdff1aSopenharmony_ci%endmacro
382cabdff1aSopenharmony_ci
383cabdff1aSopenharmony_ciMC_CACHE MC10
384cabdff1aSopenharmony_ci
385cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
386cabdff1aSopenharmony_ci; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
387cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; V_FILT (8 ignored arguments), size, index
; Emits the shared subroutine v_filt<size>_<index>_10, which produces one
; row of the vertical filter with FILT_V and then advances r1 and r0 by the
; stride r2. Two entry points:
;   v_filt<size>_<index>_10            also advances r4 by r2 first,
;   v_filt<size>_<index>_10.no_addr4   leaves r4 alone.
; Only the ninth and tenth macro arguments are used (for the label name);
; FILT_V is always invoked on m0..m7 under whatever register permutation is
; active where the macro is expanded.
388cabdff1aSopenharmony_ci%macro V_FILT 10
389cabdff1aSopenharmony_civ_filt%9_%10_10:
390cabdff1aSopenharmony_ci    add    r4, r2
391cabdff1aSopenharmony_ci.no_addr4:
392cabdff1aSopenharmony_ci    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
393cabdff1aSopenharmony_ci    add    r1, r2
394cabdff1aSopenharmony_ci    add    r0, r2
395cabdff1aSopenharmony_ci    ret
396cabdff1aSopenharmony_ci%endmacro
397cabdff1aSopenharmony_ci
; MMX copies, indices 0..3. The SWAP after each copy rotates the names
; m0..m5, so copy number k is assembled for the register naming that is in
; effect after k rotations -- matching the SWAP the callers perform after
; each row.
398cabdff1aSopenharmony_ciINIT_MMX mmxext
399cabdff1aSopenharmony_ciRESET_MM_PERMUTATION
400cabdff1aSopenharmony_ci%assign i 0
401cabdff1aSopenharmony_ci%rep 4
402cabdff1aSopenharmony_ciV_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
403cabdff1aSopenharmony_ciSWAP 0,1,2,3,4,5
404cabdff1aSopenharmony_ci%assign i i+1
405cabdff1aSopenharmony_ci%endrep
406cabdff1aSopenharmony_ci
; SSE2 copies, indices 0..5 (one per rotation state of the six names).
407cabdff1aSopenharmony_ciINIT_XMM sse2
408cabdff1aSopenharmony_ciRESET_MM_PERMUTATION
409cabdff1aSopenharmony_ci%assign i 0
410cabdff1aSopenharmony_ci%rep 6
411cabdff1aSopenharmony_ciV_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
412cabdff1aSopenharmony_ciSWAP 0,1,2,3,4,5
413cabdff1aSopenharmony_ci%assign i i+1
414cabdff1aSopenharmony_ci%endrep
415cabdff1aSopenharmony_ci
; MC02 put/avg, size
; Vertical half-sample filter. Stub body: r0 = destination, r1 = source,
; r2 = stride. PRELOAD_V fetches the first five rows; the loop is unrolled
; once per output row and calls the .no_addr4 entry of the v_filt copy
; whose index is the row number modulo 6, stores m0 with OP_MOV and rotates
; the register names so the next row reuses the rows already loaded.
; r0 is moved back one row up front because v_filt advances r0 before
; returning, i.e. before the store that follows the call.
416cabdff1aSopenharmony_ci%macro MC02 2
417cabdff1aSopenharmony_cicglobal_mc %1, mc02, %2, 3,4,8
418cabdff1aSopenharmony_ci    PRELOAD_V
419cabdff1aSopenharmony_ci
420cabdff1aSopenharmony_ci    sub      r0, r2
421cabdff1aSopenharmony_ci%assign j 0
422cabdff1aSopenharmony_ci%rep %2
423cabdff1aSopenharmony_ci    %assign i (j % 6)
424cabdff1aSopenharmony_ci    call v_filt%2_ %+ i %+ _10.no_addr4
425cabdff1aSopenharmony_ci    OP_MOV [r0], m0
426cabdff1aSopenharmony_ci    SWAP 0,1,2,3,4,5
427cabdff1aSopenharmony_ci    %assign j j+1
428cabdff1aSopenharmony_ci%endrep
429cabdff1aSopenharmony_ci    ret
430cabdff1aSopenharmony_ci%endmacro
431cabdff1aSopenharmony_ci
432cabdff1aSopenharmony_ciMC MC02
433cabdff1aSopenharmony_ci
434cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
435cabdff1aSopenharmony_ci; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
436cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC01 put/avg, size
; Vertical quarter-sample position: each vertically filtered row is
; averaged with an unfiltered row read through r4.
; Stub body: r0 = destination, r1 = source for the filter, r2 = stride,
; r4 = row to average with (r1 itself when entered at the top; the mc03
; stub enters at .body with r4 = r1+r2).
; Both r0 and r4 are moved back one row before the loop because the v_filt
; subroutine (main entry) advances r4 before filtering and r0 before
; returning. m7 is used to load the unfiltered row.
437cabdff1aSopenharmony_ci%macro MC01 2
438cabdff1aSopenharmony_cicglobal_mc %1, mc01, %2, 3,5,8
439cabdff1aSopenharmony_ci    mov      r4, r1
440cabdff1aSopenharmony_ci.body:
441cabdff1aSopenharmony_ci    PRELOAD_V
442cabdff1aSopenharmony_ci
443cabdff1aSopenharmony_ci    sub      r4, r2
444cabdff1aSopenharmony_ci    sub      r0, r2
445cabdff1aSopenharmony_ci%assign j 0
446cabdff1aSopenharmony_ci%rep %2
447cabdff1aSopenharmony_ci    %assign i (j % 6)
448cabdff1aSopenharmony_ci    call v_filt%2_ %+ i %+ _10
449cabdff1aSopenharmony_ci    movu     m7, [r4]
450cabdff1aSopenharmony_ci    pavgw    m0, m7
451cabdff1aSopenharmony_ci    OP_MOV [r0], m0
452cabdff1aSopenharmony_ci    SWAP 0,1,2,3,4,5
453cabdff1aSopenharmony_ci    %assign j j+1
454cabdff1aSopenharmony_ci%endrep
455cabdff1aSopenharmony_ci    ret
456cabdff1aSopenharmony_ci%endmacro
457cabdff1aSopenharmony_ci
458cabdff1aSopenharmony_ciMC MC01
459cabdff1aSopenharmony_ci
460cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
461cabdff1aSopenharmony_ci; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
462cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC03 put/avg, size
; Same computation as mc01, but the unfiltered row averaged with the
; vertical filter output is taken one row below the source: r4 = r1+r2,
; then control is transferred to the .body label of the matching mc01 stub.
463cabdff1aSopenharmony_ci%macro MC03 2
464cabdff1aSopenharmony_cicglobal_mc %1, mc03, %2, 3,5,8
465cabdff1aSopenharmony_ci    lea r4, [r1+r2]
466cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
467cabdff1aSopenharmony_ci%endmacro
468cabdff1aSopenharmony_ci
469cabdff1aSopenharmony_ciMC MC03
470cabdff1aSopenharmony_ci
471cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
472cabdff1aSopenharmony_ci; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
473cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; H_FILT_AVG size, index [, flag]
; Emits the shared subroutine h_filt<size>_<index>_10. It applies the
; horizontal six-tap filter (an inlined copy of the FILT_H sequence with
; pw_16 rounding, a final shift by one and a clip) to the row at r4 and
; averages the result into m0. m5, m6 and m7 are used as work registers.
; Before returning, m5 is reloaded from [r1+r5]; the mc11 body sets r5 to
; minus the stride, which makes this the row v_filt loaded most recently.
; NOTE(review): "%0" is the number of arguments passed (2 or 3), so the
; "%if %0!=4" test is always true and the optional third argument has no
; effect on the generated code -- confirm the intended condition.
474cabdff1aSopenharmony_ci%macro H_FILT_AVG 2-3
475cabdff1aSopenharmony_cih_filt%1_%2_10:
476cabdff1aSopenharmony_ci;FILT_H with fewer registers and averaged with the FILT_V result
477cabdff1aSopenharmony_ci;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
478cabdff1aSopenharmony_ci;unfortunately I need three registers, so m5 will have to be re-read from memory
479cabdff1aSopenharmony_ci    movu     m5, [r4-4]
480cabdff1aSopenharmony_ci    ADDW     m5, [r4+6], m7
481cabdff1aSopenharmony_ci    movu     m6, [r4-2]
482cabdff1aSopenharmony_ci    ADDW     m6, [r4+4], m7
483cabdff1aSopenharmony_ci    paddw    m5, [pw_16]
484cabdff1aSopenharmony_ci    psubw    m5, m6  ; a-b
485cabdff1aSopenharmony_ci    psraw    m5, 2   ; (a-b)/4
486cabdff1aSopenharmony_ci    psubw    m5, m6  ; (a-b)/4-b
487cabdff1aSopenharmony_ci    movu     m6, [r4+0]
488cabdff1aSopenharmony_ci    ADDW     m6, [r4+2], m7
489cabdff1aSopenharmony_ci    paddw    m5, m6  ; (a-b)/4-b+c
490cabdff1aSopenharmony_ci    psraw    m5, 2   ; ((a-b)/4-b+c)/4
491cabdff1aSopenharmony_ci    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
492cabdff1aSopenharmony_ci    psraw    m5, 1
493cabdff1aSopenharmony_ci    CLIPW    m5, [pb_0], [pw_pixel_max]
494cabdff1aSopenharmony_ci;avg FILT_V, FILT_H
495cabdff1aSopenharmony_ci    pavgw    m0, m5
496cabdff1aSopenharmony_ci%if %0!=4
497cabdff1aSopenharmony_ci    movu     m5, [r1+r5]
498cabdff1aSopenharmony_ci%endif
499cabdff1aSopenharmony_ci    ret
500cabdff1aSopenharmony_ci%endmacro
501cabdff1aSopenharmony_ci
; MMX copies, indices 0..3; the register names are rotated between copies
; 0, 1 and 2, and the last copy is expanded with the extra third argument.
502cabdff1aSopenharmony_ciINIT_MMX mmxext
503cabdff1aSopenharmony_ciRESET_MM_PERMUTATION
504cabdff1aSopenharmony_ci%assign i 0
505cabdff1aSopenharmony_ci%rep 3
506cabdff1aSopenharmony_ciH_FILT_AVG 4, i
507cabdff1aSopenharmony_ciSWAP 0,1,2,3,4,5
508cabdff1aSopenharmony_ci%assign i i+1
509cabdff1aSopenharmony_ci%endrep
510cabdff1aSopenharmony_ciH_FILT_AVG 4, i, 0
511cabdff1aSopenharmony_ci
; SSE2 copies, indices 0..5; copy 1 is expanded with the extra third
; argument.
512cabdff1aSopenharmony_ciINIT_XMM sse2
513cabdff1aSopenharmony_ciRESET_MM_PERMUTATION
514cabdff1aSopenharmony_ci%assign i 0
515cabdff1aSopenharmony_ci%rep 6
516cabdff1aSopenharmony_ci%if i==1
517cabdff1aSopenharmony_ciH_FILT_AVG 8, i, 0
518cabdff1aSopenharmony_ci%else
519cabdff1aSopenharmony_ciH_FILT_AVG 8, i
520cabdff1aSopenharmony_ci%endif
521cabdff1aSopenharmony_ciSWAP 0,1,2,3,4,5
522cabdff1aSopenharmony_ci%assign i i+1
523cabdff1aSopenharmony_ci%endrep
524cabdff1aSopenharmony_ci
; MC11 put/avg, size
; Diagonal quarter-sample position: each output row is the average of the
; vertical filter output (column position given by r1) and the horizontal
; filter output (row position given by r4).
; Stub body: r0 = destination, r1 = source for the vertical filter,
; r2 = stride, r4 = source row for the horizontal filter, r5 = -stride
; (used by h_filt to reload the newest vertical row).
; The mc31, mc13 and mc33 stubs enter at .body with r1/r4 offset by one
; sample and/or one row.
; r0 and r4 are moved back one row before the loop because v_filt (main
; entry) advances r4 before filtering and r0 before returning.
525cabdff1aSopenharmony_ci%macro MC11 2
526cabdff1aSopenharmony_ci; this REALLY needs x86_64
527cabdff1aSopenharmony_cicglobal_mc %1, mc11, %2, 3,6,8
528cabdff1aSopenharmony_ci    mov      r4, r1
529cabdff1aSopenharmony_ci.body:
530cabdff1aSopenharmony_ci    PRELOAD_V
531cabdff1aSopenharmony_ci
532cabdff1aSopenharmony_ci    sub      r0, r2
533cabdff1aSopenharmony_ci    sub      r4, r2
534cabdff1aSopenharmony_ci    mov      r5, r2
535cabdff1aSopenharmony_ci    neg      r5
536cabdff1aSopenharmony_ci%assign j 0
537cabdff1aSopenharmony_ci%rep %2
538cabdff1aSopenharmony_ci    %assign i (j % 6)
539cabdff1aSopenharmony_ci    call v_filt%2_ %+ i %+ _10
540cabdff1aSopenharmony_ci    call h_filt%2_ %+ i %+ _10
    ; NOTE(review): h_filt appears to perform this same reload already;
    ; confirm whether the extra load is still required.
541cabdff1aSopenharmony_ci%if %2==8 && i==1
542cabdff1aSopenharmony_ci    movu     m5, [r1+r5]
543cabdff1aSopenharmony_ci%endif
544cabdff1aSopenharmony_ci    OP_MOV [r0], m0
545cabdff1aSopenharmony_ci    SWAP 0,1,2,3,4,5
546cabdff1aSopenharmony_ci    %assign j j+1
547cabdff1aSopenharmony_ci%endrep
548cabdff1aSopenharmony_ci    ret
549cabdff1aSopenharmony_ci%endmacro
550cabdff1aSopenharmony_ci
551cabdff1aSopenharmony_ciMC MC11
552cabdff1aSopenharmony_ci
553cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
554cabdff1aSopenharmony_ci; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
555cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC31 put/avg, size
; Reuses the mc11 body with the vertical filter column shifted one sample
; (2 bytes) to the right: r4 keeps the original source for the horizontal
; filter, r1 is advanced by 2, then control is transferred to the .body
; label of the matching mc11 stub.
556cabdff1aSopenharmony_ci%macro MC31 2
557cabdff1aSopenharmony_cicglobal_mc %1, mc31, %2, 3,6,8
558cabdff1aSopenharmony_ci    mov r4, r1
559cabdff1aSopenharmony_ci    add r1, 2
560cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
561cabdff1aSopenharmony_ci%endmacro
562cabdff1aSopenharmony_ci
563cabdff1aSopenharmony_ciMC MC31
564cabdff1aSopenharmony_ci
565cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
566cabdff1aSopenharmony_ci; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
567cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC13 put/avg, size
; Reuses the mc11 body with the horizontal filter row taken one row below
; the source: r4 = r1+r2, r1 unchanged, then control is transferred to the
; .body label of the matching mc11 stub.
; NOTE(review): this entry point passes 3,7,12 to cglobal_mc while mc11,
; mc31 and mc33, which run the same body, pass 3,6,8 -- confirm whether the
; larger register counts are intentional.
568cabdff1aSopenharmony_ci%macro MC13 2
569cabdff1aSopenharmony_cicglobal_mc %1, mc13, %2, 3,7,12
570cabdff1aSopenharmony_ci    lea r4, [r1+r2]
571cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
572cabdff1aSopenharmony_ci%endmacro
573cabdff1aSopenharmony_ci
574cabdff1aSopenharmony_ciMC MC13
575cabdff1aSopenharmony_ci
576cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
577cabdff1aSopenharmony_ci; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
578cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC33 put/avg, size
; Reuses the mc11 body with both offsets applied: the horizontal filter row
; is one row below the source (r4 = r1+r2, computed before r1 changes) and
; the vertical filter column is one sample to the right (r1 += 2).
579cabdff1aSopenharmony_ci%macro MC33 2
580cabdff1aSopenharmony_cicglobal_mc %1, mc33, %2, 3,6,8
581cabdff1aSopenharmony_ci    lea r4, [r1+r2]
582cabdff1aSopenharmony_ci    add r1, 2
583cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
584cabdff1aSopenharmony_ci%endmacro
585cabdff1aSopenharmony_ci
586cabdff1aSopenharmony_ciMC MC33
587cabdff1aSopenharmony_ci
588cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
589cabdff1aSopenharmony_ci; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
590cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; FILT_H2 a, b, c
; Unscaled six-tap combination of the three tap-pair sums:
;   %1 = %1 - 5*%2 + 20*%3
; with no rounding term and no final shift. Both inputs are destroyed:
; %2 is left holding 4*(b-c) and %3 holding 16*c.
591cabdff1aSopenharmony_ci%macro FILT_H2 3
592cabdff1aSopenharmony_ci    psubw  %1, %2  ; a-b
593cabdff1aSopenharmony_ci    psubw  %2, %3  ; b-c
594cabdff1aSopenharmony_ci    psllw  %2, 2
595cabdff1aSopenharmony_ci    psubw  %1, %2  ; a-5*b+4*c
596cabdff1aSopenharmony_ci    psllw  %3, 4
597cabdff1aSopenharmony_ci    paddw  %1, %3  ; a-5*b+20*c
598cabdff1aSopenharmony_ci%endmacro
599cabdff1aSopenharmony_ci
; FILT_VNRD r0, r1, r2, r3, r4, new, tmp1, tmp2
; Vertical six-tap filter step that keeps the raw sum: same operand layout
; as FILT_V (%1..%5 = rows in registers, %6 = row loaded from [r1],
; %7/%8 = scratch), but the pair sums go through FILT_H2, so %1 receives
; a - 5*b + 20*c without rounding, shifting or clipping.
; r1 is not advanced here.
600cabdff1aSopenharmony_ci%macro FILT_VNRD 8
601cabdff1aSopenharmony_ci    movu     %6, [r1]
602cabdff1aSopenharmony_ci    paddw    %1, %6
603cabdff1aSopenharmony_ci    mova     %7, %2
604cabdff1aSopenharmony_ci    paddw    %7, %5
605cabdff1aSopenharmony_ci    mova     %8, %3
606cabdff1aSopenharmony_ci    paddw    %8, %4
607cabdff1aSopenharmony_ci    FILT_H2  %1, %7, %8
608cabdff1aSopenharmony_ci%endmacro
609cabdff1aSopenharmony_ci
610cabdff1aSopenharmony_ci%macro HV 1
611cabdff1aSopenharmony_ci%if mmsize==16
612cabdff1aSopenharmony_ci%define PAD 12
613cabdff1aSopenharmony_ci%define COUNT 2
614cabdff1aSopenharmony_ci%else
615cabdff1aSopenharmony_ci%define PAD 4
616cabdff1aSopenharmony_ci%define COUNT 3
617cabdff1aSopenharmony_ci%endif
618cabdff1aSopenharmony_ciput_hv%1_10:
619cabdff1aSopenharmony_ci    neg      r2           ; This actually saves instructions
620cabdff1aSopenharmony_ci    lea      r1, [r1+r2*2-mmsize+PAD]
621cabdff1aSopenharmony_ci    lea      r4, [rsp+PAD+gprsize]
622cabdff1aSopenharmony_ci    mov     r3d, COUNT
623cabdff1aSopenharmony_ci.v_loop:
624cabdff1aSopenharmony_ci    movu     m0, [r1]
625cabdff1aSopenharmony_ci    sub      r1, r2
626cabdff1aSopenharmony_ci    movu     m1, [r1]
627cabdff1aSopenharmony_ci    sub      r1, r2
628cabdff1aSopenharmony_ci    movu     m2, [r1]
629cabdff1aSopenharmony_ci    sub      r1, r2
630cabdff1aSopenharmony_ci    movu     m3, [r1]
631cabdff1aSopenharmony_ci    sub      r1, r2
632cabdff1aSopenharmony_ci    movu     m4, [r1]
633cabdff1aSopenharmony_ci    sub      r1, r2
634cabdff1aSopenharmony_ci%assign i 0
635cabdff1aSopenharmony_ci%rep %1-1
636cabdff1aSopenharmony_ci    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
637cabdff1aSopenharmony_ci    psubw    m0, [pad20]
638cabdff1aSopenharmony_ci    movu     [r4+i*mmsize*3], m0
639cabdff1aSopenharmony_ci    sub      r1, r2
640cabdff1aSopenharmony_ci    SWAP 0,1,2,3,4,5
641cabdff1aSopenharmony_ci%assign i i+1
642cabdff1aSopenharmony_ci%endrep
643cabdff1aSopenharmony_ci    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
644cabdff1aSopenharmony_ci    psubw    m0, [pad20]
645cabdff1aSopenharmony_ci    movu     [r4+i*mmsize*3], m0
646cabdff1aSopenharmony_ci    add      r4, mmsize
647cabdff1aSopenharmony_ci    lea      r1, [r1+r2*8+mmsize]
648cabdff1aSopenharmony_ci%if %1==8
649cabdff1aSopenharmony_ci    lea      r1, [r1+r2*4]
650cabdff1aSopenharmony_ci%endif
651cabdff1aSopenharmony_ci    dec      r3d
652cabdff1aSopenharmony_ci    jg .v_loop
653cabdff1aSopenharmony_ci    neg      r2
654cabdff1aSopenharmony_ci    ret
655cabdff1aSopenharmony_ci%endmacro
656cabdff1aSopenharmony_ci
; Instantiate the HV macro twice: once with the MMX register set for
; width 4 and once with the SSE2 register set for width 8.  Going by the
; label pattern put_hv%1_10 inside the macro, this emits put_hv4_10 and
; put_hv8_10, which the MC22/MC12/MC32/MC21 stubs below call.
657cabdff1aSopenharmony_ciINIT_MMX mmxext
658cabdff1aSopenharmony_ciHV 4
659cabdff1aSopenharmony_ciINIT_XMM sse2
660cabdff1aSopenharmony_ciHV 8
661cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; H_LOOP <width>
;   Emits the helper routine h<width>_loop_op: one row of the horizontal
;   pass over the intermediate buffer that put_hv<width>_10 wrote.
;
;   In:   r1  = start of the current row in the intermediate buffer; the
;               callers below point it at the mmsize-aligned rsp
;         m7  = upper clip bound (callers load pw_pixel_max)
;         m0  = zero, only required when num_mmregs > 8 (otherwise it is
;               used as scratch and re-zeroed before the clip)
;         m8..m11 = tap1, tap2, tap3, depad when num_mmregs > 8
;   Out:  m1  = one vector of filtered, clipped 16-bit results
;         r1  advanced by mmsize*3 (the row pitch of the buffer)
;   Clobbers: m2-m6, plus m0 as scratch when num_mmregs <= 8.
;
;   tap1/tap2/tap3/depad are constants defined elsewhere in this file;
;   their values are not visible from this section.
;-----------------------------------------------------------------------------
662cabdff1aSopenharmony_ci%macro H_LOOP 1
; With more than 8 vector registers the constants live in m8-m11 (preloaded
; by the caller); otherwise they are read from memory on every call.
663cabdff1aSopenharmony_ci%if num_mmregs > 8
664cabdff1aSopenharmony_ci    %define s1 m8
665cabdff1aSopenharmony_ci    %define s2 m9
666cabdff1aSopenharmony_ci    %define s3 m10
667cabdff1aSopenharmony_ci    %define d1 m11
668cabdff1aSopenharmony_ci%else
669cabdff1aSopenharmony_ci    %define s1 [tap1]
670cabdff1aSopenharmony_ci    %define s2 [tap2]
671cabdff1aSopenharmony_ci    %define s3 [tap3]
672cabdff1aSopenharmony_ci    %define d1 [depad]
673cabdff1aSopenharmony_ci%endif
674cabdff1aSopenharmony_cih%1_loop_op:
; Six loads at 2-byte steps (-4 .. +6) around r1+mmsize.  Only the +0 load
; is aligned (mova); the others are unaligned by construction.
675cabdff1aSopenharmony_ci    movu       m1, [r1+mmsize-4]
676cabdff1aSopenharmony_ci    movu       m2, [r1+mmsize-2]
677cabdff1aSopenharmony_ci    mova       m3, [r1+mmsize+0]
678cabdff1aSopenharmony_ci    movu       m4, [r1+mmsize+2]
679cabdff1aSopenharmony_ci    movu       m5, [r1+mmsize+4]
680cabdff1aSopenharmony_ci    movu       m6, [r1+mmsize+6]
; pmaddwd multiplies adjacent 16-bit pairs and sums each pair into a 32-bit
; lane.  m1/m3/m5 (offsets -4/0/+4) build one set of 32-bit sums, and
; m2/m4/m6 (offsets -2/+2/+6) build the set shifted by one sample.
; d1 is added once to each set before the shift below.
681cabdff1aSopenharmony_ci%if num_mmregs > 8
682cabdff1aSopenharmony_ci    pmaddwd    m1, s1
683cabdff1aSopenharmony_ci    pmaddwd    m2, s1
684cabdff1aSopenharmony_ci    pmaddwd    m3, s2
685cabdff1aSopenharmony_ci    pmaddwd    m4, s2
686cabdff1aSopenharmony_ci    pmaddwd    m5, s3
687cabdff1aSopenharmony_ci    pmaddwd    m6, s3
688cabdff1aSopenharmony_ci    paddd      m1, d1
689cabdff1aSopenharmony_ci    paddd      m2, d1
690cabdff1aSopenharmony_ci%else
; Same arithmetic, but each constant is first loaded into m0 so it is read
; from memory once per pair of uses.
691cabdff1aSopenharmony_ci    mova       m0, s1
692cabdff1aSopenharmony_ci    pmaddwd    m1, m0
693cabdff1aSopenharmony_ci    pmaddwd    m2, m0
694cabdff1aSopenharmony_ci    mova       m0, s2
695cabdff1aSopenharmony_ci    pmaddwd    m3, m0
696cabdff1aSopenharmony_ci    pmaddwd    m4, m0
697cabdff1aSopenharmony_ci    mova       m0, s3
698cabdff1aSopenharmony_ci    pmaddwd    m5, m0
699cabdff1aSopenharmony_ci    pmaddwd    m6, m0
700cabdff1aSopenharmony_ci    mova       m0, d1
701cabdff1aSopenharmony_ci    paddd      m1, m0
702cabdff1aSopenharmony_ci    paddd      m2, m0
703cabdff1aSopenharmony_ci%endif
; Accumulate the three partial sums of each set into m1 and m2.
704cabdff1aSopenharmony_ci    paddd      m3, m5
705cabdff1aSopenharmony_ci    paddd      m4, m6
706cabdff1aSopenharmony_ci    paddd      m1, m3
707cabdff1aSopenharmony_ci    paddd      m2, m4
; Arithmetic shift right by 10, then interleave the two sets back into
; 16-bit lanes: m1 supplies the low word of each dword, m2 the high word.
; pd_65535 is external; presumably 0x0000FFFF per dword, going by its name.
708cabdff1aSopenharmony_ci    psrad      m1, 10
709cabdff1aSopenharmony_ci    psrad      m2, 10
710cabdff1aSopenharmony_ci    pslld      m2, 16
711cabdff1aSopenharmony_ci    pand       m1, [pd_65535]
712cabdff1aSopenharmony_ci    por        m1, m2
; m0 was used as scratch above in the 8-register build, so re-zero it for
; the clip.
713cabdff1aSopenharmony_ci%if num_mmregs <= 8
714cabdff1aSopenharmony_ci    pxor       m0, m0
715cabdff1aSopenharmony_ci%endif
; CLIPW is defined elsewhere; presumably clamps each word to [m0, m7].
716cabdff1aSopenharmony_ci    CLIPW      m1, m0, m7
717cabdff1aSopenharmony_ci    add        r1, mmsize*3   ; step to the next row of the buffer
718cabdff1aSopenharmony_ci    ret
719cabdff1aSopenharmony_ci%endmacro
720cabdff1aSopenharmony_ci
; Emit h4_loop_op (MMX) and h8_loop_op (SSE2).
721cabdff1aSopenharmony_ciINIT_MMX mmxext
722cabdff1aSopenharmony_ciH_LOOP 4
723cabdff1aSopenharmony_ciINIT_XMM sse2
724cabdff1aSopenharmony_ciH_LOOP 8
725cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; MC22 <op>, <size>
;   Body of the mc22 function: a vertical pass (put_hv<size>_10) into an
;   aligned stack buffer, then a horizontal pass (h<size>_loop_op) over that
;   buffer, one output row per iteration.
;
;   r0 = dst, r1 = src, r2 = stride (argument order as in the mc12/mc32
;   prototypes documented further down this file).
;   r3d = row counter, r6 = saved stack pointer.
;   cglobal_mc, MC and OP_MOV are defined elsewhere in this file;
;   presumably "3,7,12" means 3 arguments, 7 GPRs and 12 vector registers
;   as with x86inc's cglobal -- TODO confirm.
;-----------------------------------------------------------------------------
726cabdff1aSopenharmony_ci%macro MC22 2
727cabdff1aSopenharmony_cicglobal_mc %1, mc22, %2, 3,7,12
728cabdff1aSopenharmony_ci%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
729cabdff1aSopenharmony_ci    mov      r6, rsp          ; backup stack pointer
730cabdff1aSopenharmony_ci    and     rsp, ~(mmsize-1)  ; align stack
731cabdff1aSopenharmony_ci    sub     rsp, PAD
732cabdff1aSopenharmony_ci
; Fill the stack buffer with the intermediate rows.  put_hv negates r2 on
; entry and negates it back before returning, so r2 is the stride again.
733cabdff1aSopenharmony_ci    call put_hv%2_10
734cabdff1aSopenharmony_ci
735cabdff1aSopenharmony_ci    mov       r3d, %2
736cabdff1aSopenharmony_ci    mova       m7, [pw_pixel_max]
; With spare registers, preload the zero vector and the constants that
; h<size>_loop_op expects in m0 and m8-m11.
737cabdff1aSopenharmony_ci%if num_mmregs > 8
738cabdff1aSopenharmony_ci    pxor       m0, m0
739cabdff1aSopenharmony_ci    mova       m8, [tap1]
740cabdff1aSopenharmony_ci    mova       m9, [tap2]
741cabdff1aSopenharmony_ci    mova      m10, [tap3]
742cabdff1aSopenharmony_ci    mova      m11, [depad]
743cabdff1aSopenharmony_ci%endif
744cabdff1aSopenharmony_ci    mov        r1, rsp
745cabdff1aSopenharmony_ci.h_loop:
; Returns one filtered row in m1 and advances r1 to the next buffer row.
746cabdff1aSopenharmony_ci    call h%2_loop_op
747cabdff1aSopenharmony_ci
748cabdff1aSopenharmony_ci    OP_MOV   [r0], m1
749cabdff1aSopenharmony_ci    add        r0, r2
750cabdff1aSopenharmony_ci    dec       r3d
751cabdff1aSopenharmony_ci    jg .h_loop
752cabdff1aSopenharmony_ci
753cabdff1aSopenharmony_ci    mov     rsp, r6          ; restore stack pointer
754cabdff1aSopenharmony_ci    ret
755cabdff1aSopenharmony_ci%endmacro
756cabdff1aSopenharmony_ci
757cabdff1aSopenharmony_ciMC MC22
758cabdff1aSopenharmony_ci
759cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
760cabdff1aSopenharmony_ci; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
761cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC12 <op>, <size>
;   Like MC22, but each horizontally filtered row is averaged (pavgw) with a
;   second value read from the stack at a byte offset held in r4.
;   The .body label is a shared entry point: MC32 and MC21 (and MC23 via
;   MC21) jump here after building their own stack frame and setting r4:
;     mc12: r4 = 0            mc32: r4 = 2            mc21/mc23: r4 = PAD-mmsize
;   On entry to .body: rsp = aligned intermediate buffer, r6 = original rsp,
;   r0 = dst, r2 = stride.
762cabdff1aSopenharmony_ci%macro MC12 2
763cabdff1aSopenharmony_cicglobal_mc %1, mc12, %2, 3,7,12
764cabdff1aSopenharmony_ci%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
765cabdff1aSopenharmony_ci    mov        r6, rsp          ; backup stack pointer
766cabdff1aSopenharmony_ci    and       rsp, ~(mmsize-1)  ; align stack
767cabdff1aSopenharmony_ci    sub       rsp, PAD
768cabdff1aSopenharmony_ci
769cabdff1aSopenharmony_ci    call put_hv%2_10
770cabdff1aSopenharmony_ci
771cabdff1aSopenharmony_ci    xor       r4d, r4d
772cabdff1aSopenharmony_ci.body:
773cabdff1aSopenharmony_ci    mov       r3d, %2
; m0 = 0 is the lower clip bound for the CLIPW in the loop below.
774cabdff1aSopenharmony_ci    pxor       m0, m0
775cabdff1aSopenharmony_ci    mova       m7, [pw_pixel_max]
776cabdff1aSopenharmony_ci%if num_mmregs > 8
777cabdff1aSopenharmony_ci    mova       m8, [tap1]
778cabdff1aSopenharmony_ci    mova       m9, [tap2]
779cabdff1aSopenharmony_ci    mova      m10, [tap3]
780cabdff1aSopenharmony_ci    mova      m11, [depad]
781cabdff1aSopenharmony_ci%endif
782cabdff1aSopenharmony_ci    mov        r1, rsp
783cabdff1aSopenharmony_ci.h_loop:
784cabdff1aSopenharmony_ci    call h%2_loop_op
785cabdff1aSopenharmony_ci
; h<size>_loop_op already advanced r1 by mmsize*3, so r1-2*mmsize is
; (row start + mmsize): the same address as its aligned "+0" load.  r4 then
; shifts that by the caller-chosen byte offset.
786cabdff1aSopenharmony_ci    movu       m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
; Convert the stored intermediate word back to pixel range using the
; depad2/unpad constants (defined elsewhere; values not visible here),
; clip to [m0, m7], then take the rounded average with the 2D result.
787cabdff1aSopenharmony_ci    paddw      m3, [depad2]
788cabdff1aSopenharmony_ci    psrlw      m3, 5
789cabdff1aSopenharmony_ci    psubw      m3, [unpad]
790cabdff1aSopenharmony_ci    CLIPW      m3, m0, m7
791cabdff1aSopenharmony_ci    pavgw      m1, m3
792cabdff1aSopenharmony_ci
793cabdff1aSopenharmony_ci    OP_MOV   [r0], m1
794cabdff1aSopenharmony_ci    add        r0, r2
795cabdff1aSopenharmony_ci    dec       r3d
796cabdff1aSopenharmony_ci    jg .h_loop
797cabdff1aSopenharmony_ci
798cabdff1aSopenharmony_ci    mov     rsp, r6          ; restore stack pointer
799cabdff1aSopenharmony_ci    ret
800cabdff1aSopenharmony_ci%endmacro
801cabdff1aSopenharmony_ci
802cabdff1aSopenharmony_ciMC MC12
803cabdff1aSopenharmony_ci
804cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
805cabdff1aSopenharmony_ci; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
806cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC32 <op>, <size>
;   Builds the same aligned stack buffer as mc12 (with a smaller PAD), runs
;   put_hv<size>_10, then tail-jumps into mc12's .body with r4 = 2 so the
;   averaged value is read one 16-bit sample further along the row.
;   Stack restore and ret happen in mc12's epilogue using r6.
807cabdff1aSopenharmony_ci%macro MC32 2
808cabdff1aSopenharmony_cicglobal_mc %1, mc32, %2, 3,7,12
809cabdff1aSopenharmony_ci%define PAD mmsize*8*3*2  ; SIZE*16*3*sizeof(pixel)
810cabdff1aSopenharmony_ci    mov  r6, rsp          ; backup stack pointer
811cabdff1aSopenharmony_ci    and rsp, ~(mmsize-1)  ; align stack
812cabdff1aSopenharmony_ci    sub rsp, PAD
813cabdff1aSopenharmony_ci
814cabdff1aSopenharmony_ci    call put_hv%2_10
815cabdff1aSopenharmony_ci
816cabdff1aSopenharmony_ci    mov r4d, 2            ; sizeof(pixel)
817cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
818cabdff1aSopenharmony_ci%endmacro
819cabdff1aSopenharmony_ci
820cabdff1aSopenharmony_ciMC MC32
821cabdff1aSopenharmony_ci
822cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
823cabdff1aSopenharmony_ci; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
824cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; H_NRD <rows>
;   Emits put_h<rows>_10: a horizontal filter over <rows> source rows whose
;   results are stored, still biased by pad20, into the caller's stack
;   buffer at a pitch of mmsize*3 bytes per row.
;
;   In:   r5  = source pointer for the first row
;         r2  = source stride in bytes
;         rsp = (on the caller's side of the call) start of the destination
;               buffer, mmsize-aligned by the caller
;   Clobbers: r3, r4, r5, m2-m6.
;   ADDW and FILT_H2 are macros defined elsewhere; m5 is passed to ADDW as a
;   scratch register.
825cabdff1aSopenharmony_ci%macro H_NRD 1
826cabdff1aSopenharmony_ciput_h%1_10:
; Step over the return address pushed by `call` so that [rsp+r4] addresses
; the caller's buffer directly and the mova store below stays aligned.
; NOTE(review): until the matching `sub rsp, gprsize`, the return address
; sits below rsp -- confirm this is acceptable on targets without a red zone.
827cabdff1aSopenharmony_ci    add       rsp, gprsize
828cabdff1aSopenharmony_ci    mov       r3d, %1
829cabdff1aSopenharmony_ci    xor       r4d, r4d
830cabdff1aSopenharmony_ci    mova       m6, [pad20]
831cabdff1aSopenharmony_ci.nextrow:
; Pair up the samples symmetrically around the filter centre:
;   m2 = [r5-4] + [r5+6],  m3 = [r5-2] + [r5+4],  m4 = [r5+0] + [r5+2]
832cabdff1aSopenharmony_ci    movu       m2, [r5-4]
833cabdff1aSopenharmony_ci    movu       m3, [r5-2]
834cabdff1aSopenharmony_ci    movu       m4, [r5+0]
835cabdff1aSopenharmony_ci    ADDW       m2, [r5+6], m5
836cabdff1aSopenharmony_ci    ADDW       m3, [r5+4], m5
837cabdff1aSopenharmony_ci    ADDW       m4, [r5+2], m5
838cabdff1aSopenharmony_ci
; FILT_H2 combines the three pair sums; its result is taken from m2.
839cabdff1aSopenharmony_ci    FILT_H2    m2, m3, m4
840cabdff1aSopenharmony_ci    psubw      m2, m6
841cabdff1aSopenharmony_ci    mova [rsp+r4], m2
842cabdff1aSopenharmony_ci    add       r4d, mmsize*3   ; next row of the stack buffer
843cabdff1aSopenharmony_ci    add        r5, r2         ; next source row
844cabdff1aSopenharmony_ci    dec       r3d
845cabdff1aSopenharmony_ci    jg .nextrow
; Point rsp back at the return address before returning.
846cabdff1aSopenharmony_ci    sub       rsp, gprsize
847cabdff1aSopenharmony_ci    ret
848cabdff1aSopenharmony_ci%endmacro
849cabdff1aSopenharmony_ci
; Emit put_h4_10 (MMX) and put_h8_10 (SSE2).
850cabdff1aSopenharmony_ciINIT_MMX mmxext
851cabdff1aSopenharmony_ciH_NRD 4
852cabdff1aSopenharmony_ciINIT_XMM sse2
853cabdff1aSopenharmony_ciH_NRD 8
854cabdff1aSopenharmony_ci
; MC21 <op>, <size>
;   Builds two stacked buffers of PAD bytes each and then reuses mc12's loop:
;     1. put_h<size>_10 writes the horizontal-only rows (source = r5) into
;        the first buffer.
;     2. put_hv<size>_10 writes the intermediate rows (source = r1) into the
;        second buffer, PAD bytes lower on the stack.
;   r4 = PAD-mmsize makes mc12's `[r1+r4-2*mmsize]` load land on the matching
;   row of the first (H) buffer.
;   The .body label is the entry point MC23 uses after choosing a different r5.
;   The stack is restored from r6 in mc12's epilogue.
855cabdff1aSopenharmony_ci%macro MC21 2
856cabdff1aSopenharmony_cicglobal_mc %1, mc21, %2, 3,7,12
857cabdff1aSopenharmony_ci    mov   r5, r1
858cabdff1aSopenharmony_ci.body:
859cabdff1aSopenharmony_ci%define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
860cabdff1aSopenharmony_ci    mov   r6, rsp          ; backup stack pointer
861cabdff1aSopenharmony_ci    and  rsp, ~(mmsize-1)  ; align stack
862cabdff1aSopenharmony_ci
863cabdff1aSopenharmony_ci    sub  rsp, PAD
864cabdff1aSopenharmony_ci    call put_h%2_10
865cabdff1aSopenharmony_ci
866cabdff1aSopenharmony_ci    sub  rsp, PAD
867cabdff1aSopenharmony_ci    call put_hv%2_10
868cabdff1aSopenharmony_ci
869cabdff1aSopenharmony_ci    mov r4d, PAD-mmsize    ; H buffer
870cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
871cabdff1aSopenharmony_ci%endmacro
872cabdff1aSopenharmony_ci
873cabdff1aSopenharmony_ciMC MC21
874cabdff1aSopenharmony_ci
875cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
876cabdff1aSopenharmony_ci; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
877cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MC23 <op>, <size>
;   Same as mc21 except that the horizontal-only pass starts one source row
;   lower: r5 = src + stride.  Everything else is shared by jumping into
;   mc21's .body.
878cabdff1aSopenharmony_ci%macro MC23 2
879cabdff1aSopenharmony_cicglobal_mc %1, mc23, %2, 3,7,12
880cabdff1aSopenharmony_ci    lea   r5, [r1+r2]
881cabdff1aSopenharmony_ci    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
882cabdff1aSopenharmony_ci%endmacro
883cabdff1aSopenharmony_ci
884cabdff1aSopenharmony_ciMC MC23
885