;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
SECTION_RODATA

; RV40 rounding constants for the 2-D (bilinear) filter, added before the
; final ">> 6".  16 entries, each four identical words (8 bytes).
; The RV40 code paths address an entry as [table + rnd_bias*8] with
;   rnd_bias = ((my & 6) * 4 + mx) >> 1
; which equals (my >> 1) * 4 + (mx >> 1) for mx/my in [0,7].
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
; RV40 rounding constants for the 1-D filter, added before the final ">> 3".
; Same layout and same rnd_bias indexing as rnd_rv40_2d_tbl; every value
; here is the matching 2-D value shifted right by 3.
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

; Packed-word constants.  Only pw_28 (eight words of 28, 16 bytes) is defined
; in this file; the others are external symbols.
; NOTE(review): presumably each pw_N holds words of value N, by analogy with
; pw_28 - confirm against the file that defines them.
cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64
66cabdff1aSopenharmony_ci
67cabdff1aSopenharmony_ciSECTION .text
68cabdff1aSopenharmony_ci
; mv0_pixels_mc8: unfiltered 8-pixel-wide row transfer for mx == 0 && my == 0.
;   r0  = dst, r1 = src, r2 = stride, r3d = number of rows
;   r4  = clobbered (holds 2*stride)
;   mm0/mm1 are used explicitly, whatever register set INIT_* selected.
; Each loop iteration handles four rows as two identical row pairs.  Every
; loaded row is passed through CHROMAMC_AVG together with the matching dst
; row and is then stored to dst.
; The loop exits only when r3d reaches exactly zero, so the row count has to
; be a positive multiple of 4.
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2]
.next4rows:
    ; two row pairs per iteration
%rep 2
    movq         mm0, [r1]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq        [r0], mm0
    movq     [r0+r2], mm1
    add           r0, r4
%endrep
    sub          r3d, 4
    jne .next4rows
%endmacro
91cabdff1aSopenharmony_ci
; chroma_mc8_mmx_func op, codec [, suffix]
;   %1 = put or avg      (first part of the function name)
;   %2 = h264, vc1, rv40 (selects rnd_1d_%2 / rnd_2d_%2 rounding constants)
;   %3 = optional suffix appended to the function name
; Emits an 8-pixel-wide chroma MC function working on MMX registers.
; Three code paths:
;   mx == 0 && my == 0 : mv0_pixels_mc8, no filtering
;   exactly one of mx/my is 0 : 1-D two-tap filter, >> 3
;   both non-zero      : 2-D bilinear filter, >> 6
; For rv40 the rounding constant is looked up in a table indexed by rnd_bias;
; under PIC the table base is first loaded into r8.
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
; Register use: r0 = dst, r1 = src, r2 = stride, r3d = h, r4d = mx, r5d = my,
; r6 = scratch (second-tap offset in the 1-D path, saved rsp in the 2-D path).
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1, the index into the RV40 rounding
    ; tables.  The final shift is done on the 32-bit register.
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    ; r0 is reused to hold rnd_bias here; the dst pointer is reloaded from
    ; r0m into r5 further down, once my is no longer needed in r5.
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    ; Choose the byte offset of the second filter tap:
    ;   my == 0            -> r6 = 1      (next pixel, horizontal filter)
    ;   my != 0 && mx == 0 -> r6 = stride (next row, vertical filter)
    ; mov does not modify flags, so each jump still sees the preceding test.
    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y (one of them is zero, so OR == ADD)

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m       ; dest_reg (r5) = dst, reloaded from argument 0
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = second tap: src[1..8] if r6 == 1, else the next row

    ; widen both taps from bytes to words (mm7 == 0)
    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m         ; dest_reg (r5) = dst; y is already in mm6
%endif
%endif
    ; Two coefficient vectors do not fit in registers and are spilled to an
    ; aligned stack scratch area: [rsp] = A, [rsp+8] = D.
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4

    ; Prime the loop with the first row.  Inside the loop mm0/mm1 always hold
    ; the previous row (src[0..7] / src[1..8]) on entry.
    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]      ; mm1 is left untouched below and carries into the next iteration
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]        ; reload the current row for the next iteration

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro
282cabdff1aSopenharmony_ci
; chroma_mc4_mmx_func op, codec
;   %1 = put or avg, %2 = codec (selects rnd_2d_%2)
; Emits a 4-pixel-wide chroma MC function working on MMX registers.
; The filter is applied separably and always in its 2-D form:
;   hrow(r) = (8-x) * src[r][i] + x * src[r][i+1]
;   dst[r]  = ((8-y) * hrow(r) + y * hrow(r+1) + rnd) >> 6
; Register use: r0 = dst, r1 = src, r2 = stride, r3d = h, r4d = mx, r5d = my.
;   mm2 = x, mm3 = y, mm4 = 8-x, mm5 = 8-y, mm7 = 0
;   mm6 / mm0 alternately hold the horizontally filtered previous row.
; Two rows are produced per iteration and the loop exits only when r3d
; reaches exactly zero, so h has to be a positive even number.
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    ; broadcast the low word of x / y to all four words
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2          ; mm4 = 8 - x
    psubw         m5, m3          ; mm5 = 8 - y

%ifidn %2, rv40
%ifdef PIC
   lea            r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    ; y is already in mm3, so r5 can be turned into the rounding-table index:
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    ; Prime the pipeline: mm6 = horizontally filtered first row
    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    ; first output row: previous hrow in mm6, new hrow computed into mm1
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1          ; keep the new hrow in mm0 for the second output row

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]    ; mm6 is free here and serves as scratch
    movd        [r0], m1
    add           r0, r2

    ; second output row: previous hrow in mm0, new hrow computed into mm1
    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1          ; keep the new hrow in mm6 for the next iteration
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]    ; mm0 is free here and serves as scratch
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro
370cabdff1aSopenharmony_ci
; chroma_mc2_mmx_func op, codec
;   %1 = put or avg, %2 = codec (selects rnd_2d_%2)
; Emits a 2-pixel-wide bilinear chroma MC function.  One row per iteration.
; Register use: r0 = dst, r1 = src, r2 = stride, r3d = h, r4d = mx, r5d = my,
; r6d = scratch.
; The four bilinear weights are built as two packed 16-bit pairs in GPRs:
;   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
; and applied with pmaddwd to source words arranged as [s0,s1,s1,s2].
; Uses pshufw, so it is only instantiated under INIT_MMX mmxext below.
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8           ; r4d = x<<16 | (8-x)
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0          ; keep this row as the previous row of the next iteration
    pmaddwd       m0, m6
    ; pmaddwd produces two dwords, while the rounding add and the shift below
    ; work per 16-bit word.
    ; NOTE(review): this presumably relies on each sum fitting in the low word
    ; of its dword (weights sum to 64, pixels <= 255) - confirm.
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]    ; mm3 is scratch for the averaging variant
    movd         r5d, m1
    mov         [r0], r5w         ; only the two result bytes are written
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro
412cabdff1aSopenharmony_ci
; Per-codec rounding constants, selected through rnd_1d_%2 / rnd_2d_%2 in the
; function macros: the 1-D constant is added before ">> 3", the 2-D constant
; before ">> 6".  The rv40 aliases are defined inside the function macros.
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

; Implementations that CHROMAMC_AVG / CHROMAMC_AVG4 are aliased to.
; NOTHING: expands to nothing (used for "put"); accepts 2 or 3 arguments so it
; can stand in for either alias.
%macro NOTHING 2-3
%endmacro
; DIRECT_AVG dst_reg, src: dst_reg = PAVGB(dst_reg, src)
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
; COPY_AVG dst_reg, tmp_reg, mem: loads 4 bytes from mem into tmp_reg with
; movd, then dst_reg = PAVGB(dst_reg, tmp_reg).  Clobbers tmp_reg.
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro
427cabdff1aSopenharmony_ci
; ---- MMX "put" variants: no averaging with the destination ----
INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

; ---- MMXEXT: mc2 (uses pshufw) and every "avg" variant ----
; CHROMAMC_AVG / CHROMAMC_AVG4 still expand to NOTHING for the put mc2 below.
INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

; From here on the generated functions average the result with dst.
%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264
448cabdff1aSopenharmony_ci
; chroma_mc8_ssse3_func op, codec [, suffix]
;   %1 = put or avg, %2 = codec (selects rnd_1d_%2 / rnd_2d_%2),
;   %3 = optional suffix appended to the function name
; Emits an 8-pixel-wide chroma MC function built around pmaddubsw.
; Register use: r0 = dst, r1 = src, r2 = stride, r3d = h, r4d = mx, r5d = my,
; r6 = scratch.
; Source bytes of the two taps are interleaved with punpcklbw and multiplied
; by a coefficient vector holding the matching byte pair in every word, so
; one pmaddubsw yields coefA*tap0 + coefB*tap1 per pixel.
; Paths: mx == my == 0 (plain copy/avg), my == 0 (horizontal 1-D),
; mx == 0 (vertical 1-D), otherwise 2-D bilinear.
; The filtered paths handle two rows per iteration and loop while the
; remaining row count is > 0.
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    ; broadcast the low word (coefficient byte pair) to all eight words
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1          ; m0 = interleaved {src[i], src[i+1]} of the first row
    movlhps       m7, m7          ; m7 = weights applied to the upper row
    movlhps       m6, m6          ; m6 = weights applied to the lower row

.next2rows:
    ; m0 holds the interleaved previous row on entry
    movq          m1, [r1+r2*1   ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1          ; the middle row is the lower row of output 0 and the upper row of output 1
    punpcklbw     m3, m4
    movdqa        m4, m3          ; saved copy becomes the previous row of the next iteration
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    ; gather both dst rows into one register for the average below
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3          ; low 8 bytes = first row, high 8 bytes = second row
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]   ; lea leaves the flags of the sub intact for jg
    jg .next2rows
    REP_RET

.my_is_zero:
    ; horizontal-only filter: taps are src[i] and src[i+1]
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    ; vertical-only filter: taps are src[i] and the pixel one row below
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1          ; the middle row serves both output rows
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro
593cabdff1aSopenharmony_ci
; chroma_mc4_ssse3_func op, codec
;   %1 = put or avg, %2 = codec (selects the rnd_2d_%2 rounding constant)
; Emits a 4-pixel-wide bilinear chroma MC function using pmaddubsw on MMX
; registers (instantiated under INIT_MMX ssse3).
; Register use: r0 = dst, r1 = src, r2 = stride, r3d = h, r4d = mx, r5d = my,
; r6 = scratch.
; The filter is always applied in its 2-D form; two rows are produced per
; iteration and the loop runs while the remaining row count is > 0.
; The rounding constant is taken from rnd_2d_%2, like the other 2-D paths in
; this file, instead of being fixed to the h264 value.
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [rnd_2d_%2] ; per-codec 2-D rounding constant
    movd          m0, [r1  ]
    pshufw        m7, m7, 0       ; m7 = weights applied to the upper row, in every word
    punpcklbw     m0, [r1+1]      ; m0 = interleaved {src[i], src[i+1]} of the first row
    pshufw        m6, m6, 0       ; m6 = weights applied to the lower row, in every word

.next2rows:
    ; m0 holds the interleaved previous row on entry
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1          ; the middle row is the lower row of output 0 and the upper row of output 1
    movq          m4, m3          ; saved copy becomes the previous row of the next iteration
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]   ; lea leaves the flags of the sub intact for jg
    jg .next2rows
    REP_RET
%endmacro
643cabdff1aSopenharmony_ci
; ---- SSSE3 "put" variants ----
; mc8 is assembled with XMM registers, mc4 with MMX registers; both rely on
; pmaddubsw.  Only CHROMAMC_AVG is redefined here, as the SSSE3 macros do not
; use CHROMAMC_AVG4.
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

; ---- SSSE3 "avg" variants: result is averaged with dst via PAVGB ----
%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264
657