1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (C) 2005-2011 x264 project
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
9cabdff1aSopenharmony_ci;*
10cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
11cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
12cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
13cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
14cabdff1aSopenharmony_ci;*
15cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
16cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
19cabdff1aSopenharmony_ci;*
20cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
21cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
22cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23cabdff1aSopenharmony_ci;******************************************************************************
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ciSECTION_RODATA
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_cicextern pw_4
30cabdff1aSopenharmony_cicextern pw_8
31cabdff1aSopenharmony_cicextern pw_32
32cabdff1aSopenharmony_cicextern pw_64
33cabdff1aSopenharmony_ci
34cabdff1aSopenharmony_ciSECTION .text
35cabdff1aSopenharmony_ci
36cabdff1aSopenharmony_ci
; MV0_PIXELS_MC8 -- unfiltered path of CHROMA_MC8 (mx == 0 and my == 0).
; Handles four rows per iteration, one full vector register per row: each
; row is loaded from src, passed through CHROMAMC_AVG against the matching
; dst row (expands to nothing when CHROMAMC_AVG is NOTHING, to pavgw when
; it is AVG) and stored to dst.
;   In:    r0 = dst, r1 = src, r2 = stride in bytes, r3d = row count
;          (argument order taken from the prototype comment on CHROMA_MC8;
;          assumes x86inc maps rN to the N-th argument)
;   Clobb: r0, r1, r3d, r4, r5, m0, m1, flags
;   The loop subtracts 4 from r3d and only exits on zero, so the row count
;   is assumed to be a positive multiple of 4 -- TODO confirm with callers.
37cabdff1aSopenharmony_ci%macro MV0_PIXELS_MC8 0
38cabdff1aSopenharmony_ci    lea           r4, [r2*3   ]     ; r4 = 3 * stride (offset of row 3)
39cabdff1aSopenharmony_ci    lea           r5, [r2*4   ]     ; r5 = 4 * stride (advance per iteration)
40cabdff1aSopenharmony_ci.next4rows:
41cabdff1aSopenharmony_ci    movu          m0, [r1     ]     ; rows 0 and 1 of src
42cabdff1aSopenharmony_ci    movu          m1, [r1+r2  ]
43cabdff1aSopenharmony_ci    CHROMAMC_AVG  m0, [r0     ]     ; avg variant: blend with existing dst rows
44cabdff1aSopenharmony_ci    CHROMAMC_AVG  m1, [r0+r2  ]
45cabdff1aSopenharmony_ci    mova   [r0     ], m0            ; mova: dst rows are presumably vector-aligned -- confirm
46cabdff1aSopenharmony_ci    mova   [r0+r2  ], m1
47cabdff1aSopenharmony_ci    movu          m0, [r1+r2*2]     ; rows 2 and 3 of src
48cabdff1aSopenharmony_ci    movu          m1, [r1+r4  ]
49cabdff1aSopenharmony_ci    CHROMAMC_AVG  m0, [r0+r2*2]
50cabdff1aSopenharmony_ci    CHROMAMC_AVG  m1, [r0+r4  ]
51cabdff1aSopenharmony_ci    mova   [r0+r2*2], m0
52cabdff1aSopenharmony_ci    mova   [r0+r4  ], m1
53cabdff1aSopenharmony_ci    add           r1, r5            ; src += 4 rows
54cabdff1aSopenharmony_ci    add           r0, r5            ; dst += 4 rows
55cabdff1aSopenharmony_ci    sub          r3d, 4             ; 4 rows done
56cabdff1aSopenharmony_ci    jne .next4rows
57cabdff1aSopenharmony_ci%endmacro
58cabdff1aSopenharmony_ci
59cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
60cabdff1aSopenharmony_ci; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, ptrdiff_t stride,
61cabdff1aSopenharmony_ci;                                 int h, int mx, int my)
62cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; CHROMA_MC8 <put|avg> -- emits %1_h264_chroma_mc8_10: chroma motion
; compensation for a block 8 pixels wide (one vector register of 16-bit
; samples per row), h rows tall.
;   r0 = dst, r1 = src, r2 = stride in bytes, r3d = h, r4d = mx, r5d = my
;   (assumes x86inc maps rN to the N-th argument of the prototype above)
; Three paths, selected on mx/my:
;   mx == 0 && my == 0 : plain copy/average via MV0_PIXELS_MC8
;   exactly one is zero: 2-tap filter  (A*p0 + B*p1 + 4) >> 3, A = 8-B
;   both non-zero      : 4-tap bilinear (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6
; CHROMAMC_AVG (NOTHING or AVG at instantiation time) decides whether the
; result overwrites dst or is averaged with it.
; All arithmetic is done on 16-bit lanes; the weights sum to 8 (resp. 64),
; so the sums fit as long as samples are at most 10 bits wide
; (1023 * 64 + 32 < 65536).
; The filter loops use dec/jne on r3d, so h is assumed to be >= 1.
63cabdff1aSopenharmony_ci%macro CHROMA_MC8 1
64cabdff1aSopenharmony_cicglobal %1_h264_chroma_mc8_10, 6,7,8   ; x86inc cglobal operands: 6 args, 7 GPRs, 8 vector regs
65cabdff1aSopenharmony_ci    mov          r6d, r5d
66cabdff1aSopenharmony_ci    or           r6d, r4d       ; r6d = mx | my
67cabdff1aSopenharmony_ci    jne .at_least_one_non_zero
68cabdff1aSopenharmony_ci    ; mx == 0 AND my == 0 - no filter needed
69cabdff1aSopenharmony_ci    MV0_PIXELS_MC8              ; clobbers r4/r5, which are both zero (unused) on this path
70cabdff1aSopenharmony_ci    REP_RET
71cabdff1aSopenharmony_ci
72cabdff1aSopenharmony_ci.at_least_one_non_zero:
73cabdff1aSopenharmony_ci    mov          r6d, 2         ; default second-tap offset: 2 bytes = next 16-bit pixel in the row
74cabdff1aSopenharmony_ci    test         r5d, r5d       ; my == 0 ?
75cabdff1aSopenharmony_ci    je .x_interpolation         ; yes: horizontal-only filter, keep r6 = 2
76cabdff1aSopenharmony_ci    mov           r6, r2        ; my != 0: second tap is the row below (dxy = my ? stride : 2)
77cabdff1aSopenharmony_ci    test         r4d, r4d       ; mx != 0 as well ?
78cabdff1aSopenharmony_ci    jne .xy_interpolation       ; yes: full bilinear filter
79cabdff1aSopenharmony_ci.x_interpolation:
80cabdff1aSopenharmony_ci    ; mx == 0 XOR my == 0 - 1 dimensional filter only
81cabdff1aSopenharmony_ci    or           r4d, r5d       ; x | y == x + y, because exactly one of them is zero here
82cabdff1aSopenharmony_ci    movd          m5, r4d
83cabdff1aSopenharmony_ci    mova          m4, [pw_8]
84cabdff1aSopenharmony_ci    mova          m6, [pw_4]    ; mm6 = 4 = rounding term (rnd >> 3)
85cabdff1aSopenharmony_ci    SPLATW        m5, m5        ; mm5 = B = the non-zero one of x / y, in every word
86cabdff1aSopenharmony_ci    psubw         m4, m5        ; mm4 = A = 8 - B
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci.next1drow:
89cabdff1aSopenharmony_ci    movu          m0, [r1   ]   ; mm0 = src[0..7]
90cabdff1aSopenharmony_ci    movu          m2, [r1+r6]   ; mm2 = second tap: src[1..8] (horizontal) or the row below (vertical)
91cabdff1aSopenharmony_ci
92cabdff1aSopenharmony_ci    pmullw        m0, m4        ; mm0 = A * first tap
93cabdff1aSopenharmony_ci    pmullw        m2, m5        ; mm2 = B * second tap
94cabdff1aSopenharmony_ci
95cabdff1aSopenharmony_ci    paddw         m0, m6        ; + 4 (rounding)
96cabdff1aSopenharmony_ci    paddw         m0, m2
97cabdff1aSopenharmony_ci    psrlw         m0, 3
98cabdff1aSopenharmony_ci    CHROMAMC_AVG  m0, [r0]      ; avg variant: blend with the existing dst row
99cabdff1aSopenharmony_ci    mova        [r0], m0        ; dst[0..7] = (A * tap0 + B * tap1 + (rnd >> 3)) >> 3
100cabdff1aSopenharmony_ci
101cabdff1aSopenharmony_ci    add           r0, r2        ; next dst row
102cabdff1aSopenharmony_ci    add           r1, r2        ; next src row
103cabdff1aSopenharmony_ci    dec           r3d
104cabdff1aSopenharmony_ci    jne .next1drow
105cabdff1aSopenharmony_ci    REP_RET
106cabdff1aSopenharmony_ci
107cabdff1aSopenharmony_ci.xy_interpolation: ; general case, bilinear
108cabdff1aSopenharmony_ci    movd          m4, r4m         ; x
109cabdff1aSopenharmony_ci    movd          m6, r5m         ; y
110cabdff1aSopenharmony_ci
111cabdff1aSopenharmony_ci    SPLATW        m4, m4          ; mm4 = x words
112cabdff1aSopenharmony_ci    SPLATW        m6, m6          ; mm6 = y words
113cabdff1aSopenharmony_ci    psllw         m5, m4, 3       ; mm5 = 8x
114cabdff1aSopenharmony_ci    pmullw        m4, m6          ; mm4 = x * y
115cabdff1aSopenharmony_ci    psllw         m6, 3           ; mm6 = 8y
116cabdff1aSopenharmony_ci    paddw         m1, m5, m6      ; mm1 = 8x+8y (temporary, consumed when forming A below)
117cabdff1aSopenharmony_ci    mova          m7, m4          ; mm7 = D = x * y
118cabdff1aSopenharmony_ci    psubw         m5, m4          ; mm5 = B = 8x - xy = x * (8-y)
119cabdff1aSopenharmony_ci    psubw         m6, m4          ; mm6 = C = 8y - xy = (8-x) * y
120cabdff1aSopenharmony_ci    paddw         m4, [pw_64]
121cabdff1aSopenharmony_ci    psubw         m4, m1          ; mm4 = A = xy - (8x+8y) + 64 = (8-x) * (8-y)
122cabdff1aSopenharmony_ci
123cabdff1aSopenharmony_ci    movu          m0, [r1  ]      ; mm0 = src[0..7]
124cabdff1aSopenharmony_ci    movu          m1, [r1+2]      ; mm1 = src[1..8]
125cabdff1aSopenharmony_ci.next2drow:
126cabdff1aSopenharmony_ci    add           r1, r2          ; src -> next row; m0/m1 still hold the current (upper) row
127cabdff1aSopenharmony_ci
128cabdff1aSopenharmony_ci    pmullw        m2, m0, m4
129cabdff1aSopenharmony_ci    pmullw        m1, m5
130cabdff1aSopenharmony_ci    paddw         m2, m1          ; mm2 = A * src[0..7] + B * src[1..8]
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci    movu          m0, [r1]        ; lower row; kept in m0/m1 as the upper row of the next iteration
133cabdff1aSopenharmony_ci    movu          m1, [r1+2]
134cabdff1aSopenharmony_ci    pmullw        m3, m0, m6
135cabdff1aSopenharmony_ci    paddw         m2, m3          ; mm2 += C * src[0..7+stride]
136cabdff1aSopenharmony_ci    pmullw        m3, m1, m7
137cabdff1aSopenharmony_ci    paddw         m2, m3          ; mm2 += D * src[1..8+stride]
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci    paddw         m2, [pw_32]     ; rounding
140cabdff1aSopenharmony_ci    psrlw         m2, 6
141cabdff1aSopenharmony_ci    CHROMAMC_AVG  m2, [r0]        ; avg variant: blend with the existing dst row
142cabdff1aSopenharmony_ci    mova        [r0], m2          ; dst[0..7] = (mm2 + 32) >> 6
143cabdff1aSopenharmony_ci
144cabdff1aSopenharmony_ci    add           r0, r2          ; next dst row (src was already advanced at the loop top)
145cabdff1aSopenharmony_ci    dec          r3d
146cabdff1aSopenharmony_ci    jne .next2drow
147cabdff1aSopenharmony_ci    REP_RET
148cabdff1aSopenharmony_ci%endmacro
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
151cabdff1aSopenharmony_ci; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, ptrdiff_t stride,
152cabdff1aSopenharmony_ci;                                 int h, int mx, int my)
153cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
154cabdff1aSopenharmony_ci;TODO: xmm mc4
; MC4_OP cur, prev -- produce one output row of the 4-pixel-wide filter.
;   %1 (cur):  out    -- receives the horizontally filtered current row,
;                        H = (8-x)*src[0..3] + x*src[1..4]
;   %2 (prev): in     -- H of the previous row; destroyed (scaled by 8-y,
;                        and in the avg variant reloaded with the dst row)
;   Expects m2 = x, m3 = y, m4 = 8-x, m5 = 8-y in every word (set up by
;   CHROMA_MC4), r0 = dst, r1 = src, r2 = stride in bytes.
;   Result stored: ((8-y)*H_prev + y*H_cur + 32) >> 6, optionally averaged
;   with dst through CHROMAMC_AVG.
;   Clobbers m1; advances r0 and r1 by one row each.
; Calling it alternately as (a, b) then (b, a) lets each row's horizontal
; result be reused as the "prev" input of the following row.
155cabdff1aSopenharmony_ci%macro MC4_OP 2
156cabdff1aSopenharmony_ci    movq          %1, [r1  ]      ; %1 = src[0..3] of the current row
157cabdff1aSopenharmony_ci    movq          m1, [r1+2]      ; m1 = src[1..4] (2 bytes = one 16-bit pixel further)
158cabdff1aSopenharmony_ci    add           r1, r2          ; src -> next row
159cabdff1aSopenharmony_ci    pmullw        %1, m4          ; (8-x) * src[0..3]
160cabdff1aSopenharmony_ci    pmullw        m1, m2          ; x * src[1..4]
161cabdff1aSopenharmony_ci    paddw         m1, %1          ; m1 = H_cur
162cabdff1aSopenharmony_ci    mova          %1, m1          ; keep H_cur for the next invocation
163cabdff1aSopenharmony_ci
164cabdff1aSopenharmony_ci    pmullw        %2, m5          ; (8-y) * H_prev
165cabdff1aSopenharmony_ci    pmullw        m1, m3          ; y * H_cur
166cabdff1aSopenharmony_ci    paddw         %2, [pw_32]     ; rounding
167cabdff1aSopenharmony_ci    paddw         m1, %2
168cabdff1aSopenharmony_ci    psrlw         m1, 6
169cabdff1aSopenharmony_ci    CHROMAMC_AVG  m1, %2, [r0]    ; avg variant: %2 = dst row, m1 = pavgw(m1, %2); %2 is dead by now
170cabdff1aSopenharmony_ci    movq        [r0], m1          ; store 4 pixels
171cabdff1aSopenharmony_ci    add           r0, r2          ; dst -> next row
172cabdff1aSopenharmony_ci%endmacro
173cabdff1aSopenharmony_ci
; CHROMA_MC4 <put|avg> -- emits %1_h264_chroma_mc4_10: bilinear chroma
; filter for a block 4 pixels wide (one 64-bit register of 16-bit samples
; per row).
;   r0 = dst, r1 = src, r2 = stride in bytes, r3d = h; x and y are read
;   from r4m / r5m (assumes x86inc maps rN to the N-th argument of the
;   prototype above).
; Unlike the mc8 version there is no special case for mx == 0 or my == 0:
; the separable filter is always applied (a zero offset simply yields the
; weights 8 and 0).
; The loop emits two rows per iteration and exits only when r3d reaches
; zero, so h is assumed to be a positive even number -- TODO confirm with
; callers.
174cabdff1aSopenharmony_ci%macro CHROMA_MC4 1
175cabdff1aSopenharmony_cicglobal %1_h264_chroma_mc4_10, 6,6,7
176cabdff1aSopenharmony_ci    movd          m2, r4m         ; x
177cabdff1aSopenharmony_ci    movd          m3, r5m         ; y
178cabdff1aSopenharmony_ci    mova          m4, [pw_8]
179cabdff1aSopenharmony_ci    mova          m5, m4
180cabdff1aSopenharmony_ci    SPLATW        m2, m2          ; m2 = x in every word
181cabdff1aSopenharmony_ci    SPLATW        m3, m3          ; m3 = y in every word
182cabdff1aSopenharmony_ci    psubw         m4, m2          ; m4 = 8 - x
183cabdff1aSopenharmony_ci    psubw         m5, m3          ; m5 = 8 - y
184cabdff1aSopenharmony_ci
185cabdff1aSopenharmony_ci    ; prime the pipeline: m6 = horizontal filter result of row 0
186cabdff1aSopenharmony_ci    movq          m0, [r1  ]      ; src[0..3]
187cabdff1aSopenharmony_ci    movq          m6, [r1+2]      ; src[1..4]
188cabdff1aSopenharmony_ci    add           r1, r2          ; src -> row 1
189cabdff1aSopenharmony_ci    pmullw        m0, m4
190cabdff1aSopenharmony_ci    pmullw        m6, m2
191cabdff1aSopenharmony_ci    paddw         m6, m0          ; m6 = (8-x)*src[0..3] + x*src[1..4]
192cabdff1aSopenharmony_ci
193cabdff1aSopenharmony_ci.next2rows:
194cabdff1aSopenharmony_ci    MC4_OP m0, m6                 ; row n:   prev = m6, new horizontal result -> m0
195cabdff1aSopenharmony_ci    MC4_OP m6, m0                 ; row n+1: prev = m0, new horizontal result -> m6
196cabdff1aSopenharmony_ci    sub   r3d, 2
197cabdff1aSopenharmony_ci    jnz .next2rows
198cabdff1aSopenharmony_ci    REP_RET
199cabdff1aSopenharmony_ci%endmacro
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
201cabdff1aSopenharmony_ci; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, ptrdiff_t stride,
202cabdff1aSopenharmony_ci;                                 int h, int mx, int my)
203cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; CHROMA_MC2 <put|avg> -- emits %1_h264_chroma_mc2_10: bilinear chroma
; filter for a block 2 pixels wide.
;   r0 = dst, r1 = src, r2 = stride in bytes, r3d = h, r4d = mx, r5d = my
;   (assumes x86inc maps rN to the N-th argument of the prototype above)
; The four weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy are built with
; scalar arithmetic as two packed 16:16 pairs, then applied with pmaddwd to
; source words arranged as {s0,s1,s1,s2}, giving both output pixels at once.
; The packed-pair arithmetic relies on the low halves never carrying or
; borrowing, which holds for 0 <= mx, my <= 7 (assumed -- confirm callers).
; The loop uses dec/jnz on r3d, so h is assumed to be >= 1.
204cabdff1aSopenharmony_ci%macro CHROMA_MC2 1
205cabdff1aSopenharmony_cicglobal %1_h264_chroma_mc2_10, 6,7
206cabdff1aSopenharmony_ci    mov          r6d, r4d
207cabdff1aSopenharmony_ci    shl          r4d, 16
208cabdff1aSopenharmony_ci    sub          r4d, r6d
209cabdff1aSopenharmony_ci    add          r4d, 8           ; r4d = x<<16 | (8-x)
210cabdff1aSopenharmony_ci    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
211cabdff1aSopenharmony_ci    shl          r4d, 3           ; 8*x<<16 | 8*(8-x)
212cabdff1aSopenharmony_ci    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
213cabdff1aSopenharmony_ci
214cabdff1aSopenharmony_ci    movd          m5, r4d
215cabdff1aSopenharmony_ci    movd          m6, r5d
216cabdff1aSopenharmony_ci    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
217cabdff1aSopenharmony_ci    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
218cabdff1aSopenharmony_ci    pxor          m7, m7          ; mm7 = 0, upper half for packssdw below
219cabdff1aSopenharmony_ci    pshufw        m2, [r1], 0x94    ; mm2 = src[0,1,1,2] of the first row
220cabdff1aSopenharmony_ci
221cabdff1aSopenharmony_ci.nextrow:
222cabdff1aSopenharmony_ci    add           r1, r2          ; src -> next row; mm2 still holds the current (upper) row
223cabdff1aSopenharmony_ci    movq          m1, m2
224cabdff1aSopenharmony_ci    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
225cabdff1aSopenharmony_ci    pshufw        m0, [r1], 0x94    ; mm0 = src[0,1,1,2]
226cabdff1aSopenharmony_ci    movq          m2, m0          ; keep the lower row as the upper row of the next iteration
227cabdff1aSopenharmony_ci    pmaddwd       m0, m6
228cabdff1aSopenharmony_ci    paddw         m1, [pw_32]     ; rounding; word-wise ops suffice while each sum fits in 16 bits (10-bit samples)
229cabdff1aSopenharmony_ci    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
230cabdff1aSopenharmony_ci    psrlw         m1, 6           ; also clears the 32 that paddw put into each high word
231cabdff1aSopenharmony_ci    packssdw      m1, m7          ; two dword results -> two words in the low 32 bits
232cabdff1aSopenharmony_ci    CHROMAMC_AVG  m1, m3, [r0]    ; avg variant: m3 = dst, m1 = pavgw(m1, m3)
233cabdff1aSopenharmony_ci                                  ; NOTE(review): AVG loads dst with movq (8 bytes) although only 4 bytes are stored below
234cabdff1aSopenharmony_ci    movd        [r0], m1          ; store 2 pixels
235cabdff1aSopenharmony_ci    add           r0, r2          ; dst -> next row
236cabdff1aSopenharmony_ci    dec          r3d
237cabdff1aSopenharmony_ci    jnz .nextrow
238cabdff1aSopenharmony_ci    REP_RET
239cabdff1aSopenharmony_ci%endmacro
239cabdff1aSopenharmony_ci
; NOTHING -- CHROMAMC_AVG implementation for the "put" variants.
; Accepts the same 2 or 3 operands as AVG and expands to no instructions,
; so the filtered result is stored to dst unchanged.
240cabdff1aSopenharmony_ci%macro NOTHING 2-3
241cabdff1aSopenharmony_ci%endmacro
; AVG -- CHROMAMC_AVG implementation for the "avg" variants.
;   AVG acc, src        : acc = pavgw(acc, src); src may be a memory operand
;   AVG acc, tmp, mem   : tmp = 8 bytes loaded from mem (movq), then
;                         acc = pavgw(acc, tmp); tmp is clobbered
; pavgw is the rounded unsigned word average, (a + b + 1) >> 1 per lane.
242cabdff1aSopenharmony_ci%macro AVG 2-3
243cabdff1aSopenharmony_ci%if %0==3
244cabdff1aSopenharmony_ci    movq          %2, %3          ; 3-operand form: fetch the dst pixels into the scratch register first
245cabdff1aSopenharmony_ci%endif
246cabdff1aSopenharmony_ci    pavgw         %1, %2
247cabdff1aSopenharmony_ci%endmacro
248cabdff1aSopenharmony_ci
; Instantiate the functions. CHROMAMC_AVG is redefined before each group, so
; the same macro bodies yield a "put" set (result overwrites dst) and an
; "avg" set (result is averaged with dst). mc8 is built with XMM registers
; for sse2 and, when HAVE_AVX_EXTERNAL (defined outside this file) is set,
; for avx; mc4 and mc2 are built with MMX registers for mmxext.

; --- put variants ---
249cabdff1aSopenharmony_ci%define CHROMAMC_AVG  NOTHING
250cabdff1aSopenharmony_ciINIT_XMM sse2
251cabdff1aSopenharmony_ciCHROMA_MC8 put
252cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
253cabdff1aSopenharmony_ciINIT_XMM avx
254cabdff1aSopenharmony_ciCHROMA_MC8 put
255cabdff1aSopenharmony_ci%endif
256cabdff1aSopenharmony_ciINIT_MMX mmxext
257cabdff1aSopenharmony_ciCHROMA_MC4 put
258cabdff1aSopenharmony_ciCHROMA_MC2 put
259cabdff1aSopenharmony_ci
; --- avg variants ---
260cabdff1aSopenharmony_ci%define CHROMAMC_AVG  AVG
261cabdff1aSopenharmony_ciINIT_XMM sse2
262cabdff1aSopenharmony_ciCHROMA_MC8 avg
263cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
264cabdff1aSopenharmony_ciINIT_XMM avx
265cabdff1aSopenharmony_ciCHROMA_MC8 avg
266cabdff1aSopenharmony_ci%endif
267cabdff1aSopenharmony_ciINIT_MMX mmxext
268cabdff1aSopenharmony_ciCHROMA_MC4 avg
269cabdff1aSopenharmony_ciCHROMA_MC2 avg
270