;*****************************************************************************
;* MMXEXT/SSE2/SSSE3-optimized H.264 weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ciSECTION .text
27cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; weighted and bi-weighted prediction:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                             int log2_denom, int weight, int offset);
;
; NOTE(review): the code below uses the stride register at full width
; (e.g. "add r0, r1", "lea r4, [r2*2]") without sign-extending it, so on
; x86-64 the caller has to supply a pointer-sized stride. The "int stride"
; shown above may be stale -- confirm against the C prototypes.
;-----------------------------------------------------------------------------
38cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; WEIGHT_SETUP
;
; Loads the constants shared by every h264_weight_* row loop.
;
; In (argument registers of h264_weight_*, see the prototype comment above):
;   r3d = log2_denom, r4d = weight, r5d = offset
; Out:
;   m3 = low word of weight, replicated into every 16-bit lane
;   m5 = low word of (((2*offset + 1) << log2_denom) >> 1), replicated into
;        every 16-bit lane (combined offset + rounding term)
;   m6 = log2_denom in the low dword (shift count for pslld/psraw)
;   m7 = 0 (used to zero-extend bytes to words)
; Clobbers: r5
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    pxor       m7, m7
    movd       m6, r3d
    movd       m3, r4d
    lea       r5d, [r5*2+1]             ; 2*offset + 1; only the low dword is used
    movd       m5, r5d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    punpcklqdq m3, m3
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
%endmacro
58cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; WEIGHT_OP off_a, off_b
;
; Weights two runs of pixels loaded with movh from [r0+%1] and [r0+%2] and
; leaves them packed as unsigned bytes in m0 (%1 run in the low half, %2 run
; in the high half).
;
; Per pixel: the byte is zero-extended to a word, multiplied by m3 (low 16
; bits kept), m5 is added with signed saturation, the sum is shifted right
; arithmetically by m6, and packuswb saturates the result to 0..255.
;
; Expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP. Clobbers m0, m1.
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    ; first run -> m0
    movh          m0, [r0+%1]
    punpcklbw     m0, m7
    pmullw        m0, m3
    paddsw        m0, m5
    psraw         m0, m6
    ; second run -> m1
    movh          m1, [r0+%2]
    punpcklbw     m1, m7
    pmullw        m1, m3
    paddsw        m1, m5
    psraw         m1, m6
    ; words -> unsigned bytes, both runs in m0
    packuswb      m0, m1
%endmacro
72cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, vec_regs
;
; Emits h264_weight_<width>: weights a block in place, one row of mmsize
; pixels per loop iteration.
;   %1 = block width, used only to form the function name
;   %2 = number of vector registers declared to cglobal
;
; Register roles inside the generated function:
;   r0  = dst, advanced by r1 after every row
;   r1  = stride (used at full register width)
;   r2d = remaining rows
;
; The counter is only tested after a row has been processed, so the function
; relies on height being non-zero. The row is stored with mova, so dst is
; presumably expected to be mmsize-aligned -- TODO confirm against callers.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.row_loop:
    WEIGHT_OP 0, mmsize/2               ; both halves of the current row
    mova     [r0], m0
    add        r0, r1
    sub       r2d, 1
    jnz .row_loop
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
87cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, vec_regs
;
; Emits h264_weight_<width> for blocks whose rows are only mmsize/2 pixels
; wide: two rows are weighted per loop iteration and share one vector.
;   %1 = block width, used only to form the function name
;   %2 = number of vector registers declared to cglobal
;
; Register roles inside the generated function:
;   r0  = dst, advanced by two rows per iteration
;   r1  = stride (used at full register width)
;   r2d = height, halved up front to count row pairs
;   r3  = 2*stride (log2_denom has already been copied to m6 by WEIGHT_SETUP,
;         so the register is free to reuse)
;
; The counter is only tested after a pair has been processed, so height must
; be at least 2; an odd trailing row would not be processed.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    lea        r3, [r1*2]
    sar       r2d, 1
.row_pair:
    WEIGHT_OP 0, r1                     ; low half = row 0, high half = row 1
    movh     [r0], m0
%if mmsize == 16
    movhps   [r0+r1], m0
%else
    psrlq      m0, 32                   ; bring row 1 down to the low dword
    movh     [r0+r1], m0
%endif
    add        r0, r3
    sub       r2d, 1
    jnz .row_pair
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
112cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP
;
; Loads the constants shared by every h264_biweight_* row loop.
;
; In (argument registers of h264_biweight_*, see the prototype comment above):
;   r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
;
; Steps:
;   off   = (offset + 1) | 1
;   shift = log2_denom + 1
;   if weightd == 128 or weights == 128, both weights and off are halved
;   (arithmetic shift) and shift goes back to log2_denom.
;   NOTE(review): presumably because 128 does not fit the signed-byte weights
;   that the SSSE3 path feeds to pmaddubsw -- confirm.
;
; Out:
;   m5 = low word of ((off << shift) >> 1), replicated into every 16-bit lane
;   m6 = shift in the low dword
;   SSSE3:     m4 = byte pairs (weightd, weights) replicated across the
;                   register; m0 is clobbered; m3 and m7 are NOT set
;   otherwise: m3 = weightd words, m4 = weights words, m7 = 0
;
; Clobbers r4d, r5d, r6d and off_regd. off_regd is r7d on x86-64 and r3d on
; x86-32, so on x86-32 the height held in r3 is destroyed and the caller has
; to reload it afterwards.
; Defines the local labels .halve and .scaled in the enclosing function.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1
    add       r4d, 1
    cmp       r6d, 128
    je .halve
    cmp       r5d, 128
    jne .scaled
.halve:
    sar       r5d, 1
    sar       r6d, 1
    sar  off_regd, 1
    sub       r4d, 1
.scaled:
    ; offset/rounding term and shift count, common to both code paths
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    ; interleave the two weights as bytes for pmaddubsw
    movd       m4, r5d
    movd       m0, r6d
    punpcklbw  m4, m0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    ; one word vector per weight for pmullw
    movd       m3, r5d
    movd       m4, r6d
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro
167cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, off
;
; Computes the weighted sum of one run of pixels from both sources:
;   m%1 = [r0+%3] * m3  +  [r1+%3] * m4
; Bytes are zero-extended to words via m7, each product keeps its low 16 bits
; and the two products are added with signed saturation.
;   %1 = index of the vector register that receives the sum
;   %2 = index of a scratch vector register (clobbered)
;   %3 = byte offset (immediate or register) applied to both r0 and r1
; Expects m3, m4 and m7 from the non-SSSE3 path of BIWEIGHT_SETUP.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    ; r0 contribution, scaled by m3
    movh       m%1, [r0+%3]
    punpcklbw  m%1, m7
    pmullw     m%1, m3
    ; r1 contribution, scaled by m4
    movh       m%2, [r1+%3]
    punpcklbw  m%2, m7
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro
177cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB
;
; Finishes the two word vectors in m0 and m1: adds m5 with signed saturation,
; shifts right arithmetically by m6, then packs both into m0 as unsigned
; saturated bytes (m0 in the low half, m1 in the high half).
; Clobbers m1.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    psraw      m0, m6
    paddsw     m1, m5
    psraw      m1, m6
    packuswb   m0, m1
%endmacro
185cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, vec_regs
;
; Emits h264_biweight_<width>: blends the rows at r0 and r1 and writes the
; result back to r0, one row of mmsize pixels per loop iteration.
;   %1 = block width, used only to form the function name
;   %2 = number of vector registers declared to cglobal
;
; Register roles inside the generated function:
;   r0  = dst (read and written), r1 = src, both advanced by r2 per row
;   r2  = stride (used at full register width)
;   r3d = remaining rows
;
; BIWEIGHT_SETUP may use r3d as scratch (x86-32), so height is reloaded from
; its argument slot afterwards; movifnidn emits nothing when r3m already
; names r3d.
; The counter is only tested after a row has been processed, so height must
; be non-zero. The row is stored with mova, so dst is presumably expected to
; be mmsize-aligned -- TODO confirm against callers.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
.row_loop:
    BIWEIGHT_STEPA 0, 1, 0              ; first half of the row  -> m0
    BIWEIGHT_STEPA 1, 2, mmsize/2       ; second half of the row -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r1, r2
    add        r0, r2
    sub       r3d, 1
    jnz .row_loop
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
204cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, vec_regs
;
; Emits h264_biweight_<width> for blocks whose rows are only mmsize/2 pixels
; wide: two rows are blended per loop iteration and share one vector.
;   %1 = block width, used only to form the function name
;   %2 = number of vector registers declared to cglobal
;
; Register roles inside the generated function:
;   r0  = dst (read and written), r1 = src, both advanced by two rows
;   r2  = stride (used at full register width)
;   r3d = height, halved up front to count row pairs
;   r4  = 2*stride (log2_denom has already been consumed by BIWEIGHT_SETUP)
;
; BIWEIGHT_SETUP may use r3d as scratch (x86-32), so height is reloaded from
; its argument slot afterwards; movifnidn emits nothing when r3m already
; names r3d.
; The counter is only tested after a pair has been processed, so height must
; be at least 2; an odd trailing row would not be processed.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    ; height is a 32-bit int: shift only r3d. A 64-bit "sar r3, 1" would pull
    ; bit 32 (undefined for an int argument on x86-64) into bit 31 of the
    ; r3d counter tested below.
    sar       r3d, 1
    lea        r4, [r2*2]
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0              ; row 0 -> m0
    BIWEIGHT_STEPA 1, 2, r2             ; row 1 -> m1
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0
%else
    psrlq      m0, 32                   ; bring row 1 down to the low dword
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
233cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
;
; Blends two vectors of byte pairs held in m0 and m2. pmaddubsw multiplies
; each pair of unsigned bytes by the matching pair of signed bytes in m4 and
; adds the two products into one word (signed saturation). m5 is then added
; with signed saturation, the sum is shifted right arithmetically by m6, and
; both vectors are packed into m0 as unsigned saturated bytes (m0 in the low
; half, m2 in the high half).
; Expects m4/m5/m6 from the SSSE3 path of BIWEIGHT_SETUP. Clobbers m2.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    paddsw     m0, m5
    psraw      m0, m6
    pmaddubsw  m2, m4
    paddsw     m2, m5
    psraw      m2, m6
    packuswb   m0, m2
%endmacro
243cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; h264_biweight_16 (SSSE3)
;
; Blends a 16-pixel-wide block at r0 with the block at r1 and writes the
; result back to r0, one row per loop iteration.
;
; Register roles:
;   r0  = dst (read and written), r1 = src, both advanced by r2 per row
;   r2  = stride (used at full register width)
;   r3d = remaining rows (reloaded after BIWEIGHT_SETUP, which may use r3d as
;         scratch on x86-32)
;   m0  = bytes 0..7 of dst interleaved with bytes 0..7 of src
;   m2  = bytes 8..15 of dst interleaved with bytes 8..15 of src
;
; "punpcklbw m0, [r1]" uses a 16-byte memory operand, so src has to be
; 16-byte aligned; the mova store needs the same of dst.
; The counter is only tested after a row has been processed, so height must
; be non-zero.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m

.row_loop:
    ; right half of the row
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m2, m3
    ; left half of the row
    movh       m0, [r0]
    punpcklbw  m0, [r1]
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r1, r2
    add        r0, r2
    sub       r3d, 1
    jnz .row_loop
    REP_RET
262cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; h264_biweight_8 (SSSE3)
;
; Blends an 8-pixel-wide block at r0 with the block at r1 and writes the
; result back to r0, two rows per loop iteration.
;
; Register roles:
;   r0  = dst (read and written), r1 = src, both advanced by two rows
;   r2  = stride (used at full register width)
;   r3d = height, halved up front to count row pairs (reloaded after
;         BIWEIGHT_SETUP, which may use r3d as scratch on x86-32)
;   r4  = 2*stride (log2_denom has already been consumed by BIWEIGHT_SETUP)
;   m0  = row 0 of dst interleaved with row 0 of src
;   m2  = row 1 of dst interleaved with row 1 of src
;
; The counter is only tested after a pair has been processed, so height must
; be at least 2; an odd trailing row would not be processed.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    ; height is a 32-bit int: shift only r3d. A 64-bit "sar r3, 1" would pull
    ; bit 32 (undefined for an int argument on x86-64) into bit 31 of the
    ; r3d counter tested below.
    sar       r3d, 1
    lea        r4, [r2*2]

.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0                 ; row 0 from the low half
    movhps     [r0+r2], m0              ; row 1 from the high half
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
285