1;*****************************************************************************
2;* SSE2-optimized weighted prediction code
3;*****************************************************************************
4;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
28;-----------------------------------------------------------------------------
29; biweight pred:
30;
31; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
32;                               int height, int log2_denom, int weightd,
33;                               int weights, int offset);
34; and
35; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
36;                             int log2_denom, int weight, int offset);
37;-----------------------------------------------------------------------------
38
;-----------------------------------------------------------------------------
; WEIGHT_SETUP
;
; Prepares the constants used by WEIGHT_OP. Register roles follow the
; h264_weight prototype documented at the top of this section:
;   r3d = log2_denom, r4d = weight, r5d = offset
;
; Out: m3 = low word of weight, replicated to every word lane
;      m5 = low word of ((2*offset + 1) << log2_denom) >> 1, replicated to
;           every word lane (offset and rounding term folded together)
;      m6 = log2_denom, later used as the psraw shift count
;      m7 = 0, later used to zero-extend bytes to words
; Clobbers: r5
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    pxor       m7, m7
    movd       m6, r3d
    movd       m3, r4d
    lea        r5d, [r5*2+1]            ; 2*offset + 1; only the low dword is consumed
    movd       m5, r5d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    ; XMM: splat word 0 over the low quadword, then duplicate that quadword
    pshuflw    m5, m5, 0
    pshuflw    m3, m3, 0
    punpcklqdq m5, m5
    punpcklqdq m3, m3
%else
    ; MMX: a single pshufw covers the whole register
    pshufw     m5, m5, 0
    pshufw     m3, m3, 0
%endif
%endmacro
58
;-----------------------------------------------------------------------------
; WEIGHT_OP off_lo, off_hi
;
; Weights two half-register groups of pixels read from [r0+off_lo] and
; [r0+off_hi], using the constants prepared by WEIGHT_SETUP (m3, m5, m6, m7).
;
; Out: m0 = packed unsigned bytes; the group from off_lo ends up in the low
;           half and the group from off_hi in the high half
; Clobbers: m1
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    movh          m1, [r0+%2]
    movh          m0, [r0+%1]
    punpcklbw     m1, m7                ; bytes -> words (m7 = 0)
    punpcklbw     m0, m7
    pmullw        m1, m3                ; pixel * weight
    pmullw        m0, m3
    paddsw        m1, m5                ; + offset/rounding term, signed saturation
    paddsw        m0, m5
    psraw         m1, m6                ; >> log2_denom
    psraw         m0, m6
    packuswb      m0, m1                ; clamp to 0..255 and repack
%endmacro
72
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, num_vector_regs
;
; Emits h264_weight_<width> for the case where one row of pixels fills one
; full vector register. Per the prototype at the top of this section:
;   r0 = dst, r1 = stride, r2d = height, r3d..r5d are consumed by WEIGHT_SETUP
; One row is processed per iteration; the loop runs until r2d reaches zero.
; The row is written back with mova, i.e. an aligned store to [r0].
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP

.nextrow:
    WEIGHT_OP  0, mmsize/2              ; both halves of the current row -> m0
    mova       [r0], m0
    add        r0, r1                   ; advance dst by one stride
    dec        r2d
    jnz        .nextrow
    REP_RET
%endmacro

; 16 pixels per row: one XMM register per row.
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
87
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, num_vector_regs
;
; Emits h264_weight_<width> for the case where one row of pixels fills only
; half a vector register, so two consecutive rows are processed per loop
; iteration. Per the prototype at the top of this section:
;   r0 = dst, r1 = stride, r2d = height
; r3 is reused as 2*stride once WEIGHT_SETUP has copied log2_denom into m6.
; The loop counter is height >> 1 and runs until it reaches zero.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    lea        r3, [r1+r1]              ; r3 = distance between row pairs
    sar        r2d, 1                   ; iterations = height / 2

.nextrow:
    WEIGHT_OP  0, r1                    ; m0 = row 0 (low half) | row 1 (high half)
    movh       [r0], m0                 ; store row 0
%if mmsize == 8
    psrlq      m0, 32                   ; MMX: bring row 1 down to the low dword
    movh       [r0+r1], m0
%else
    movhps     [r0+r1], m0              ; XMM: store the high quadword directly
%endif
    add        r0, r3
    dec        r2d
    jnz        .nextrow
    REP_RET
%endmacro

; 4 pixels per row: two rows per MMX register.
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
; 8 pixels per row: two rows per XMM register.
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
112
;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP
;
; Prepares the constants used by the bidirectional weight kernels. Register
; roles follow the h264_biweight prototype at the top of this section:
;   r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
;
; The rounding/offset term starts as (offset + 1) | 1 and the shift count as
; log2_denom + 1. If either weight equals 128, both weights and the term are
; halved and the shift count drops back to log2_denom.
; NOTE(review): presumably this keeps the weights within the signed-byte range
; required by pmaddubsw in the SSSE3 path -- confirm.
;
; Out (SSSE3):     m4 = interleaved (weightd, weights) byte pair in every word
;                  m0 is clobbered; m7 is left untouched
; Out (otherwise): m3 = weightd in every word lane
;                  m4 = weights in every word lane
;                  m7 = 0
; Out (both):      m5 = low word of (term << shift) >> 1 in every word lane
;                  m6 = shift count for psraw
;
; Clobbers: r4d, r5d, r6d and off_regd (r7d on x86-64, r3d on x86-32).
; Defines the local labels .nonnormal and .normal in the enclosing function.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
    %define off_regd r7d
%else
    %define off_regd r3d
%endif
    mov        off_regd, r7m
    inc        off_regd
    or         off_regd, 1              ; term = (offset + 1) | 1
    inc        r4d                      ; shift = log2_denom + 1
    cmp        r5d, 128
    je         .nonnormal
    cmp        r6d, 128
    jne        .normal
.nonnormal:
    ; A weight of 128 is present: halve weights and term, undo the shift bump.
    sar        r5d, 1
    sar        r6d, 1
    sar        off_regd, 1
    dec        r4d
.normal:
    movd       m6, r4d
    movd       m5, off_regd
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    ; Pack both weights into one word so pmaddubsw applies them in one step.
    movd       m4, r5d
    movd       m0, r6d
    punpcklbw  m4, m0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    movd       m3, r5d
    movd       m4, r6d
  %if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
  %else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
  %endif
    pxor       m7, m7                   ; zero register for byte -> word unpacking
%endif
%endmacro
167
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, off
;
; Loads half a register of pixels from [r0+off] and from [r1+off], widens
; them to words and forms the weighted sum
;   m<acc> = [r0+off] * m3 + [r1+off] * m4   (signed saturating add)
; Expects m3/m4/m7 as prepared by the non-SSSE3 path of BIWEIGHT_SETUP.
; Clobbers: m<tmp>
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    movh       m%2, [r1+%3]
    movh       m%1, [r0+%3]
    punpcklbw  m%2, m7
    punpcklbw  m%1, m7
    pmullw     m%2, m4
    pmullw     m%1, m3
    paddsw     m%1, m%2
%endmacro
177
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB
;
; Finishes two weighted sums held in m0 and m1: adds the rounding/offset term
; (m5), shifts right arithmetically by m6 and packs both registers into
; unsigned bytes in m0 (m0 -> low half, m1 -> high half).
; Clobbers: m1
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw     m1, m5
    paddsw     m0, m5
    psraw      m1, m6
    psraw      m0, m6
    packuswb   m0, m1
%endmacro
185
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, num_vector_regs
;
; Emits h264_biweight_<width> for the case where one row of pixels fills one
; full vector register. Per the prototype at the top of this section:
;   r0 = dst, r1 = src, r2 = stride, r3d = height
; The remaining arguments are consumed by BIWEIGHT_SETUP. On x86-32 that macro
; uses r3d as scratch, which is why height is (re)loaded from r3m afterwards.
; One row is processed per iteration; the result is written to dst with mova,
; i.e. an aligned store.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn  r3d, r3m                 ; r3d = height

.nextrow:
    BIWEIGHT_STEPA 0, 1, 0              ; m0 = weighted sum, first half of the row
    BIWEIGHT_STEPA 1, 2, mmsize/2       ; m1 = weighted sum, second half of the row
    BIWEIGHT_STEPB                      ; round, shift, pack -> m0
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz        .nextrow
    REP_RET
%endmacro

; 16 pixels per row: one XMM register per row.
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
204
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, num_vector_regs
;
; Emits h264_biweight_<width> for the case where one row of pixels fills only
; half a vector register, so two consecutive rows are processed per loop
; iteration. Per the prototype at the top of this section:
;   r0 = dst, r1 = src, r2 = stride, r3d = height
; The remaining arguments are consumed by BIWEIGHT_SETUP. r4 is reused as
; 2*stride once that macro has copied the shift count into m6.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn  r3d, r3m                 ; r3d = height
    ; Halve the row count with a 32-bit shift. height is an int, so on x86-64
    ; the upper half of r3 is not guaranteed to be zero; a 64-bit sar would
    ; shift bit 32 into the 32-bit counter tested by "dec r3d" below.
    sar        r3d, 1
    lea        r4, [r2*2]               ; r4 = distance between row pairs
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0              ; m0 = weighted sum of row 0
    BIWEIGHT_STEPA 1, 2, r2             ; m1 = weighted sum of row 1
    BIWEIGHT_STEPB                      ; m0 = row 0 (low half) | row 1 (high half)
    movh       [r0], m0                 ; store row 0
%if mmsize == 16
    movhps     [r0+r2], m0              ; XMM: store the high quadword as row 1
%else
    psrlq      m0, 32                   ; MMX: bring row 1 down to the low dword
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

; 4 pixels per row: two rows per MMX register.
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
; 8 pixels per row: two rows per XMM register.
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
233
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
;
; Expects m0 and m2 to hold byte-interleaved pixel pairs taken from [r0] and
; [r1], and m4/m5/m6 as prepared by the SSSE3 path of BIWEIGHT_SETUP.
; pmaddubsw multiplies each pixel pair by the weight pair in m4 and sums the
; two products into one word; the rounding/offset term (m5) is then added, the
; result is shifted right by m6 and both registers are packed into unsigned
; bytes in m0 (m0 -> low half, m2 -> high half).
; Clobbers: m2
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m2, m4
    pmaddubsw  m0, m4
    paddsw     m2, m5
    paddsw     m0, m5
    psraw      m2, m6
    psraw      m0, m6
    packuswb   m0, m2
%endmacro
243
;-----------------------------------------------------------------------------
; h264_biweight_16, SSSE3 version
;
; Per the prototype at the top of this section:
;   r0 = dst, r1 = src, r2 = stride, r3d = height
; The remaining arguments are consumed by BIWEIGHT_SETUP.
; One 16-pixel row is processed per iteration. Both "punpcklbw m0, [r1]" and
; "mova [r0], m0" are aligned 16-byte memory accesses, so dst and src rows
; must be 16-byte aligned.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn  r3d, r3m                 ; r3d = height

.nextrow:
    movh       m3, [r1+8]               ; src pixels 8..15
    movh       m2, [r0+8]               ; dst pixels 8..15
    movh       m0, [r0]                 ; dst pixels 0..7
    punpcklbw  m2, m3                   ; interleave dst/src, pixels 8..15
    punpcklbw  m0, [r1]                 ; interleave dst/src, pixels 0..7
    BIWEIGHT_SSSE3_OP                   ; m0 = 16 weighted pixels
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz        .nextrow
    REP_RET
262
;-----------------------------------------------------------------------------
; h264_biweight_8, SSSE3 version
;
; Per the prototype at the top of this section:
;   r0 = dst, r1 = src, r2 = stride, r3d = height
; The remaining arguments are consumed by BIWEIGHT_SETUP. r4 is reused as
; 2*stride once that macro has copied the shift count into m6.
; Two 8-pixel rows are processed per iteration.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; r3d = height
    ; Halve the row count with a 32-bit shift. height is an int, so on x86-64
    ; the upper half of r3 is not guaranteed to be zero; a 64-bit sar would
    ; shift bit 32 into the 32-bit counter tested by "dec r3d" below.
    sar        r3d, 1
    lea        r4, [r2*2]               ; r4 = distance between row pairs

.nextrow:
    movh       m0, [r0]                 ; dst row 0
    movh       m1, [r1]                 ; src row 0
    movh       m2, [r0+r2]              ; dst row 1
    movh       m3, [r1+r2]              ; src row 1
    punpcklbw  m0, m1                   ; interleave dst/src, row 0
    punpcklbw  m2, m3                   ; interleave dst/src, row 1
    BIWEIGHT_SSSE3_OP                   ; m0 = row 0 (low half) | row 1 (high half)
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
285