1/*
2 * VP8 DSP functions x86-optimized
3 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "libavutil/attributes.h"
24#include "libavutil/cpu.h"
25#include "libavutil/mem_internal.h"
26#include "libavutil/x86/cpu.h"
27#include "libavcodec/vp8dsp.h"
28
29#if HAVE_X86ASM
30
31/*
32 * MC functions
33 */
/*
 * Every motion-compensation entry point declared below has the same
 * signature, so it is written once in VP8_MC_PROTO.  The functions are only
 * declared in this file; their definitions are expected to come from
 * elsewhere (presumably the x86 assembly sources, given the HAVE_X86ASM
 * guard -- confirm against the build).
 */
#define VP8_MC_PROTO(NAME)                              \
    void NAME(uint8_t *dst, ptrdiff_t dststride,        \
              uint8_t *src, ptrdiff_t srcstride,        \
              int height, int mx, int my)

/* epel, width 4, MMXEXT */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_mmxext);

/* epel, width 8, SSE2 */
VP8_MC_PROTO(ff_put_vp8_epel8_h4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_sse2);

/* epel, widths 4 and 8, SSSE3 */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_ssse3);

/* bilinear, horizontal */
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_ssse3);

/* bilinear, vertical */
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_ssse3);

/* pixels8 / pixels16 (installed in the [0][0] table slots further down) */
VP8_MC_PROTO(ff_put_vp8_pixels8_mmx);
VP8_MC_PROTO(ff_put_vp8_pixels16_sse);

#undef VP8_MC_PROTO
118
/*
 * TAP_W16(OPT, FILTERTYPE, TAPTYPE)
 *
 * Emits a static function ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>() built
 * on top of the matching 8-wide routine: the 8-wide function is invoked once
 * at column offset 0 and once at column offset 8 of both dst and src, with
 * every other argument forwarded unchanged.
 */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    int col; \
    for (col = 0; col < 16; col += 8) \
        ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
            dst + col, dststride, src + col, srcstride, height, mx, my); \
}
/*
 * TAP_W8(OPT, FILTERTYPE, TAPTYPE)
 *
 * Emits a static function ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>() that
 * runs the matching 4-wide routine at column offsets 0 and 4 of dst and src,
 * forwarding all remaining arguments unchanged.
 *
 * NOTE(review): no instantiation of this macro is visible in this file.
 */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    int col; \
    for (col = 0; col < 8; col += 4) \
        ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
            dst + col, dststride, src + col, srcstride, height, mx, my); \
}
139
140TAP_W16(sse2,  epel, h6)
141TAP_W16(sse2,  epel, v6)
142TAP_W16(sse2,  bilinear, h)
143TAP_W16(sse2,  bilinear, v)
144
145TAP_W16(ssse3, epel, h6)
146TAP_W16(ssse3, epel, v6)
147TAP_W16(ssse3, bilinear, h)
148TAP_W16(ssse3, bilinear, v)
149
/*
 * HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT)
 *
 * Emits a static two-pass function
 * ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT>():
 *
 *   1. the h<TAPNUMX> routine filters height + TAPNUMY - 1 source rows,
 *      starting TAPNUMY / 2 - 1 rows before src, into a local buffer whose
 *      row stride is SIZE;
 *   2. the v<TAPNUMY> routine reads that buffer, starting TAPNUMY / 2 - 1
 *      rows in, and writes `height` rows to dst.
 *
 * The buffer holds SIZE * (MAXHEIGHT + TAPNUMY - 1) bytes and is declared
 * through LOCAL_ALIGNED with alignment ALIGN, so `height` is assumed not to
 * exceed MAXHEIGHT (nothing here checks it).
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
    const int rows_before = TAPNUMY / 2 - 1; \
    const int rows_extra  = TAPNUMY - 1; \
    \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE, src - srcstride * rows_before, srcstride, \
        height + rows_extra, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmp + SIZE * rows_before, SIZE, \
        height, mx, my); \
}
163
164#define HVTAPMMX(x, y) \
165HVTAP(mmxext, 8, x, y,  4,  8)
166
167HVTAPMMX(4, 4)
168HVTAPMMX(4, 6)
169HVTAPMMX(6, 4)
170HVTAPMMX(6, 6)
171
172#define HVTAPSSE2(x, y, w) \
173HVTAP(sse2,  16, x, y, w, 16) \
174HVTAP(ssse3, 16, x, y, w, 16)
175
176HVTAPSSE2(4, 4, 8)
177HVTAPSSE2(4, 6, 8)
178HVTAPSSE2(6, 4, 8)
179HVTAPSSE2(6, 6, 8)
180HVTAPSSE2(6, 6, 16)
181
182HVTAP(ssse3, 16, 4, 4, 4, 8)
183HVTAP(ssse3, 16, 4, 6, 4, 8)
184HVTAP(ssse3, 16, 6, 4, 4, 8)
185HVTAP(ssse3, 16, 6, 6, 4, 8)
186
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT)
 *
 * Emits a static two-pass function ff_put_vp8_bilinear<SIZE>_hv_<OPT>():
 * the _h_ routine filters height + 1 source rows into a local buffer with
 * row stride SIZE, then the _v_ routine filters that buffer into dst for
 * `height` rows.
 *
 * The buffer holds SIZE * (MAXHEIGHT + 2) bytes and is declared through
 * LOCAL_ALIGNED with alignment ALIGN; `height` is assumed not to exceed
 * MAXHEIGHT (nothing here checks it).
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
    const ptrdiff_t tmpstride = SIZE; \
    const int       rows_h    = height + 1; \
    \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        tmp, tmpstride, src, srcstride, rows_h, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, tmp, tmpstride, height, mx, my); \
}
198
199HVBILIN(mmxext,  8,  4,  8)
200HVBILIN(sse2,  8,  8, 16)
201HVBILIN(sse2,  8, 16, 16)
202HVBILIN(ssse3, 8,  4,  8)
203HVBILIN(ssse3, 8,  8, 16)
204HVBILIN(ssse3, 8, 16, 16)
205
/*
 * Inverse-transform entry points.  They are only declared in this file and
 * are installed into the VP8DSPContext by ff_vp8dsp_init_x86().
 */

/* vp8_idct_add / vp8_luma_dc_wht */
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);

/* vp8_idct_dc_add: one block */
void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
                             ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
                             ptrdiff_t stride);

/* vp8_idct_dc_add4y / vp8_idct_dc_add4uv: several blocks per call */
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
                               ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
                               ptrdiff_t stride);
216
/*
 * DECLARE_LOOP_FILTER(NAME)
 *
 * Declares (without defining) the ten loop-filter functions that carry the
 * suffix NAME.  Three signatures are involved:
 *   - simple:           (dst, stride, flim)
 *   - 16y inner/mbedge: (dst, stride, e, i, hvt)
 *   - 8uv inner/mbedge: (dstU, dstV, stride, e, i, hvt)
 * Each comes in a v_ and an h_ flavour.  The expansion already ends with a
 * semicolon, so the macro is invoked without one.
 */
#define DECLARE_LOOP_FILTER(NAME)                                             \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, ptrdiff_t stride,     \
                                          int flim);                          \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, ptrdiff_t stride,     \
                                          int flim);                          \
                                                                              \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, ptrdiff_t stride,  \
                                             int e, int i, int hvt);          \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, ptrdiff_t stride,  \
                                             int e, int i, int hvt);          \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, ptrdiff_t stride,  \
                                             int e, int i, int hvt);          \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, ptrdiff_t stride,  \
                                             int e, int i, int hvt);          \
                                                                              \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,    \
                                             ptrdiff_t stride,                \
                                             int e, int i, int hvt);          \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,    \
                                             ptrdiff_t stride,                \
                                             int e, int i, int hvt);          \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,    \
                                             ptrdiff_t stride,                \
                                             int e, int i, int hvt);          \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,    \
                                             ptrdiff_t stride,                \
                                             int e, int i, int hvt);
252
253DECLARE_LOOP_FILTER(sse2)
254DECLARE_LOOP_FILTER(ssse3)
255DECLARE_LOOP_FILTER(sse4)
256
257#endif /* HAVE_X86ASM */
258
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Stores the h6, v6 and h6v6 epel functions of width SIZE and instruction
 * set OPT into slots [IDX][0][2], [IDX][2][0] and [IDX][2][2] of
 * c->put_vp8_epel_pixels_tab.  A variable named `c` must be in scope.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and stays correct under an unbraced if/else.  Invoke it
 * with a trailing semicolon.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)
263
/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills every non-[0][0] slot of c->put_vp8_epel_pixels_tab[IDX]: the h4,
 * v4, h4v4, h6v4 and h4v6 entries are stored here, and the h6, v6 and h6v6
 * entries are delegated to VP8_LUMA_MC_FUNC.  A variable named `c` must be
 * in scope.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and stays correct under an unbraced if/else.  Invoke it
 * with a trailing semicolon.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)
271
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills every non-[0][0] slot of c->put_vp8_bilinear_pixels_tab[IDX]:
 *   - [0][1], [0][2]                  -> the _h_ function
 *   - [1][0], [2][0]                  -> the _v_ function
 *   - [1][1], [1][2], [2][1], [2][2]  -> the _hv_ function
 * A variable named `c` must be in scope.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and stays correct under an unbraced if/else.  Invoke it
 * with a trailing semicolon.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)
281
282
283av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
284{
285#if HAVE_X86ASM
286    int cpu_flags = av_get_cpu_flags();
287
288    if (EXTERNAL_MMX(cpu_flags)) {
289        c->put_vp8_epel_pixels_tab[1][0][0]     =
290        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
291    }
292
293    /* note that 4-tap width=16 functions are missing because w=16
294     * is only used for luma, and luma is always a copy or sixtap. */
295    if (EXTERNAL_MMXEXT(cpu_flags)) {
296        VP8_MC_FUNC(2, 4, mmxext);
297        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
298    }
299
300    if (EXTERNAL_SSE(cpu_flags)) {
301        c->put_vp8_epel_pixels_tab[0][0][0]     =
302        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
303    }
304
305    if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
306        VP8_LUMA_MC_FUNC(0, 16, sse2);
307        VP8_MC_FUNC(1, 8, sse2);
308        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
309        VP8_BILINEAR_MC_FUNC(1, 8, sse2);
310    }
311
312    if (EXTERNAL_SSSE3(cpu_flags)) {
313        VP8_LUMA_MC_FUNC(0, 16, ssse3);
314        VP8_MC_FUNC(1, 8, ssse3);
315        VP8_MC_FUNC(2, 4, ssse3);
316        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
317        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
318        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
319    }
320#endif /* HAVE_X86ASM */
321}
322
323av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
324{
325#if HAVE_X86ASM
326    int cpu_flags = av_get_cpu_flags();
327
328    if (EXTERNAL_MMX(cpu_flags)) {
329        c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
330    }
331
332    if (EXTERNAL_SSE(cpu_flags)) {
333        c->vp8_idct_add                         = ff_vp8_idct_add_sse;
334        c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
335    }
336
337    if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
338        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
339
340        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
341        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
342
343        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_sse2;
344        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_sse2;
345    }
346
347    if (EXTERNAL_SSE2(cpu_flags)) {
348        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse2;
349        c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;
350
351        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;
352
353        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
354        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
355
356        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse2;
357        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse2;
358    }
359
360    if (EXTERNAL_SSSE3(cpu_flags)) {
361        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
362        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
363
364        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
365        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
366        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
367        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
368
369        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_ssse3;
370        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_ssse3;
371        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
372        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
373    }
374
375    if (EXTERNAL_SSE4(cpu_flags)) {
376        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse4;
377
378        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
379        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
380        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse4;
381    }
382#endif /* HAVE_X86ASM */
383}
384