1/* 2 * VP8 DSP functions x86-optimized 3 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> 4 * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23#include "libavutil/attributes.h" 24#include "libavutil/cpu.h" 25#include "libavutil/mem_internal.h" 26#include "libavutil/x86/cpu.h" 27#include "libavcodec/vp8dsp.h" 28 29#if HAVE_X86ASM 30 31/* 32 * MC functions 33 */ 34void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, 35 uint8_t *src, ptrdiff_t srcstride, 36 int height, int mx, int my); 37void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, 38 uint8_t *src, ptrdiff_t srcstride, 39 int height, int mx, int my); 40void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, 41 uint8_t *src, ptrdiff_t srcstride, 42 int height, int mx, int my); 43void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, 44 uint8_t *src, ptrdiff_t srcstride, 45 int height, int mx, int my); 46 47void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, 48 uint8_t *src, ptrdiff_t srcstride, 49 int height, int mx, int my); 50void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride, 51 uint8_t *src, ptrdiff_t srcstride, 52 int height, int mx, int my); 53void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride, 54 uint8_t *src, ptrdiff_t srcstride, 55 int height, int mx, int my); 56void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride, 57 uint8_t *src, ptrdiff_t srcstride, 58 int height, int mx, int my); 59 60void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, 61 uint8_t *src, ptrdiff_t srcstride, 62 int height, int mx, int my); 63void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, 64 uint8_t *src, ptrdiff_t srcstride, 65 int height, int mx, int my); 66void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, 67 uint8_t *src, ptrdiff_t srcstride, 68 int height, int mx, int my); 69void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, 70 uint8_t *src, ptrdiff_t srcstride, 71 int height, int mx, int my); 72void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, 73 uint8_t *src, ptrdiff_t srcstride, 74 int height, int mx, int my); 75void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, 76 uint8_t *src, ptrdiff_t srcstride, 77 int height, int mx, int my); 78void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, 79 uint8_t *src, ptrdiff_t srcstride, 80 int height, int mx, int my); 81void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, 82 uint8_t *src, ptrdiff_t srcstride, 83 int height, int mx, int my); 84 85void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, 86 uint8_t *src, ptrdiff_t srcstride, 87 int height, int mx, int my); 88void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, 89 uint8_t *src, ptrdiff_t srcstride, 90 int height, int mx, int my); 91void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, 92 uint8_t *src, ptrdiff_t srcstride, 93 int height, int mx, int my); 94void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, 95 uint8_t *src, ptrdiff_t srcstride, 96 int height, int mx, int my); 97 98void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, 99 uint8_t *src, ptrdiff_t srcstride, 100 int height, int mx, int my); 101void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, 102 uint8_t *src, ptrdiff_t srcstride, 103 int height, int mx, int my); 104void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, 105 uint8_t *src, ptrdiff_t srcstride, 106 int height, int mx, int my); 107void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, 108 uint8_t *src, ptrdiff_t srcstride, 109 int height, int mx, int my); 110 111 112void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, 113 uint8_t *src, ptrdiff_t srcstride, 114 int height, int mx, int my); 115void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, 116 uint8_t *src, ptrdiff_t srcstride, 117 int height, int mx, int my); 118 119#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \ 120static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \ 121 uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ 122 ptrdiff_t srcstride, int height, int mx, int my) \ 123{ \ 124 ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ 125 dst, dststride, src, srcstride, height, mx, my); \ 126 ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ 127 dst + 8, dststride, src + 8, srcstride, height, mx, my); \ 128} 129#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \ 130static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ 131 uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ 132 ptrdiff_t srcstride, int height, int mx, int my) \ 133{ \ 134 ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ 135 dst, dststride, src, srcstride, height, mx, my); \ 136 ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ 137 dst + 4, dststride, src + 4, srcstride, height, mx, my); \ 138} 139 140TAP_W16(sse2, epel, h6) 141TAP_W16(sse2, epel, v6) 142TAP_W16(sse2, bilinear, h) 143TAP_W16(sse2, bilinear, v) 144 145TAP_W16(ssse3, epel, h6) 146TAP_W16(ssse3, epel, v6) 147TAP_W16(ssse3, bilinear, h) 148TAP_W16(ssse3, bilinear, v) 149 150#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ 151static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ 152 uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ 153 ptrdiff_t srcstride, int height, int mx, int my) \ 154{ \ 155 LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \ 156 uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \ 157 src -= srcstride * (TAPNUMY / 2 - 1); \ 158 ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \ 159 tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \ 160 ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \ 161 dst, dststride, tmpptr, SIZE, height, mx, my); \ 162} 163 164#define HVTAPMMX(x, y) \ 165HVTAP(mmxext, 8, x, y, 4, 8) 166 167HVTAPMMX(4, 4) 168HVTAPMMX(4, 6) 169HVTAPMMX(6, 4) 170HVTAPMMX(6, 6) 171 172#define HVTAPSSE2(x, y, w) \ 173HVTAP(sse2, 16, x, y, w, 16) \ 174HVTAP(ssse3, 16, x, y, w, 16) 175 176HVTAPSSE2(4, 4, 8) 177HVTAPSSE2(4, 6, 8) 178HVTAPSSE2(6, 4, 8) 179HVTAPSSE2(6, 6, 8) 180HVTAPSSE2(6, 6, 16) 181 182HVTAP(ssse3, 16, 4, 4, 4, 8) 183HVTAP(ssse3, 16, 4, 6, 4, 8) 184HVTAP(ssse3, 16, 6, 4, 4, 8) 185HVTAP(ssse3, 16, 6, 6, 4, 8) 186 187#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \ 188static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ 189 uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ 190 ptrdiff_t srcstride, int height, int mx, int my) \ 191{ \ 192 LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \ 193 ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \ 194 tmp, SIZE, src, srcstride, height + 1, mx, my); \ 195 ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \ 196 dst, dststride, tmp, SIZE, height, mx, my); \ 197} 198 199HVBILIN(mmxext, 8, 4, 8) 200HVBILIN(sse2, 8, 8, 16) 201HVBILIN(sse2, 8, 16, 16) 202HVBILIN(ssse3, 8, 4, 8) 203HVBILIN(ssse3, 8, 8, 16) 204HVBILIN(ssse3, 8, 16, 16) 205 206void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16], 207 ptrdiff_t stride); 208void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], 209 ptrdiff_t stride); 210void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], 211 ptrdiff_t stride); 212void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16], 213 ptrdiff_t stride); 214void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]); 215void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride); 216 217#define DECLARE_LOOP_FILTER(NAME) \ 218void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \ 219 ptrdiff_t stride, \ 220 int flim); \ 221void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \ 222 ptrdiff_t stride, \ 223 int flim); \ 224void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ 225 ptrdiff_t stride, \ 226 int e, int i, int hvt); \ 227void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ 228 ptrdiff_t stride, \ 229 int e, int i, int hvt); \ 230void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ 231 uint8_t *dstV, \ 232 ptrdiff_t s, \ 233 int e, int i, int hvt); \ 234void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ 235 uint8_t *dstV, \ 236 ptrdiff_t s, \ 237 int e, int i, int hvt); \ 238void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ 239 ptrdiff_t stride, \ 240 int e, int i, int hvt); \ 241void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ 242 ptrdiff_t stride, \ 243 int e, int i, int hvt); \ 244void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ 245 uint8_t *dstV, \ 246 ptrdiff_t s, \ 247 int e, int i, int hvt); \ 248void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ 249 uint8_t *dstV, \ 250 ptrdiff_t s, \ 251 int e, int i, int hvt); 252 253DECLARE_LOOP_FILTER(sse2) 254DECLARE_LOOP_FILTER(ssse3) 255DECLARE_LOOP_FILTER(sse4) 256 257#endif /* HAVE_X86ASM */ 258 259#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ 260 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ 261 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ 262 c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT 263 264#define VP8_MC_FUNC(IDX, SIZE, OPT) \ 265 c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \ 266 c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \ 267 c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \ 268 c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \ 269 c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \ 270 VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) 271 272#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \ 273 c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ 274 c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ 275 c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \ 276 c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ 277 c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ 278 c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \ 279 c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ 280 c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT 281 282 283av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) 284{ 285#if HAVE_X86ASM 286 int cpu_flags = av_get_cpu_flags(); 287 288 if (EXTERNAL_MMX(cpu_flags)) { 289 c->put_vp8_epel_pixels_tab[1][0][0] = 290 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; 291 } 292 293 /* note that 4-tap width=16 functions are missing because w=16 294 * is only used for luma, and luma is always a copy or sixtap. */ 295 if (EXTERNAL_MMXEXT(cpu_flags)) { 296 VP8_MC_FUNC(2, 4, mmxext); 297 VP8_BILINEAR_MC_FUNC(2, 4, mmxext); 298 } 299 300 if (EXTERNAL_SSE(cpu_flags)) { 301 c->put_vp8_epel_pixels_tab[0][0][0] = 302 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; 303 } 304 305 if (EXTERNAL_SSE2_SLOW(cpu_flags)) { 306 VP8_LUMA_MC_FUNC(0, 16, sse2); 307 VP8_MC_FUNC(1, 8, sse2); 308 VP8_BILINEAR_MC_FUNC(0, 16, sse2); 309 VP8_BILINEAR_MC_FUNC(1, 8, sse2); 310 } 311 312 if (EXTERNAL_SSSE3(cpu_flags)) { 313 VP8_LUMA_MC_FUNC(0, 16, ssse3); 314 VP8_MC_FUNC(1, 8, ssse3); 315 VP8_MC_FUNC(2, 4, ssse3); 316 VP8_BILINEAR_MC_FUNC(0, 16, ssse3); 317 VP8_BILINEAR_MC_FUNC(1, 8, ssse3); 318 VP8_BILINEAR_MC_FUNC(2, 4, ssse3); 319 } 320#endif /* HAVE_X86ASM */ 321} 322 323av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) 324{ 325#if HAVE_X86ASM 326 int cpu_flags = av_get_cpu_flags(); 327 328 if (EXTERNAL_MMX(cpu_flags)) { 329 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; 330 } 331 332 if (EXTERNAL_SSE(cpu_flags)) { 333 c->vp8_idct_add = ff_vp8_idct_add_sse; 334 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; 335 } 336 337 if (EXTERNAL_SSE2_SLOW(cpu_flags)) { 338 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; 339 340 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; 341 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; 342 343 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; 344 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; 345 } 346 347 if (EXTERNAL_SSE2(cpu_flags)) { 348 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2; 349 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; 350 351 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; 352 353 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; 354 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; 355 356 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; 357 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; 358 } 359 360 if (EXTERNAL_SSSE3(cpu_flags)) { 361 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; 362 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; 363 364 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; 365 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; 366 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; 367 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3; 368 369 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3; 370 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; 371 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; 372 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; 373 } 374 375 if (EXTERNAL_SSE4(cpu_flags)) { 376 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; 377 378 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; 379 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; 380 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; 381 } 382#endif /* HAVE_X86ASM */ 383} 384