1/* 2 * VP9 SIMD optimizations 3 * 4 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23#include "libavutil/attributes.h" 24#include "libavutil/cpu.h" 25#include "libavutil/x86/cpu.h" 26#include "libavcodec/vp9dsp.h" 27#include "libavcodec/x86/vp9dsp_init.h" 28 29#if HAVE_X86ASM 30 31decl_fpel_func(put, 4, , mmx); 32decl_fpel_func(put, 8, , mmx); 33decl_fpel_func(put, 16, , sse); 34decl_fpel_func(put, 32, , sse); 35decl_fpel_func(put, 64, , sse); 36decl_fpel_func(avg, 4, _8, mmxext); 37decl_fpel_func(avg, 8, _8, mmxext); 38decl_fpel_func(avg, 16, _8, sse2); 39decl_fpel_func(avg, 32, _8, sse2); 40decl_fpel_func(avg, 64, _8, sse2); 41decl_fpel_func(put, 32, , avx); 42decl_fpel_func(put, 64, , avx); 43decl_fpel_func(avg, 32, _8, avx2); 44decl_fpel_func(avg, 64, _8, avx2); 45 46decl_mc_funcs(4, mmxext, int16_t, 8, 8); 47decl_mc_funcs(8, sse2, int16_t, 8, 8); 48decl_mc_funcs(4, ssse3, int8_t, 32, 8); 49decl_mc_funcs(8, ssse3, int8_t, 32, 8); 50#if ARCH_X86_64 51decl_mc_funcs(16, ssse3, int8_t, 32, 8); 52decl_mc_funcs(32, avx2, int8_t, 32, 8); 53#endif 54 55mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) 56#if ARCH_X86_32 57mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8) 58#endif 59mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8) 60mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8) 61mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8) 62mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8) 63#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 64mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) 65#endif 66 67extern const int8_t ff_filters_ssse3[3][15][4][32]; 68extern const int16_t ff_filters_sse2[3][15][8][8]; 69 70filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) 71filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) 72filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) 73filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) 74#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 75filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) 76filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) 77filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) 78filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) 79#endif 80 81filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) 82filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) 83filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) 84filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) 85#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 86filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) 87filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) 88filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3) 89filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3) 90#endif 91 92#define itxfm_func(typea, typeb, size, opt) \ 93void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ 94 int16_t *block, int eob) 95#define itxfm_funcs(size, opt) \ 96itxfm_func(idct, idct, size, opt); \ 97itxfm_func(iadst, idct, size, opt); \ 98itxfm_func(idct, iadst, size, opt); \ 99itxfm_func(iadst, iadst, size, opt) 100 101itxfm_func(idct, idct, 4, mmxext); 102itxfm_func(idct, iadst, 4, sse2); 103itxfm_func(iadst, idct, 4, sse2); 104itxfm_func(iadst, iadst, 4, sse2); 105itxfm_funcs(4, ssse3); 106itxfm_funcs(8, sse2); 107itxfm_funcs(8, ssse3); 108itxfm_funcs(8, avx); 109itxfm_funcs(16, sse2); 110itxfm_funcs(16, ssse3); 111itxfm_funcs(16, avx); 112itxfm_func(idct, idct, 32, sse2); 113itxfm_func(idct, idct, 32, ssse3); 114itxfm_func(idct, idct, 32, avx); 115itxfm_func(iwht, iwht, 4, mmx); 116itxfm_funcs(16, avx2); 117itxfm_func(idct, idct, 32, avx2); 118 119#undef itxfm_func 120#undef itxfm_funcs 121 122#define lpf_funcs(size1, size2, opt) \ 123void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ 124 int E, int I, int H); \ 125void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ 126 int E, int I, int H) 127 128lpf_funcs(4, 8, mmxext); 129lpf_funcs(8, 8, mmxext); 130lpf_funcs(16, 16, sse2); 131lpf_funcs(16, 16, ssse3); 132lpf_funcs(16, 16, avx); 133lpf_funcs(44, 16, sse2); 134lpf_funcs(44, 16, ssse3); 135lpf_funcs(44, 16, avx); 136lpf_funcs(84, 16, sse2); 137lpf_funcs(84, 16, ssse3); 138lpf_funcs(84, 16, avx); 139lpf_funcs(48, 16, sse2); 140lpf_funcs(48, 16, ssse3); 141lpf_funcs(48, 16, avx); 142lpf_funcs(88, 16, sse2); 143lpf_funcs(88, 16, ssse3); 144lpf_funcs(88, 16, avx); 145 146#undef lpf_funcs 147 148#define ipred_func(size, type, opt) \ 149void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ 150 const uint8_t *l, const uint8_t *a) 151 152ipred_func(8, v, mmx); 153 154#define ipred_dc_funcs(size, opt) \ 155ipred_func(size, dc, opt); \ 156ipred_func(size, dc_left, opt); \ 157ipred_func(size, dc_top, opt) 158 159ipred_dc_funcs(4, mmxext); 160ipred_dc_funcs(8, mmxext); 161 162#define ipred_dir_tm_funcs(size, opt) \ 163ipred_func(size, tm, opt); \ 164ipred_func(size, dl, opt); \ 165ipred_func(size, dr, opt); \ 166ipred_func(size, hd, opt); \ 167ipred_func(size, hu, opt); \ 168ipred_func(size, vl, opt); \ 169ipred_func(size, vr, opt) 170 171ipred_dir_tm_funcs(4, mmxext); 172 173ipred_func(16, v, sse); 174ipred_func(32, v, sse); 175 176ipred_dc_funcs(16, sse2); 177ipred_dc_funcs(32, sse2); 178 179#define ipred_dir_tm_h_funcs(size, opt) \ 180ipred_dir_tm_funcs(size, opt); \ 181ipred_func(size, h, opt) 182 183ipred_dir_tm_h_funcs(8, sse2); 184ipred_dir_tm_h_funcs(16, sse2); 185ipred_dir_tm_h_funcs(32, sse2); 186 187ipred_func(4, h, sse2); 188 189#define ipred_all_funcs(size, opt) \ 190ipred_dc_funcs(size, opt); \ 191ipred_dir_tm_h_funcs(size, opt) 192 193// FIXME hd/vl_4x4_ssse3 does not exist 194ipred_all_funcs(4, ssse3); 195ipred_all_funcs(8, ssse3); 196ipred_all_funcs(16, ssse3); 197ipred_all_funcs(32, ssse3); 198 199ipred_dir_tm_h_funcs(8, avx); 200ipred_dir_tm_h_funcs(16, avx); 201ipred_dir_tm_h_funcs(32, avx); 202 203ipred_func(32, v, avx); 204 205ipred_dc_funcs(32, avx2); 206ipred_func(32, h, avx2); 207ipred_func(32, tm, avx2); 208 209#undef ipred_func 210#undef ipred_dir_tm_h_funcs 211#undef ipred_dir_tm_funcs 212#undef ipred_dc_funcs 213 214#endif /* HAVE_X86ASM */ 215 216av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) 217{ 218#if HAVE_X86ASM 219 int cpu_flags; 220 221 if (bpp == 10) { 222 ff_vp9dsp_init_10bpp_x86(dsp, bitexact); 223 return; 224 } else if (bpp == 12) { 225 ff_vp9dsp_init_12bpp_x86(dsp, bitexact); 226 return; 227 } 228 229 cpu_flags = av_get_cpu_flags(); 230 231#define init_lpf(opt) do { \ 232 dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ 233 dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ 234 dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ 235 dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ 236 dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ 237 dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ 238 dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ 239 dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ 240 dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ 241 dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ 242} while (0) 243 244#define init_ipred(sz, opt, t, e) \ 245 dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt 246 247#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext 248#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext 249#define init_dir_tm_ipred(sz, opt) do { \ 250 init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ 251 init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ 252 init_ipred(sz, opt, hd, HOR_DOWN); \ 253 init_ipred(sz, opt, vl, VERT_LEFT); \ 254 init_ipred(sz, opt, hu, HOR_UP); \ 255 init_ipred(sz, opt, tm, TM_VP8); \ 256 init_ipred(sz, opt, vr, VERT_RIGHT); \ 257} while (0) 258#define init_dir_tm_h_ipred(sz, opt) do { \ 259 init_dir_tm_ipred(sz, opt); \ 260 init_ipred(sz, opt, h, HOR); \ 261} while (0) 262#define init_dc_ipred(sz, opt) do { \ 263 init_ipred(sz, opt, dc, DC); \ 264 init_ipred(sz, opt, dc_left, LEFT_DC); \ 265 init_ipred(sz, opt, dc_top, TOP_DC); \ 266} while (0) 267#define init_all_ipred(sz, opt) do { \ 268 init_dc_ipred(sz, opt); \ 269 init_dir_tm_h_ipred(sz, opt); \ 270} while (0) 271 272 if (EXTERNAL_MMX(cpu_flags)) { 273 init_fpel_func(4, 0, 4, put, , mmx); 274 init_fpel_func(3, 0, 8, put, , mmx); 275 if (!bitexact) { 276 dsp->itxfm_add[4 /* lossless */][DCT_DCT] = 277 dsp->itxfm_add[4 /* lossless */][ADST_DCT] = 278 dsp->itxfm_add[4 /* lossless */][DCT_ADST] = 279 dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; 280 } 281 init_ipred(8, mmx, v, VERT); 282 } 283 284 if (EXTERNAL_MMXEXT(cpu_flags)) { 285 dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext; 286 dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; 287 dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; 288 dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; 289 init_subpel2(4, 0, 4, put, 8, mmxext); 290 init_subpel2(4, 1, 4, avg, 8, mmxext); 291 init_fpel_func(4, 1, 4, avg, _8, mmxext); 292 init_fpel_func(3, 1, 8, avg, _8, mmxext); 293 dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; 294 init_dc_ipred(4, mmxext); 295 init_dc_ipred(8, mmxext); 296 init_dir_tm_ipred(4, mmxext); 297 } 298 299 if (EXTERNAL_SSE(cpu_flags)) { 300 init_fpel_func(2, 0, 16, put, , sse); 301 init_fpel_func(1, 0, 32, put, , sse); 302 init_fpel_func(0, 0, 64, put, , sse); 303 init_ipred(16, sse, v, VERT); 304 init_ipred(32, sse, v, VERT); 305 } 306 307 if (EXTERNAL_SSE2(cpu_flags)) { 308 init_subpel3_8to64(0, put, 8, sse2); 309 init_subpel3_8to64(1, avg, 8, sse2); 310 init_fpel_func(2, 1, 16, avg, _8, sse2); 311 init_fpel_func(1, 1, 32, avg, _8, sse2); 312 init_fpel_func(0, 1, 64, avg, _8, sse2); 313 init_lpf(sse2); 314 dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; 315 dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; 316 dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; 317 dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; 318 dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; 319 dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; 320 dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; 321 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; 322 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; 323 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; 324 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; 325 dsp->itxfm_add[TX_32X32][ADST_ADST] = 326 dsp->itxfm_add[TX_32X32][ADST_DCT] = 327 dsp->itxfm_add[TX_32X32][DCT_ADST] = 328 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; 329 init_dc_ipred(16, sse2); 330 init_dc_ipred(32, sse2); 331 init_dir_tm_h_ipred(8, sse2); 332 init_dir_tm_h_ipred(16, sse2); 333 init_dir_tm_h_ipred(32, sse2); 334 init_ipred(4, sse2, h, HOR); 335 } 336 337 if (EXTERNAL_SSSE3(cpu_flags)) { 338 init_subpel3(0, put, 8, ssse3); 339 init_subpel3(1, avg, 8, ssse3); 340 dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; 341 dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; 342 dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; 343 dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; 344 dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; 345 dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; 346 dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; 347 dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; 348 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; 349 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; 350 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; 351 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; 352 dsp->itxfm_add[TX_32X32][ADST_ADST] = 353 dsp->itxfm_add[TX_32X32][ADST_DCT] = 354 dsp->itxfm_add[TX_32X32][DCT_ADST] = 355 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; 356 init_lpf(ssse3); 357 init_all_ipred(4, ssse3); 358 init_all_ipred(8, ssse3); 359 init_all_ipred(16, ssse3); 360 init_all_ipred(32, ssse3); 361 } 362 363 if (EXTERNAL_AVX(cpu_flags)) { 364 dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; 365 dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx; 366 dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx; 367 dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx; 368 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; 369 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; 370 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx; 371 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; 372 dsp->itxfm_add[TX_32X32][ADST_ADST] = 373 dsp->itxfm_add[TX_32X32][ADST_DCT] = 374 dsp->itxfm_add[TX_32X32][DCT_ADST] = 375 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; 376 init_lpf(avx); 377 init_dir_tm_h_ipred(8, avx); 378 init_dir_tm_h_ipred(16, avx); 379 init_dir_tm_h_ipred(32, avx); 380 } 381 if (EXTERNAL_AVX_FAST(cpu_flags)) { 382 init_fpel_func(1, 0, 32, put, , avx); 383 init_fpel_func(0, 0, 64, put, , avx); 384 init_ipred(32, avx, v, VERT); 385 } 386 387 if (EXTERNAL_AVX2_FAST(cpu_flags)) { 388 init_fpel_func(1, 1, 32, avg, _8, avx2); 389 init_fpel_func(0, 1, 64, avg, _8, avx2); 390 if (ARCH_X86_64) { 391#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 392 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; 393 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; 394 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2; 395 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; 396 dsp->itxfm_add[TX_32X32][ADST_ADST] = 397 dsp->itxfm_add[TX_32X32][ADST_DCT] = 398 dsp->itxfm_add[TX_32X32][DCT_ADST] = 399 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; 400 init_subpel3_32_64(0, put, 8, avx2); 401 init_subpel3_32_64(1, avg, 8, avx2); 402#endif 403 } 404 init_dc_ipred(32, avx2); 405 init_ipred(32, avx2, h, HOR); 406 init_ipred(32, avx2, tm, TM_VP8); 407 } 408 409#undef init_fpel 410#undef init_subpel1 411#undef init_subpel2 412#undef init_subpel3 413 414#endif /* HAVE_X86ASM */ 415} 416