/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"

/***********************************/
/* IDCT */

/*
 * Prototype generators for the x86 assembly IDCT routines.  Each macro
 * token-pastes a function name of the form
 *     ff_h264_idct<NUM>_add<REP>_<DEPTH>_<OPT>
 * where NUM selects the transform variant (empty = 4x4, "8" = 8x8,
 * "_dc"/"8_dc" = DC-only shortcuts), DEPTH is the bit depth (8 or 10)
 * and OPT the instruction-set suffix (mmx/mmxext/sse2/avx/...).
 * The implementations live in the hand-written .asm files; only the
 * prototypes are declared here.
 */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                                                       int16_t *block, \
                                                       int stride);

IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)

/* Variants that add REP blocks in one call, driven by a table of
 * destination offsets and the non-zero-coefficient map (nnzc). */
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, const int *block_offset, \
     int16_t *block, int stride, const uint8_t nnzc[5 * 8]);

IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)

/* Same as IDCT_ADD_REP_FUNC, but takes an array of destination plane
 * pointers (uint8_t **dst) — used for the chroma add8 paths. */
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t **dst, const int *block_offset, \
     int16_t *block, int stride, const uint8_t nnzc[15 * 8]);

IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)

IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)

IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)

void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);

/***********************************/
/* deblocking */

void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
                                         int8_t ref[2][40],
                                         int16_t mv[2][40][2],
                                         int bidir, int edges, int step,
                                         int mask_mv0, int mask_mv1, int field);

/*
 * Prototype generators for the deblocking loop filters:
 *     ff_deblock_<DIR>_<TYPE>_<DEPTH>_<OPT>
 * DIR is h (horizontal edge) or v (vertical edge); TYPE selects
 * luma/chroma, 4:2:2 chroma, and the *_intra variants.  LF_FUNC
 * declares the inter filters (which take a tc0 clipping table),
 * LF_IFUNC the intra filters (no tc0).
 */
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                               ptrdiff_t stride, \
                                                               int alpha, \
                                                               int beta, \
                                                               int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                               ptrdiff_t stride, \
                                                               int alpha, \
                                                               int beta);

/* Declares the full sse2 + avx filter set for one bit depth.
 * NOTE(review): the `type` parameter is unused by the expansion —
 * it only documents the intended pixel type. */
#define LF_FUNCS(type, depth) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
LF_IFUNC(v, luma_intra, depth, sse2) \
LF_FUNC(h, chroma, depth, sse2) \
LF_IFUNC(h, chroma_intra, depth, sse2) \
LF_FUNC(h, chroma422, depth, sse2) \
LF_IFUNC(h, chroma422_intra, depth, sse2) \
LF_FUNC(v, chroma, depth, sse2) \
LF_IFUNC(v, chroma_intra, depth, sse2) \
LF_FUNC(h, luma, depth, avx) \
LF_IFUNC(h, luma_intra, depth, avx) \
LF_FUNC(v, luma, depth, avx) \
LF_IFUNC(v, luma_intra, depth, avx) \
LF_FUNC(h, chroma, depth, avx) \
LF_IFUNC(h, chroma_intra, depth, avx) \
LF_FUNC(h, chroma422, depth, avx) \
LF_IFUNC(h, chroma422_intra, depth, avx) \
LF_FUNC(v, chroma, depth, avx) \
LF_IFUNC(v, chroma_intra, depth, avx)

LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)

LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)

/* Extra 10-bit mmxext luma filters, used on 32-bit builds without an
 * aligned stack (see the init function below). */
LF_FUNC(v, luma, 10, mmxext)
LF_FUNC(h, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
LF_IFUNC(h, luma_intra, 10, mmxext)

/***********************************/
/* weighted prediction */

/* 8-bit weighted / bidirectionally-weighted prediction prototypes:
 *     ff_h264_(bi)weight_<W>_<OPT>
 * W is the block width in pixels (4/8/16). */
#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride, \
                                      int height, int log2_denom, \
                                      int weight, int offset);

#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
                                        ptrdiff_t stride, int height, \
                                        int log2_denom, int weightd, \
                                        int weights, int offset);

#define H264_BIWEIGHT_MMX(W) \
    H264_WEIGHT(W, mmxext) \
    H264_BIWEIGHT(W, mmxext)

#define H264_BIWEIGHT_SSE(W) \
    H264_WEIGHT(W, sse2) \
    H264_BIWEIGHT(W, sse2) \
    H264_BIWEIGHT(W, ssse3)

H264_BIWEIGHT_SSE(16)
H264_BIWEIGHT_SSE(8)
H264_BIWEIGHT_MMX(4)

/* 10-bit variants carry the bit depth in the symbol name:
 *     ff_h264_(bi)weight_<W>_<DEPTH>_<OPT> */
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int height, \
                                                    int log2_denom, \
                                                    int weight, \
                                                    int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                                                      uint8_t *src, \
                                                      ptrdiff_t stride, \
                                                      int height, \
                                                      int log2_denom, \
                                                      int weightd, \
                                                      int weights, \
                                                      int offset);

#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
    H264_WEIGHT_10(W, DEPTH, sse2) \
    H264_WEIGHT_10(W, DEPTH, sse4) \
    H264_BIWEIGHT_10(W, DEPTH, sse2) \
    H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
187H264_BIWEIGHT_10_SSE(8, 10) 188H264_BIWEIGHT_10_SSE(4, 10) 189 190av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, 191 const int chroma_format_idc) 192{ 193#if HAVE_X86ASM 194 int cpu_flags = av_get_cpu_flags(); 195 196 if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1) 197 c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext; 198 199 if (bit_depth == 8) { 200 if (EXTERNAL_MMX(cpu_flags)) { 201 if (chroma_format_idc <= 1) { 202 } else { 203 c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx; 204 } 205 } 206 if (EXTERNAL_MMXEXT(cpu_flags)) { 207 c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext; 208 209 c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext; 210 211 c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext; 212 } 213 if (EXTERNAL_SSE2(cpu_flags)) { 214 c->h264_idct8_add = ff_h264_idct8_add_8_sse2; 215 216 c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; 217 c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; 218 if (chroma_format_idc <= 1) 219 c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; 220 c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; 221 c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2; 222 223 c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2; 224 c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2; 225 226 c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; 227 c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; 228 229 c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; 230 c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; 231 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; 232 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; 233 234#if ARCH_X86_64 235 c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2; 236#endif 237 238 c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_sse2; 239 c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2; 240 if (chroma_format_idc <= 1) { 241 
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_sse2; 242 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2; 243 } else { 244 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2; 245 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2; 246 } 247 248 c->h264_idct_add = ff_h264_idct_add_8_sse2; 249 c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2; 250 } 251 if (EXTERNAL_SSSE3(cpu_flags)) { 252 c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; 253 c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; 254 } 255 if (EXTERNAL_AVX(cpu_flags)) { 256 c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; 257 c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; 258 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; 259 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; 260#if ARCH_X86_64 261 c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx; 262#endif 263 264 c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx; 265 c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx; 266 if (chroma_format_idc <= 1) { 267 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx; 268 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx; 269 } else { 270 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx; 271 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; 272 } 273 274 c->h264_idct_add = ff_h264_idct_add_8_avx; 275 c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; 276 } 277 } else if (bit_depth == 10) { 278 if (EXTERNAL_MMXEXT(cpu_flags)) { 279#if ARCH_X86_32 && !HAVE_ALIGNED_STACK 280 c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext; 281 c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext; 282 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; 283 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; 284#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */ 285 
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext; 286 } 287 if (EXTERNAL_SSE2(cpu_flags)) { 288 c->h264_idct_add = ff_h264_idct_add_10_sse2; 289 c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; 290 291 c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; 292 if (chroma_format_idc <= 1) { 293 c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; 294 } else { 295 c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2; 296 } 297 c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2; 298#if HAVE_ALIGNED_STACK 299 c->h264_idct8_add = ff_h264_idct8_add_10_sse2; 300 c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; 301#endif /* HAVE_ALIGNED_STACK */ 302 303 c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; 304 c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; 305 c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; 306 307 c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; 308 c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; 309 c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; 310 311 c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2; 312 c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2; 313 if (chroma_format_idc <= 1) { 314 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2; 315 } else { 316 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2; 317 } 318#if HAVE_ALIGNED_STACK 319 c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; 320 c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; 321 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; 322 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; 323#endif /* HAVE_ALIGNED_STACK */ 324 } 325 if (EXTERNAL_SSE4(cpu_flags)) { 326 c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; 327 c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; 328 c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; 329 330 c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; 331 
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; 332 c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; 333 } 334 if (EXTERNAL_AVX(cpu_flags)) { 335 c->h264_idct_dc_add = 336 c->h264_idct_add = ff_h264_idct_add_10_avx; 337 c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; 338 339 c->h264_idct_add16 = ff_h264_idct_add16_10_avx; 340 if (chroma_format_idc <= 1) { 341 c->h264_idct_add8 = ff_h264_idct_add8_10_avx; 342 } else { 343 c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx; 344 } 345 c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx; 346#if HAVE_ALIGNED_STACK 347 c->h264_idct8_add = ff_h264_idct8_add_10_avx; 348 c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; 349#endif /* HAVE_ALIGNED_STACK */ 350 351 c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx; 352 c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx; 353 if (chroma_format_idc <= 1) { 354 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx; 355 } else { 356 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx; 357 } 358#if HAVE_ALIGNED_STACK 359 c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; 360 c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; 361 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; 362 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; 363#endif /* HAVE_ALIGNED_STACK */ 364 } 365 } 366#endif 367} 368