1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * RV40 decoder motion compensation functions x86-optimised 3cabdff1aSopenharmony_ci * Copyright (c) 2008 Konstantin Shishkov 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci/** 23cabdff1aSopenharmony_ci * @file 24cabdff1aSopenharmony_ci * RV40 decoder motion compensation functions x86-optimised 25cabdff1aSopenharmony_ci * 2,0 and 0,2 have h264 equivalents. 26cabdff1aSopenharmony_ci * 3,3 is bugged in the rv40 format and maps to _xy2 version 27cabdff1aSopenharmony_ci */ 28cabdff1aSopenharmony_ci 29cabdff1aSopenharmony_ci#include "libavcodec/rv34dsp.h" 30cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 31cabdff1aSopenharmony_ci#include "libavutil/mem_internal.h" 32cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h" 33cabdff1aSopenharmony_ci#include "hpeldsp.h" 34cabdff1aSopenharmony_ci 35cabdff1aSopenharmony_ci#define DEFINE_FN(op, size, insn) \ 36cabdff1aSopenharmony_cistatic void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \ 37cabdff1aSopenharmony_ci ptrdiff_t stride) \ 38cabdff1aSopenharmony_ci{ \ 39cabdff1aSopenharmony_ci ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \ 40cabdff1aSopenharmony_ci} 41cabdff1aSopenharmony_ci 42cabdff1aSopenharmony_ci#if HAVE_X86ASM 43cabdff1aSopenharmony_civoid ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, 44cabdff1aSopenharmony_ci ptrdiff_t stride, int h, int x, int y); 45cabdff1aSopenharmony_civoid ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src, 46cabdff1aSopenharmony_ci ptrdiff_t stride, int h, int x, int y); 47cabdff1aSopenharmony_ci 48cabdff1aSopenharmony_civoid ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, 49cabdff1aSopenharmony_ci ptrdiff_t stride, int h, int x, int y); 50cabdff1aSopenharmony_civoid ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src, 51cabdff1aSopenharmony_ci ptrdiff_t stride, int h, int x, int y); 52cabdff1aSopenharmony_ci 53cabdff1aSopenharmony_ci#define DECLARE_WEIGHT(opt) \ 54cabdff1aSopenharmony_civoid ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ 55cabdff1aSopenharmony_ci int w1, int w2, ptrdiff_t stride); \ 56cabdff1aSopenharmony_civoid ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ 57cabdff1aSopenharmony_ci int w1, int w2, ptrdiff_t stride); \ 58cabdff1aSopenharmony_civoid ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ 59cabdff1aSopenharmony_ci int w1, int w2, ptrdiff_t stride); \ 60cabdff1aSopenharmony_civoid ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ 61cabdff1aSopenharmony_ci int w1, int w2, ptrdiff_t stride); 62cabdff1aSopenharmony_ciDECLARE_WEIGHT(sse2) 63cabdff1aSopenharmony_ciDECLARE_WEIGHT(ssse3) 64cabdff1aSopenharmony_ci 65cabdff1aSopenharmony_ci/** @{ */ 66cabdff1aSopenharmony_ci/** 67cabdff1aSopenharmony_ci * Define one qpel function. 68cabdff1aSopenharmony_ci * LOOPSIZE must be already set to the number of pixels processed per 69cabdff1aSopenharmony_ci * iteration in the inner loop of the called functions. 70cabdff1aSopenharmony_ci * COFF(x) must be already defined so as to provide the offset into any 71cabdff1aSopenharmony_ci * array of coeffs used by the called function for the qpel position x. 72cabdff1aSopenharmony_ci */ 73cabdff1aSopenharmony_ci#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \ 74cabdff1aSopenharmony_cistatic void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \ 75cabdff1aSopenharmony_ci const uint8_t *src, \ 76cabdff1aSopenharmony_ci ptrdiff_t stride) \ 77cabdff1aSopenharmony_ci{ \ 78cabdff1aSopenharmony_ci int i; \ 79cabdff1aSopenharmony_ci if (PH && PV) { \ 80cabdff1aSopenharmony_ci LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]); \ 81cabdff1aSopenharmony_ci uint8_t *tmpptr = tmp + SIZE * 2; \ 82cabdff1aSopenharmony_ci src -= stride * 2; \ 83cabdff1aSopenharmony_ci \ 84cabdff1aSopenharmony_ci for (i = 0; i < SIZE; i += LOOPSIZE) \ 85cabdff1aSopenharmony_ci ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \ 86cabdff1aSopenharmony_ci SIZE + 5, HCOFF(PH)); \ 87cabdff1aSopenharmony_ci for (i = 0; i < SIZE; i += LOOPSIZE) \ 88cabdff1aSopenharmony_ci ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \ 89cabdff1aSopenharmony_ci SIZE, SIZE, VCOFF(PV)); \ 90cabdff1aSopenharmony_ci } else if (PV) { \ 91cabdff1aSopenharmony_ci for (i = 0; i < SIZE; i += LOOPSIZE) \ 92cabdff1aSopenharmony_ci ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \ 93cabdff1aSopenharmony_ci stride, SIZE, VCOFF(PV)); \ 94cabdff1aSopenharmony_ci } else { \ 95cabdff1aSopenharmony_ci for (i = 0; i < SIZE; i += LOOPSIZE) \ 96cabdff1aSopenharmony_ci ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \ 97cabdff1aSopenharmony_ci stride, SIZE, HCOFF(PH)); \ 98cabdff1aSopenharmony_ci } \ 99cabdff1aSopenharmony_ci} 100cabdff1aSopenharmony_ci 101cabdff1aSopenharmony_ci/** Declare functions for sizes 8 and 16 and given operations 102cabdff1aSopenharmony_ci * and qpel position. */ 103cabdff1aSopenharmony_ci#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \ 104cabdff1aSopenharmony_ci QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \ 105cabdff1aSopenharmony_ci QPEL_FUNC_DECL(OP, 16, PH, PV, OPT) 106cabdff1aSopenharmony_ci 107cabdff1aSopenharmony_ci/** Declare all functions for all sizes and qpel positions */ 108cabdff1aSopenharmony_ci#define QPEL_MC_DECL(OP, OPT) \ 109cabdff1aSopenharmony_civoid ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ 110cabdff1aSopenharmony_ci const uint8_t *src, \ 111cabdff1aSopenharmony_ci ptrdiff_t srcStride, \ 112cabdff1aSopenharmony_ci int len, int m); \ 113cabdff1aSopenharmony_civoid ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ 114cabdff1aSopenharmony_ci const uint8_t *src, \ 115cabdff1aSopenharmony_ci ptrdiff_t srcStride, \ 116cabdff1aSopenharmony_ci int len, int m); \ 117cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 0, 1, OPT) \ 118cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 0, 3, OPT) \ 119cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 1, 0, OPT) \ 120cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 1, 1, OPT) \ 121cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 1, 2, OPT) \ 122cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 1, 3, OPT) \ 123cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 2, 1, OPT) \ 124cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 2, 2, OPT) \ 125cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 2, 3, OPT) \ 126cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 3, 0, OPT) \ 127cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 3, 1, OPT) \ 128cabdff1aSopenharmony_ciQPEL_FUNCS_DECL(OP, 3, 2, OPT) 129cabdff1aSopenharmony_ci/** @} */ 130cabdff1aSopenharmony_ci 131cabdff1aSopenharmony_ci#define LOOPSIZE 8 132cabdff1aSopenharmony_ci#define HCOFF(x) (32 * ((x) - 1)) 133cabdff1aSopenharmony_ci#define VCOFF(x) (32 * ((x) - 1)) 134cabdff1aSopenharmony_ciQPEL_MC_DECL(put_, _ssse3) 135cabdff1aSopenharmony_ciQPEL_MC_DECL(avg_, _ssse3) 136cabdff1aSopenharmony_ci 137cabdff1aSopenharmony_ci#undef LOOPSIZE 138cabdff1aSopenharmony_ci#undef HCOFF 139cabdff1aSopenharmony_ci#undef VCOFF 140cabdff1aSopenharmony_ci#define LOOPSIZE 8 141cabdff1aSopenharmony_ci#define HCOFF(x) (64 * ((x) - 1)) 142cabdff1aSopenharmony_ci#define VCOFF(x) (64 * ((x) - 1)) 143cabdff1aSopenharmony_ciQPEL_MC_DECL(put_, _sse2) 144cabdff1aSopenharmony_ciQPEL_MC_DECL(avg_, _sse2) 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_ci/** @{ */ 147cabdff1aSopenharmony_ci/** Set one function */ 148cabdff1aSopenharmony_ci#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \ 149cabdff1aSopenharmony_ci c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT; 150cabdff1aSopenharmony_ci 151cabdff1aSopenharmony_ci/** Set functions put and avg for sizes 8 and 16 and a given qpel position */ 152cabdff1aSopenharmony_ci#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \ 153cabdff1aSopenharmony_ci QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \ 154cabdff1aSopenharmony_ci QPEL_FUNC_SET(OP, 16, PH, PV, OPT) 155cabdff1aSopenharmony_ci 156cabdff1aSopenharmony_ci/** Set all functions for all sizes and qpel positions */ 157cabdff1aSopenharmony_ci#define QPEL_MC_SET(OP, OPT) \ 158cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 0, 1, OPT) \ 159cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 0, 3, OPT) \ 160cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 1, 0, OPT) \ 161cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 1, 1, OPT) \ 162cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 1, 2, OPT) \ 163cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 1, 3, OPT) \ 164cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 2, 1, OPT) \ 165cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 2, 2, OPT) \ 166cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 2, 3, OPT) \ 167cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 3, 0, OPT) \ 168cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 3, 1, OPT) \ 169cabdff1aSopenharmony_ciQPEL_FUNCS_SET (OP, 3, 2, OPT) 170cabdff1aSopenharmony_ci/** @} */ 171cabdff1aSopenharmony_ci 172cabdff1aSopenharmony_ciDEFINE_FN(put, 8, ssse3) 173cabdff1aSopenharmony_ci 174cabdff1aSopenharmony_ciDEFINE_FN(put, 16, sse2) 175cabdff1aSopenharmony_ciDEFINE_FN(put, 16, ssse3) 176cabdff1aSopenharmony_ci 177cabdff1aSopenharmony_ciDEFINE_FN(avg, 8, mmxext) 178cabdff1aSopenharmony_ciDEFINE_FN(avg, 8, ssse3) 179cabdff1aSopenharmony_ci 180cabdff1aSopenharmony_ciDEFINE_FN(avg, 16, sse2) 181cabdff1aSopenharmony_ciDEFINE_FN(avg, 16, ssse3) 182cabdff1aSopenharmony_ci#endif /* HAVE_X86ASM */ 183cabdff1aSopenharmony_ci 184cabdff1aSopenharmony_ci#if HAVE_MMX_INLINE 185cabdff1aSopenharmony_ciDEFINE_FN(put, 8, mmx) 186cabdff1aSopenharmony_ci#endif 187cabdff1aSopenharmony_ci 188cabdff1aSopenharmony_ciav_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) 189cabdff1aSopenharmony_ci{ 190cabdff1aSopenharmony_ci av_unused int cpu_flags = av_get_cpu_flags(); 191cabdff1aSopenharmony_ci 192cabdff1aSopenharmony_ci#if HAVE_MMX_INLINE 193cabdff1aSopenharmony_ci if (INLINE_MMX(cpu_flags)) { 194cabdff1aSopenharmony_ci c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx; 195cabdff1aSopenharmony_ci } 196cabdff1aSopenharmony_ci#endif /* HAVE_MMX_INLINE */ 197cabdff1aSopenharmony_ci 198cabdff1aSopenharmony_ci#if HAVE_X86ASM 199cabdff1aSopenharmony_ci if (EXTERNAL_MMX(cpu_flags)) { 200cabdff1aSopenharmony_ci c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; 201cabdff1aSopenharmony_ci c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; 202cabdff1aSopenharmony_ci } 203cabdff1aSopenharmony_ci if (EXTERNAL_MMXEXT(cpu_flags)) { 204cabdff1aSopenharmony_ci c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext; 205cabdff1aSopenharmony_ci c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext; 206cabdff1aSopenharmony_ci c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext; 207cabdff1aSopenharmony_ci } 208cabdff1aSopenharmony_ci if (EXTERNAL_SSE2(cpu_flags)) { 209cabdff1aSopenharmony_ci c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2; 210cabdff1aSopenharmony_ci c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2; 211cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; 212cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; 213cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; 214cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; 215cabdff1aSopenharmony_ci QPEL_MC_SET(put_, _sse2) 216cabdff1aSopenharmony_ci QPEL_MC_SET(avg_, _sse2) 217cabdff1aSopenharmony_ci } 218cabdff1aSopenharmony_ci if (EXTERNAL_SSSE3(cpu_flags)) { 219cabdff1aSopenharmony_ci c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3; 220cabdff1aSopenharmony_ci c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3; 221cabdff1aSopenharmony_ci c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3; 222cabdff1aSopenharmony_ci c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3; 223cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; 224cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; 225cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; 226cabdff1aSopenharmony_ci c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; 227cabdff1aSopenharmony_ci QPEL_MC_SET(put_, _ssse3) 228cabdff1aSopenharmony_ci QPEL_MC_SET(avg_, _ssse3) 229cabdff1aSopenharmony_ci } 230cabdff1aSopenharmony_ci#endif /* HAVE_X86ASM */ 231cabdff1aSopenharmony_ci} 232