/*
 * RV40 decoder motion compensation functions x86-optimised
 * Copyright (c) 2008 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * RV40 decoder motion compensation functions x86-optimised
 * 2,0 and 0,2 have h264 equivalents.
 * 3,3 is bugged in the rv40 format and maps to the _xy2 version.
 */

#include "libavcodec/rv34dsp.h"
#include "libavutil/attributes.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"

#define DEFINE_FN(op, size, insn)                                             \
static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
                                               ptrdiff_t stride)              \
{                                                                             \
    ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size);              \
}

#if HAVE_X86ASM
void ff_put_rv40_chroma_mc8_mmx   (uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y);

void ff_put_rv40_chroma_mc4_mmx   (uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y);

#define DECLARE_WEIGHT(opt)                                                   \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                      int w1, int w2, ptrdiff_t stride);      \
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                      int w1, int w2, ptrdiff_t stride);      \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                        int w1, int w2, ptrdiff_t stride);    \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                        int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)

/** @{ */
/**
 * Define one qpel function.
 * LOOPSIZE must be already set to the number of pixels processed per
 * iteration in the inner loop of the called functions.
 * COFF(x) must be already defined so as to provide the offset into any
 * array of coeffs used by the called function for the qpel position x.
 */
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT)                           \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
                                                         const uint8_t *src, \
                                                         ptrdiff_t stride)   \
{                                                                       \
    int i;                                                              \
    if (PH && PV) {                                                     \
        LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]);           \
        uint8_t *tmpptr = tmp + SIZE * 2;                               \
        src -= stride * 2;                                              \
                                                                        \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride,    \
                                     SIZE + 5, HCOFF(PH));              \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i,   \
                                         SIZE, SIZE, VCOFF(PV));        \
    } else if (PV) {                                                    \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, VCOFF(PV));     \
    } else {                                                            \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, HCOFF(PH));     \
    }                                                                   \
}
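
/*
 * For illustration, QPEL_FUNC_DECL(put_, 8, 1, 2, _ssse3) expands to
 * roughly the following (the unused else branches are dead code the
 * compiler drops, since PH and PV are literal constants here):
 *
 *     static void put_rv40_qpel8_mc12_ssse3(uint8_t *dst, const uint8_t *src,
 *                                           ptrdiff_t stride)
 *     {
 *         int i;
 *         // both PH and PV nonzero: filter horizontally into an aligned
 *         // scratch buffer of SIZE + 5 rows (the extra rows feed the
 *         // vertical filter taps), then filter that buffer vertically
 *         LOCAL_ALIGNED(16, uint8_t, tmp, [8 * (8 + 5)]);
 *         uint8_t *tmpptr = tmp + 8 * 2;
 *         src -= stride * 2;
 *         for (i = 0; i < 8; i += LOOPSIZE)
 *             ff_put_rv40_qpel_h_ssse3(tmp + i, 8, src + i, stride,
 *                                      8 + 5, HCOFF(1));
 *         for (i = 0; i < 8; i += LOOPSIZE)
 *             ff_put_rv40_qpel_v_ssse3(dst + i, stride, tmpptr + i,
 *                                      8, 8, VCOFF(2));
 *     }
 *
 * With LOOPSIZE set to 8 for SSSE3, each loop runs only once for the
 * 8-pixel-wide case.
 */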

/** Declare functions for sizes 8 and 16 and given operations
 *  and qpel position. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT)  \
    QPEL_FUNC_DECL(OP,  8, PH, PV, OPT)   \
    QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)

/** Declare all functions for all sizes and qpel positions */
#define QPEL_MC_DECL(OP, OPT)                                           \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride,    \
                                  const uint8_t *src,                   \
                                  ptrdiff_t srcStride,                  \
                                  int len, int m);                      \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride,    \
                                  const uint8_t *src,                   \
                                  ptrdiff_t srcStride,                  \
                                  int len, int m);                      \
QPEL_FUNCS_DECL(OP, 0, 1, OPT)                                          \
QPEL_FUNCS_DECL(OP, 0, 3, OPT)                                          \
QPEL_FUNCS_DECL(OP, 1, 0, OPT)                                          \
QPEL_FUNCS_DECL(OP, 1, 1, OPT)                                          \
QPEL_FUNCS_DECL(OP, 1, 2, OPT)                                          \
QPEL_FUNCS_DECL(OP, 1, 3, OPT)                                          \
QPEL_FUNCS_DECL(OP, 2, 1, OPT)                                          \
QPEL_FUNCS_DECL(OP, 2, 2, OPT)                                          \
QPEL_FUNCS_DECL(OP, 2, 3, OPT)                                          \
QPEL_FUNCS_DECL(OP, 3, 0, OPT)                                          \
QPEL_FUNCS_DECL(OP, 3, 1, OPT)                                          \
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */

#define LOOPSIZE  8
#define HCOFF(x)  (32 * ((x) - 1))
#define VCOFF(x)  (32 * ((x) - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)

#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE  8
#define HCOFF(x)  (64 * ((x) - 1))
#define VCOFF(x)  (64 * ((x) - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)

/** @{ */
/** Set one function */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
    c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT;

/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
    QPEL_FUNC_SET(OP,  8, PH, PV, OPT)  \
    QPEL_FUNC_SET(OP, 16, PH, PV, OPT)

/** Set all functions for all sizes and qpel positions */
#define QPEL_MC_SET(OP, OPT)   \
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */

DEFINE_FN(put, 8, ssse3)

DEFINE_FN(put, 16, sse2)
DEFINE_FN(put, 16, ssse3)

DEFINE_FN(avg, 8, mmxext)
DEFINE_FN(avg, 8, ssse3)

DEFINE_FN(avg, 16, sse2)
DEFINE_FN(avg, 16, ssse3)
#endif /* HAVE_X86ASM */
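
/*
 * Layout of the tables filled below: OP pixels_tab[2 - SIZE / 8] places the
 * 16x16 functions in row 0 and the 8x8 ones in row 1, and the subpel
 * position is packed as 4 * PV + PH, so the bugged (3,3) position lands at
 * index 15; that slot is filled with the DEFINE_FN hpel _xy2 wrappers in
 * ff_rv40dsp_init_x86(). Later cpuflag branches overwrite entries installed
 * by earlier ones (e.g. SSSE3 over SSE2) when the CPU supports both.
 */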

#if HAVE_MMX_INLINE
DEFINE_FN(put, 8, mmx)
#endif

av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
    av_unused int cpu_flags = av_get_cpu_flags();

#if HAVE_MMX_INLINE
    if (INLINE_MMX(cpu_flags)) {
        c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
    }
#endif /* HAVE_MMX_INLINE */

#if HAVE_X86ASM
    if (EXTERNAL_MMX(cpu_flags)) {
        c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
        c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->avg_pixels_tab[1][15]    = avg_rv40_qpel8_mc33_mmxext;
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2;
        c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2;
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
        QPEL_MC_SET(put_, _sse2)
        QPEL_MC_SET(avg_, _sse2)
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3;
        c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3;
        c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3;
        c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3;
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
        QPEL_MC_SET(put_, _ssse3)
        QPEL_MC_SET(avg_, _ssse3)
    }
#endif /* HAVE_X86ASM */
}