1/* 2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt 3 * Copyright (c) 2011 Daniel Kang 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/attributes.h" 23#include "libavutil/cpu.h" 24#include "libavutil/mem_internal.h" 25#include "libavutil/x86/asm.h" 26#include "libavutil/x86/cpu.h" 27#include "libavcodec/h264qpel.h" 28#include "libavcodec/pixels.h" 29#include "fpel.h" 30 31#if HAVE_X86ASM 32void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 33 int dstStride, int src1Stride, int h); 34void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 35 int dstStride, int src1Stride, int h); 36void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 37 int dstStride, int src1Stride, int h); 38void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 39 int dstStride, int src1Stride, int h); 40void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 41 int dstStride, int src1Stride, int h); 42void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, 43 int dstStride, int src1Stride, int h); 44#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext 45#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext 46#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext 47#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext 48#define ff_put_pixels16_mmxext ff_put_pixels16_mmx 49#define ff_put_pixels8_mmxext ff_put_pixels8_mmx 50#define ff_put_pixels4_mmxext ff_put_pixels4_mmx 51 52#define DEF_QPEL(OPNAME)\ 53void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ 54void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ 55void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ 56void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ 57void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ 58void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\ 59void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ 60void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\ 61void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\ 62void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ 63void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\ 64void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ 65void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ 66void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\ 67void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h); 68 69DEF_QPEL(avg) 70DEF_QPEL(put) 71 72#define QPEL_H264(OPNAME, OP, MMX)\ 73static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 74 int w=3;\ 75 src -= 2*srcStride+2;\ 76 while(w--){\ 77 ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\ 78 tmp += 4;\ 79 src += 4;\ 80 }\ 81 tmp -= 3*4;\ 82 ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\ 83}\ 84\ 85static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 86 int w = size>>4;\ 87 do{\ 88 ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\ 89 tmp += 8;\ 90 dst += 8;\ 91 }while(w--);\ 92}\ 93\ 94static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\ 95 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 96 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 97 src += 8*srcStride;\ 98 dst += 8*dstStride;\ 99 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 100 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 101}\ 102\ 103static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\ 104 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 105 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 106 src += 8*dstStride;\ 107 dst += 8*dstStride;\ 108 src2 += 8*src2Stride;\ 109 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 110 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 111}\ 112\ 113static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\ 114{\ 115 ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ 116 ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ 117}\ 118 119 120#if ARCH_X86_64 121#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 122 123void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride); 124void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride); 125 126#else // ARCH_X86_64 127#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 128static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\ 129 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 130 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 131 src += 8*dstStride;\ 132 dst += 8*dstStride;\ 133 src2 += 8*src2Stride;\ 134 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 135 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 136} 137#endif // ARCH_X86_64 138 139#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ 140QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 141static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\ 142 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 143 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 144 src += 8*srcStride;\ 145 dst += 8*dstStride;\ 146 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 147 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 148}\ 149 150#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ 151static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\ 152 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 153}\ 154static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\ 155 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 156 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 157} 158 159static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, 160 const uint8_t *src, 161 int tmpStride, 162 int srcStride, 163 int size) 164{ 165 int w = (size+8)>>3; 166 src -= 2*srcStride+2; 167 while(w--){ 168 ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size); 169 tmp += 8; 170 src += 8; 171 } 172} 173 174#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ 175static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ 176 put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ 177 ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 178}\ 179static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 180 ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ 181}\ 182static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 183 ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ 184}\ 185 186#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext 187#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext 188#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext 189#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext 190 191#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2 192#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2 193#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2 194#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2 195 196#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext 197#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext 198 199#define H264_MC_C_H(OPNAME, SIZE, MMX, ALIGN) \ 200H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ 201H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ 202 203#define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \ 204H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ 205H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ 206H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ 207H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ 208 209static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src, 210 ptrdiff_t stride) 211{ 212 ff_put_pixels16_sse2(dst, src, stride, 16); 213} 214static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src, 215 ptrdiff_t stride) 216{ 217 ff_avg_pixels16_sse2(dst, src, stride, 16); 218} 219#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext 220#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext 221 222#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ 223static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 224{\ 225 ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ 226}\ 227 228#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ 229static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 230{\ 231 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ 232}\ 233\ 234static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 235{\ 236 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ 237}\ 238\ 239static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 240{\ 241 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ 242}\ 243 244#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ 245static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 246{\ 247 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ 248 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 249 ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ 250}\ 251\ 252static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 253{\ 254 ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ 255}\ 256\ 257static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 258{\ 259 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ 260 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 261 ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ 262}\ 263 264#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ 265static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 266{\ 267 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ 268 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 269 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 270}\ 271\ 272static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 273{\ 274 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ 275 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 276 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 277}\ 278\ 279static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 280{\ 281 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ 282 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 283 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 284}\ 285\ 286static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 287{\ 288 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ 289 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 290 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 291}\ 292\ 293static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 294{\ 295 LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\ 296 ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ 297}\ 298\ 299static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 300{\ 301 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 302 uint8_t * const halfHV= temp;\ 303 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 304 av_assert2(((uintptr_t)temp & 7) == 0);\ 305 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 306 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ 307}\ 308\ 309static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 310{\ 311 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 312 uint8_t * const halfHV= temp;\ 313 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 314 av_assert2(((uintptr_t)temp & 7) == 0);\ 315 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 316 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ 317}\ 318\ 319static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 320{\ 321 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 322 uint8_t * const halfHV= temp;\ 323 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 324 av_assert2(((uintptr_t)temp & 7) == 0);\ 325 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 326 ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ 327}\ 328\ 329static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ 330{\ 331 LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 332 uint8_t * const halfHV= temp;\ 333 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 334 av_assert2(((uintptr_t)temp & 7) == 0);\ 335 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 336 ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ 337}\ 338 339#define H264_MC(QPEL, SIZE, MMX, ALIGN)\ 340QPEL(put_, SIZE, MMX, ALIGN) \ 341QPEL(avg_, SIZE, MMX, ALIGN) \ 342 343#define H264_MC_816(QPEL, XMM)\ 344QPEL(put_, 8, XMM, 16)\ 345QPEL(put_, 16,XMM, 16)\ 346QPEL(avg_, 8, XMM, 16)\ 347QPEL(avg_, 16,XMM, 16)\ 348 349QPEL_H264(put_, PUT_OP, mmxext) 350QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext) 351QPEL_H264_V_XMM(put_, PUT_OP, sse2) 352QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2) 353QPEL_H264_HV_XMM(put_, PUT_OP, sse2) 354QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2) 355QPEL_H264_H_XMM(put_, PUT_OP, ssse3) 356QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3) 357QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) 358QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3) 359 360H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8) 361H264_MC(H264_MC_C_H, 8, mmxext, 8) 362H264_MC(H264_MC_C_H, 16, mmxext, 8) 363H264_MC_816(H264_MC_V, sse2) 364H264_MC_816(H264_MC_HV, sse2) 365H264_MC_816(H264_MC_H, ssse3) 366H264_MC_816(H264_MC_HV, ssse3) 367 368 369//10bit 370#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ 371void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \ 372 (uint8_t *dst, const uint8_t *src, ptrdiff_t stride); 373 374#define LUMA_MC_4(DEPTH, TYPE, OPT) \ 375 LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \ 376 LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) 377 378#define LUMA_MC_816(DEPTH, TYPE, OPT) \ 379 LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ 380 LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ 381 LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ 382 LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) 383 384LUMA_MC_4(10, mc00, mmxext) 385LUMA_MC_4(10, mc10, mmxext) 386LUMA_MC_4(10, mc20, mmxext) 387LUMA_MC_4(10, mc30, mmxext) 388LUMA_MC_4(10, mc01, mmxext) 389LUMA_MC_4(10, mc11, mmxext) 390LUMA_MC_4(10, mc21, mmxext) 391LUMA_MC_4(10, mc31, mmxext) 392LUMA_MC_4(10, mc02, mmxext) 393LUMA_MC_4(10, mc12, mmxext) 394LUMA_MC_4(10, mc22, mmxext) 395LUMA_MC_4(10, mc32, mmxext) 396LUMA_MC_4(10, mc03, mmxext) 397LUMA_MC_4(10, mc13, mmxext) 398LUMA_MC_4(10, mc23, mmxext) 399LUMA_MC_4(10, mc33, mmxext) 400 401LUMA_MC_816(10, mc00, sse2) 402LUMA_MC_816(10, mc10, sse2) 403LUMA_MC_816(10, mc10, sse2_cache64) 404LUMA_MC_816(10, mc10, ssse3_cache64) 405LUMA_MC_816(10, mc20, sse2) 406LUMA_MC_816(10, mc20, sse2_cache64) 407LUMA_MC_816(10, mc20, ssse3_cache64) 408LUMA_MC_816(10, mc30, sse2) 409LUMA_MC_816(10, mc30, sse2_cache64) 410LUMA_MC_816(10, mc30, ssse3_cache64) 411LUMA_MC_816(10, mc01, sse2) 412LUMA_MC_816(10, mc11, sse2) 413LUMA_MC_816(10, mc21, sse2) 414LUMA_MC_816(10, mc31, sse2) 415LUMA_MC_816(10, mc02, sse2) 416LUMA_MC_816(10, mc12, sse2) 417LUMA_MC_816(10, mc22, sse2) 418LUMA_MC_816(10, mc32, sse2) 419LUMA_MC_816(10, mc03, sse2) 420LUMA_MC_816(10, mc13, sse2) 421LUMA_MC_816(10, mc23, sse2) 422LUMA_MC_816(10, mc33, sse2) 423 424#endif /* HAVE_X86ASM */ 425 426#define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX) \ 427 do { \ 428 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ 429 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ 430 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ 431 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ 432 } while (0) 433#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ 434 do { \ 435 SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX); \ 436 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ 437 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ 438 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ 439 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \ 440 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ 441 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ 442 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ 443 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ 444 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ 445 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ 446 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ 447 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ 448 } while (0) 449 450#define H264_QPEL_FUNCS(x, y, CPU) \ 451 do { \ 452 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \ 453 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \ 454 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \ 455 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \ 456 } while (0) 457 458#define H264_QPEL_FUNCS_10(x, y, CPU) \ 459 do { \ 460 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ 461 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ 462 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ 463 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ 464 } while (0) 465 466av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) 467{ 468#if HAVE_X86ASM 469 int high_bit_depth = bit_depth > 8; 470 int cpu_flags = av_get_cpu_flags(); 471 472 if (EXTERNAL_MMXEXT(cpu_flags)) { 473 if (!high_bit_depth) { 474 SET_QPEL_FUNCS0123(put_h264_qpel, 0, 16, mmxext, ); 475 SET_QPEL_FUNCS0123(put_h264_qpel, 1, 8, mmxext, ); 476 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, ); 477 SET_QPEL_FUNCS0123(avg_h264_qpel, 0, 16, mmxext, ); 478 SET_QPEL_FUNCS0123(avg_h264_qpel, 1, 8, mmxext, ); 479 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, ); 480 } else if (bit_depth == 10) { 481 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); 482 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); 483 } 484 } 485 486 if (EXTERNAL_SSE2(cpu_flags)) { 487 if (!high_bit_depth) { 488 H264_QPEL_FUNCS(0, 1, sse2); 489 H264_QPEL_FUNCS(0, 2, sse2); 490 H264_QPEL_FUNCS(0, 3, sse2); 491 H264_QPEL_FUNCS(1, 1, sse2); 492 H264_QPEL_FUNCS(1, 2, sse2); 493 H264_QPEL_FUNCS(1, 3, sse2); 494 H264_QPEL_FUNCS(2, 1, sse2); 495 H264_QPEL_FUNCS(2, 2, sse2); 496 H264_QPEL_FUNCS(2, 3, sse2); 497 H264_QPEL_FUNCS(3, 1, sse2); 498 H264_QPEL_FUNCS(3, 2, sse2); 499 H264_QPEL_FUNCS(3, 3, sse2); 500 } 501 502 if (bit_depth == 10) { 503 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); 504 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); 505 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); 506 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); 507 H264_QPEL_FUNCS_10(1, 0, sse2_cache64); 508 H264_QPEL_FUNCS_10(2, 0, sse2_cache64); 509 H264_QPEL_FUNCS_10(3, 0, sse2_cache64); 510 } 511 } 512 513 if (EXTERNAL_SSE2_FAST(cpu_flags)) { 514 if (!high_bit_depth) { 515 H264_QPEL_FUNCS(0, 0, sse2); 516 } 517 } 518 519 if (EXTERNAL_SSSE3(cpu_flags)) { 520 if (!high_bit_depth) { 521 H264_QPEL_FUNCS(1, 0, ssse3); 522 H264_QPEL_FUNCS(1, 1, ssse3); 523 H264_QPEL_FUNCS(1, 2, ssse3); 524 H264_QPEL_FUNCS(1, 3, ssse3); 525 H264_QPEL_FUNCS(2, 0, ssse3); 526 H264_QPEL_FUNCS(2, 1, ssse3); 527 H264_QPEL_FUNCS(2, 2, ssse3); 528 H264_QPEL_FUNCS(2, 3, ssse3); 529 H264_QPEL_FUNCS(3, 0, ssse3); 530 H264_QPEL_FUNCS(3, 1, ssse3); 531 H264_QPEL_FUNCS(3, 2, ssse3); 532 H264_QPEL_FUNCS(3, 3, ssse3); 533 } 534 535 if (bit_depth == 10) { 536 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64); 537 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64); 538 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64); 539 } 540 } 541 542 if (EXTERNAL_AVX(cpu_flags)) { 543 /* AVX implies 64 byte cache lines without the need to avoid unaligned 544 * memory accesses that cross the boundary between two cache lines. 545 * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid 546 * having to treat SSE2 functions with such properties as AVX. */ 547 if (bit_depth == 10) { 548 H264_QPEL_FUNCS_10(1, 0, sse2); 549 H264_QPEL_FUNCS_10(2, 0, sse2); 550 H264_QPEL_FUNCS_10(3, 0, sse2); 551 } 552 } 553#endif 554} 555