/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"

/* External (standalone assembler) implementations. */
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);

#if HAVE_INLINE_ASM

/* Horizontal add of the two 32-bit halves of MMX register a, using t as
 * scratch; used by mpegvideoenc_qns_template.c (redefined below for the
 * SSSE3 instantiation). */
#define PHADDD(a, t) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddd " #t ", " #a " \n\t"

/*
 * pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
 * pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
 * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
 */

/* MMX instantiation: plain pmulhw truncates, so the rounding constant o is
 * added by hand and the extra bit is shifted back out; SET_RND/SCALE_OFFSET
 * below compensate inside the template (see mpegvideoenc_qns_template.c). */
#define PMULHRW(x, y, s, o) \
    "pmulhw " #s ", " #x " \n\t" \
    "pmulhw " #s ", " #y " \n\t" \
    "paddw " #o ", " #x " \n\t" \
    "paddw " #o ", " #y " \n\t" \
    "psraw $1, " #x " \n\t" \
    "psraw $1, " #y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* 3DNow! instantiation: pmulhrw rounds in hardware, so no manual rounding
 * or post-shift is needed (SET_RND expands to nothing, SCALE_OFFSET 0). */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
    "pmulhrw " #s ", " #x " \n\t" \
    "pmulhrw " #s ", " #y " \n\t"

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
/* SSSE3 instantiation: pmulhrsw keeps one extra fraction bit (result taken
 * from bits [15:30]), hence SCALE_OFFSET -1 to compensate in the template. */
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1

#define PHADDD(a, t) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    /* faster than phaddd on core2 */ \
    "paddd " #t ", " #a " \n\t"

#define PMULHRW(x, y, s, o) \
    "pmulhrsw " #s ", " #x " \n\t" \
    "pmulhrsw " #s ", " #y " \n\t"

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

/* Draw the edges of width 'w' of an image of size width x height by
 * replicating the border pixels into the surrounding margin.
 * This MMX version handles w == 4, w == 8 and w == 16 (note: the original
 * comment claimed only 8/16, but the else-branch asserts w == 4).
 *
 * buf:    top-left pixel of the image (8-bit samples; the init code only
 *         installs this for bits_per_raw_sample <= 8)
 * wrap:   line stride in bytes
 * w:      horizontal edge width (left and right margins, always drawn)
 * h:      vertical edge height (top/bottom margins, drawn per 'sides')
 * sides:  bitmask of EDGE_TOP / EDGE_BOTTOM
 *
 * NOTE(review): no emms at the end — presumably the caller handles MMX
 * state cleanup; confirm against the calling convention. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        /* Per row: splat the first byte into the 8 bytes left of the row,
         * and the last byte into the 8 bytes right of it. */
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    } else if (w == 16) {
        /* Same as the w == 8 case but the splatted qword is stored twice
         * on each side (16-byte margins). */
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        /* 4-byte margins: dword stores instead of qword stores. */
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "movd %%mm0, -4(%0) \n\t"
            "movd -4(%0, %2), %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        /* Copy the first image row (including its side margins: the loop
         * runs from buf - w to buf + width + w) into 4 rows above the
         * image per iteration; %1 holds the constant offset from the
         * moving pointer %0 back to that source row. */
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg) buf - (x86_reg) ptr - w),
                  "r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3),
                  "r" (ptr + width + 2 * w));
        }
    }

    if (sides & EDGE_BOTTOM) {
        /* Mirror image of the EDGE_TOP loop: replicate the last image row
         * downwards, 4 rows per iteration. */
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg) last_line - (x86_reg) ptr - w),
                  "r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3),
                  "r" (ptr + width + 2 * w));
        }
    }
}

#endif /* HAVE_INLINE_ASM */
213 214av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, 215 AVCodecContext *avctx) 216{ 217 int cpu_flags = av_get_cpu_flags(); 218 219 if (EXTERNAL_SSE2(cpu_flags)) { 220 c->pix_sum = ff_pix_sum16_sse2; 221 c->pix_norm1 = ff_pix_norm1_sse2; 222 } 223 224 if (EXTERNAL_XOP(cpu_flags)) { 225 c->pix_sum = ff_pix_sum16_xop; 226 } 227 228#if HAVE_INLINE_ASM 229 230 if (INLINE_MMX(cpu_flags)) { 231 if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { 232 c->try_8x8basis = try_8x8basis_mmx; 233 } 234 c->add_8x8basis = add_8x8basis_mmx; 235 236 if (avctx->bits_per_raw_sample <= 8) { 237 c->draw_edges = draw_edges_mmx; 238 } 239 } 240 241 if (INLINE_AMD3DNOW(cpu_flags)) { 242 if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { 243 c->try_8x8basis = try_8x8basis_3dnow; 244 } 245 c->add_8x8basis = add_8x8basis_3dnow; 246 } 247 248#if HAVE_SSSE3_INLINE 249 if (INLINE_SSSE3(cpu_flags)) { 250 if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { 251 c->try_8x8basis = try_8x8basis_ssse3; 252 } 253 c->add_8x8basis = add_8x8basis_ssse3; 254 } 255#endif /* HAVE_SSSE3_INLINE */ 256 257#endif /* HAVE_INLINE_ASM */ 258} 259