/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

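/* Prototypes for the SIMD routines implemented in external assembly;
 * the inline-asm helpers and ff_me_cmp_init_x86() below wire them into
 * MECmpContext. */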
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);

#define hadamard_func(cpu)                                                    \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
                                  uint8_t *src2, ptrdiff_t stride, int h);    \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_X86ASM
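/* Noise-preserving SSE: the plain SSE score plus a penalty proportional to
 * the difference in high-frequency "noise" between the two blocks, weighted
 * by nsse_weight (8 when no context is available). */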
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_X86ASM */

#if HAVE_INLINE_ASM

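/* Per-lane 16-bit rounding biases; sad8_4_mmx() below adds round_tab[2]
 * before shifting right by 2 to get a rounded 4-pixel average. */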
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

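/* Sum of absolute differences between blk2 and the half-pel (x+y)
 * interpolation of blk1, 8 pixels wide and h rows high; expects %mm7 == 0
 * and accumulates 16-bit partial sums in %mm6. */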
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq  (%1, %%"FF_REG_a"), %%mm0\n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"FF_REG_a"), %%mm2\n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"FF_REG_a"           \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}

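/* Reduce the four 16-bit partial sums accumulated in %mm6 to one scalar. */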
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

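/* Generate the 8x8 and 16xh half-pel (x+y) SAD entry points on top of
 * sad8_4_*() and sum_*(): clear the zero and accumulator registers, run
 * the kernel, then collapse the accumulator. */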
#define PIX_SADXY(suf)                                                  \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, ptrdiff_t stride, int h)    \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SADXY(mmx)

#endif /* HAVE_INLINE_ASM */

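/* Hook the SIMD implementations into MECmpContext according to the
 * detected CPU flags and the codec's bitexact requirements. */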
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_X86ASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
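        /* The SSE2/SSSE3 hadamard8_diff versions need an aligned stack;
         * keep the MMXEXT version only when that guarantee is missing. */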
#if !HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
#endif

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0]        = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4]       = ff_vsad_intra16_sse2;
            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0]       = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}