/*
 * SIMD-optimized forward DCT
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
 *
 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 *
 *  Intel Application Note AP-922 - fast, precise implementation of DCT
 *        http://developer.intel.com/vtune/cbts/appnotes.htm
 *
 * Also of inspiration:
 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/macros.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "fdct.h"

#if HAVE_SSE2_INLINE

//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to 16-byte
// memory boundaries (DECLARE_ALIGNED(16, ...) below)!  The movdqa loads in
// the SSE2 code require aligned operands.
//
//////////////////////////////////////////////////////////////////////

#define BITS_FRW_ACC   3 // 2 or 3 for accuracy
#define SHIFT_FRW_COL  BITS_FRW_ACC
#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))
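
/*
 * Worked example (editor's note): with BITS_FRW_ACC == 3 these evaluate to
 * SHIFT_FRW_COL == 3, SHIFT_FRW_ROW == 17 and RND_FRW_ROW == 1 << 16.
 * The row pass below rounds each 32-bit accumulator to nearest before
 * narrowing: it adds RND_FRW_ROW (paddd), shifts right arithmetically by
 * SHIFT_FRW_ROW (psrad), then saturates to int16 (packssdw).  A scalar model
 * of that rounding step, for illustration only:
 */
#if 0 /* editor's sketch, not compiled */
static int16_t round_row_acc(int32_t acc)
{
    acc = (acc + RND_FRW_ROW) >> SHIFT_FRW_ROW;
    /* packssdw additionally saturates to [-32768, 32767] */
    if (acc >  32767) acc =  32767;
    if (acc < -32768) acc = -32768;
    return (int16_t)acc;
}
#endif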

#define X8(x) x,x,x,x,x,x,x,x

// concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
    X8(13036),  // tg * (2<<16) + 0.5
    X8(27146),  // tg * (2<<16) + 0.5
    X8(-21746)  // tg * (2<<16) + 0.5
};

DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
    X8(23170)   //cos * (2<<15) + 0.5
};

DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
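
/*
 * Editor's note: the fixed-point values above appear to correspond to
 *   13036  ~  tan(1*pi/16) * 2^16
 *   27146  ~  tan(2*pi/16) * 2^16
 *  -21746  ~ (tan(3*pi/16) - 1) * 2^16
 *   23170  ~  cos(pi/4) * 2^15
 * A hypothetical generator for cross-checking (needs <math.h>/<stdio.h>):
 */
#if 0 /* illustrative only, not compiled */
#include <math.h>
#include <stdio.h>
static void print_fdct_fixed_point_constants(void)
{
    printf("%ld\n", lrint(tan(1 * M_PI / 16) * 65536));        /*  13036 */
    printf("%ld\n", lrint(tan(2 * M_PI / 16) * 65536));        /*  27146 */
    printf("%ld\n", lrint((tan(3 * M_PI / 16) - 1) * 65536));  /* -21746 */
    printf("%ld\n", lrint(cos(M_PI / 4) * 32768));             /*  23170 */
}
#endif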

static const struct
{
    DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
    RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};

static const struct
{
    DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = {  // forward_dct coeff table
#define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};
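
/*
 * Editor's note: the table above is eight TABLE_SSE2 expansions of 32
 * coefficients each (8 * 32 = 256 int16, i.e. 64 bytes per expansion).
 * fdct_row_sse2() below selects one 64-byte block per pair of input rows via
 * the second argument of FDCT_ROW_SSE2_H1/H2 (byte offsets 0, 64, 128, 192),
 * so rows (0,4), (1,7), (2,6) and (3,5) share a block of row-scaled cosine
 * coefficients; the last four expansions appear to mirror the first four.
 */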

#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long

#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
    __asm__ volatile (\
        #mov"      16(%0),  %%"#mm"0 \n\t" \
        #mov"      96(%0),  %%"#mm"1 \n\t" \
        #mov"    %%"#mm"0,  %%"#mm"2 \n\t" \
        #mov"      32(%0),  %%"#mm"3 \n\t" \
        "paddsw  %%"#mm"1,  %%"#mm"0 \n\t" \
        #mov"      80(%0),  %%"#mm"4 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
        #mov"        (%0),  %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"3,  %%"#mm"4 \n\t" \
        "paddsw   112(%0),  %%"#mm"5 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
        #mov"    %%"#mm"0,  %%"#mm"6 \n\t" \
        "psubsw  %%"#mm"1,  %%"#mm"2 \n\t" \
        #mov"      16(%1),  %%"#mm"1 \n\t" \
        "psubsw  %%"#mm"4,  %%"#mm"0 \n\t" \
        #mov"      48(%0),  %%"#mm"7 \n\t" \
        "pmulhw  %%"#mm"0,  %%"#mm"1 \n\t" \
        "paddsw    64(%0),  %%"#mm"7 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
        #mov"    %%"#mm"5,  %%"#mm"4 \n\t" \
        "psubsw  %%"#mm"7,  %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"5,  %%"#mm"1 \n\t" \
        "paddsw  %%"#mm"7,  %%"#mm"4 \n\t" \
        "por         (%2),  %%"#mm"1 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
        "pmulhw    16(%1),  %%"#mm"5 \n\t" \
        #mov"    %%"#mm"4,  %%"#mm"7 \n\t" \
        "psubsw    80(%0),  %%"#mm"3 \n\t" \
        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
        #mov"    %%"#mm"1,    32(%3) \n\t" \
        "paddsw  %%"#mm"6,  %%"#mm"7 \n\t" \
        #mov"      48(%0),  %%"#mm"1 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
        "psubsw    64(%0),  %%"#mm"1 \n\t" \
        #mov"    %%"#mm"2,  %%"#mm"6 \n\t" \
        #mov"    %%"#mm"4,    64(%3) \n\t" \
        "paddsw  %%"#mm"3,  %%"#mm"2 \n\t" \
        "pmulhw      (%4),  %%"#mm"2 \n\t" \
        "psubsw  %%"#mm"3,  %%"#mm"6 \n\t" \
        "pmulhw      (%4),  %%"#mm"6 \n\t" \
        "psubsw  %%"#mm"0,  %%"#mm"5 \n\t" \
        "por         (%2),  %%"#mm"5 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
        "por         (%2),  %%"#mm"2 \n\t" \
        #mov"    %%"#mm"1,  %%"#mm"4 \n\t" \
        #mov"        (%0),  %%"#mm"3 \n\t" \
        "paddsw  %%"#mm"6,  %%"#mm"1 \n\t" \
        "psubsw   112(%0),  %%"#mm"3 \n\t" \
        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
        #mov"        (%1),  %%"#mm"0 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
        #mov"      32(%1),  %%"#mm"6 \n\t" \
        "pmulhw  %%"#mm"1,  %%"#mm"0 \n\t" \
        #mov"    %%"#mm"7,      (%3) \n\t" \
        "pmulhw  %%"#mm"4,  %%"#mm"6 \n\t" \
        #mov"    %%"#mm"5,    96(%3) \n\t" \
        #mov"    %%"#mm"3,  %%"#mm"7 \n\t" \
        #mov"      32(%1),  %%"#mm"5 \n\t" \
        "psubsw  %%"#mm"2,  %%"#mm"7 \n\t" \
        "paddsw  %%"#mm"2,  %%"#mm"3 \n\t" \
        "pmulhw  %%"#mm"7,  %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"3,  %%"#mm"0 \n\t" \
        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
        "pmulhw      (%1),  %%"#mm"3 \n\t" \
        "por         (%2),  %%"#mm"0 \n\t" \
        "paddsw  %%"#mm"7,  %%"#mm"5 \n\t" \
        "psubsw  %%"#mm"6,  %%"#mm"7 \n\t" \
        #mov"    %%"#mm"0,    16(%3) \n\t" \
        "paddsw  %%"#mm"4,  %%"#mm"5 \n\t" \
        #mov"    %%"#mm"7,    48(%3) \n\t" \
        "psubsw  %%"#mm"1,  %%"#mm"3 \n\t" \
        #mov"    %%"#mm"5,    80(%3) \n\t" \
        #mov"    %%"#mm"3,   112(%3) \n\t" \
        : \
        : "r" (in  + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
          "r" (out + offset), "r" (ocos_4_16)); \
}

FDCT_COL(sse2, xmm, movdqa)
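
/*
 * Editor's orientation sketch (not the algorithm used above): the column pass
 * computes an 8-point DCT-II over each of the 8 columns in 16-bit fixed point
 * using the AP-922 butterfly network and the tangent/cosine constants above.
 * Up to scaling and rounding, the per-column result corresponds to the plain
 * O(N^2) reference below ("ref_fdct_col" is a hypothetical name, float math,
 * needs <math.h>).
 */
#if 0 /* illustrative only, not compiled */
#include <math.h>
/* DCT-II of column c of an 8x8 block stored row-major (stride 8 int16). */
static void ref_fdct_col(const int16_t *in, double *out, int c)
{
    for (int k = 0; k < 8; k++) {
        double acc = 0.0;
        for (int n = 0; n < 8; n++)
            acc += in[n * 8 + c] * cos((2 * n + 1) * k * M_PI / 16.0);
        out[k * 8 + c] = acc;  /* unnormalized; the fixed-point code folds the
                                  per-row scale factors into the row table */
    }
}
#endif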

static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
    __asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t)                    \
        "movq      " #i "(%0), %%xmm2      \n\t" \
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
        "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
        "movdqa    " #t "(%1), %%xmm4      \n\t" \
        "movdqa    " #t "+16(%1), %%xmm5   \n\t"

#define FDCT_ROW_SSE2_H2(i,t)                    \
        "movq      " #i "(%0), %%xmm2      \n\t" \
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
        "movdqa    " #t "+48(%1), %%xmm7   \n\t"

#define FDCT_ROW_SSE2(i)                      \
        "movq      %%xmm2, %%xmm1       \n\t" \
        "pshuflw   $27, %%xmm0, %%xmm0  \n\t" \
        "paddsw    %%xmm0, %%xmm1       \n\t" \
        "psubsw    %%xmm0, %%xmm2       \n\t" \
        "punpckldq %%xmm2, %%xmm1       \n\t" \
        "pshufd    $78, %%xmm1, %%xmm2  \n\t" \
        "pmaddwd   %%xmm2, %%xmm3       \n\t" \
        "pmaddwd   %%xmm1, %%xmm7       \n\t" \
        "pmaddwd   %%xmm5, %%xmm2       \n\t" \
        "pmaddwd   %%xmm4, %%xmm1       \n\t" \
        "paddd     %%xmm7, %%xmm3       \n\t" \
        "paddd     %%xmm2, %%xmm1       \n\t" \
        "paddd     %%xmm6, %%xmm3       \n\t" \
        "paddd     %%xmm6, %%xmm1       \n\t" \
        "psrad     %3, %%xmm3           \n\t" \
        "psrad     %3, %%xmm1           \n\t" \
        "packssdw  %%xmm3, %%xmm1       \n\t" \
        "movdqa    %%xmm1, " #i "(%4)   \n\t"

        "movdqa    (%2), %%xmm6         \n\t"
        FDCT_ROW_SSE2_H1(0,0)
        FDCT_ROW_SSE2(0)
        FDCT_ROW_SSE2_H2(64,0)
        FDCT_ROW_SSE2(64)

        FDCT_ROW_SSE2_H1(16,64)
        FDCT_ROW_SSE2(16)
        FDCT_ROW_SSE2_H2(112,64)
        FDCT_ROW_SSE2(112)

        FDCT_ROW_SSE2_H1(32,128)
        FDCT_ROW_SSE2(32)
        FDCT_ROW_SSE2_H2(96,128)
        FDCT_ROW_SSE2(96)

        FDCT_ROW_SSE2_H1(48,192)
        FDCT_ROW_SSE2(48)
        FDCT_ROW_SSE2_H2(80,192)
        FDCT_ROW_SSE2(80)
        :
        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
          "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    );
}
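
/*
 * Editor's note: each FDCT_ROW_SSE2 block above forms the symmetric sums and
 * differences of one input row (paddsw/psubsw after the pshuflw reversal) and
 * then issues four pmaddwd instructions against the row's coefficient block.
 * pmaddwd multiplies eight signed 16-bit pairs and sums adjacent products
 * into four 32-bit lanes; a scalar model of one such lane:
 */
#if 0 /* illustrative only, not compiled */
static int32_t pmaddwd_lane(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
    /* two 16x16->32 products, summed without intermediate overflow */
    return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}
/* The two pmaddwd results per output half are then combined with paddd,
 * rounded with RND_FRW_ROW, shifted by SHIFT_FRW_ROW (psrad) and narrowed
 * with packssdw -- see the round_row_acc() sketch earlier in this file. */
#endif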

void ff_fdct_sse2(int16_t *block)
{
    DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
    int16_t * const block1 = (int16_t *)align_tmp;

    fdct_col_sse2(block, block1, 0);
    fdct_row_sse2(block1, block);
}

#endif /* HAVE_SSE2_INLINE */
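
/*
 * Editor's usage sketch: ff_fdct_sse2() transforms an 8x8 block of int16
 * samples in place ("example_use" below is hypothetical, not FFmpeg API).
 */
#if 0 /* illustrative only, not compiled; requires HAVE_SSE2_INLINE */
#include "libavutil/mem_internal.h"
static void example_use(void)
{
    /* 8x8 block of spatial samples, row-major; movdqa in the column pass
     * requires 16-byte alignment, hence DECLARE_ALIGNED. */
    DECLARE_ALIGNED(16, int16_t, block)[64] = { 0 };
    block[0] = 255;

    /* in place: block now holds the 8x8 forward-DCT coefficients */
    ff_fdct_sse2(block);
}
#endif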