1/* 2 * SIMD-optimized forward DCT 3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard. 4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. 6 * 7 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT 8 * 9 * Intel Application Note AP-922 - fast, precise implementation of DCT 10 * http://developer.intel.com/vtune/cbts/appnotes.htm 11 * 12 * Also of inspiration: 13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm 14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html 15 * 16 * This file is part of FFmpeg. 17 * 18 * FFmpeg is free software; you can redistribute it and/or 19 * modify it under the terms of the GNU Lesser General Public 20 * License as published by the Free Software Foundation; either 21 * version 2.1 of the License, or (at your option) any later version. 22 * 23 * FFmpeg is distributed in the hope that it will be useful, 24 * but WITHOUT ANY WARRANTY; without even the implied warranty of 25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 26 * Lesser General Public License for more details. 27 * 28 * You should have received a copy of the GNU Lesser General Public 29 * License along with FFmpeg; if not, write to the Free Software 30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 31 */ 32 33#include "config.h" 34#include "libavutil/attributes.h" 35#include "libavutil/macros.h" 36#include "libavutil/mem_internal.h" 37#include "libavutil/x86/asm.h" 38#include "fdct.h" 39 40#if HAVE_SSE2_INLINE 41 42////////////////////////////////////////////////////////////////////// 43// 44// constants for the forward DCT 45// ----------------------------- 46// 47// Be sure to check that your compiler is aligning all constants to QWORD 48// (8-byte) memory boundaries! Otherwise the unaligned memory access will 49// severely stall MMX execution. 50// 51////////////////////////////////////////////////////////////////////// 52 53#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy 54#define SHIFT_FRW_COL BITS_FRW_ACC 55#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) 56#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) 57//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) 58 59#define X8(x) x,x,x,x,x,x,x,x 60 61//concatenated table, for forward DCT transformation 62DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { 63 X8(13036), // tg * (2<<16) + 0.5 64 X8(27146), // tg * (2<<16) + 0.5 65 X8(-21746) // tg * (2<<16) + 0.5 66}; 67 68DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { 69 X8(23170) //cos * (2<<15) + 0.5 70}; 71 72DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; 73 74static const struct 75{ 76 DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; 77} fdct_r_row_sse2 = 78{{ 79 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW 80}}; 81//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; 82 83static const struct 84{ 85 DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; 86} tab_frw_01234567_sse2 = 87{{ 88//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table 89#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ 90 C4, C4, C5, C7, C2, C6, C3, -C7, \ 91 -C4, C4, C7, C3, C6, -C2, C7, -C5, \ 92 C4, -C4, C5, -C1, C2, -C6, C3, -C1, 93// c1..c7 * cos(pi/4) * 2^15 94#define C1 22725 95#define C2 21407 96#define C3 19266 97#define C4 16384 98#define C5 12873 99#define C6 8867 100#define C7 4520 101TABLE_SSE2 102 103#undef C1 104#undef C2 105#undef C3 106#undef C4 107#undef C5 108#undef C6 109#undef C7 110#define C1 31521 111#define C2 29692 112#define C3 26722 113#define C4 22725 114#define C5 17855 115#define C6 12299 116#define C7 6270 117TABLE_SSE2 118 119#undef C1 120#undef C2 121#undef C3 122#undef C4 123#undef C5 124#undef C6 125#undef C7 126#define C1 29692 127#define C2 27969 128#define C3 25172 129#define C4 21407 130#define C5 16819 131#define C6 11585 132#define C7 5906 133TABLE_SSE2 134 135#undef C1 136#undef C2 137#undef C3 138#undef C4 139#undef C5 140#undef C6 141#undef C7 142#define C1 26722 143#define C2 25172 144#define C3 22654 145#define C4 19266 146#define C5 15137 147#define C6 10426 148#define C7 5315 149TABLE_SSE2 150 151#undef C1 152#undef C2 153#undef C3 154#undef C4 155#undef C5 156#undef C6 157#undef C7 158#define C1 22725 159#define C2 21407 160#define C3 19266 161#define C4 16384 162#define C5 12873 163#define C6 8867 164#define C7 4520 165TABLE_SSE2 166 167#undef C1 168#undef C2 169#undef C3 170#undef C4 171#undef C5 172#undef C6 173#undef C7 174#define C1 26722 175#define C2 25172 176#define C3 22654 177#define C4 19266 178#define C5 15137 179#define C6 10426 180#define C7 5315 181TABLE_SSE2 182 183#undef C1 184#undef C2 185#undef C3 186#undef C4 187#undef C5 188#undef C6 189#undef C7 190#define C1 29692 191#define C2 27969 192#define C3 25172 193#define C4 21407 194#define C5 16819 195#define C6 11585 196#define C7 5906 197TABLE_SSE2 198 199#undef C1 200#undef C2 201#undef C3 202#undef C4 203#undef C5 204#undef C6 205#undef C7 206#define C1 31521 207#define C2 29692 208#define C3 26722 209#define C4 22725 210#define C5 17855 211#define C6 12299 212#define C7 6270 213TABLE_SSE2 214}}; 215 216#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long 217 218#define FDCT_COL(cpu, mm, mov)\ 219static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ 220{\ 221 __asm__ volatile (\ 222 #mov" 16(%0), %%"#mm"0 \n\t" \ 223 #mov" 96(%0), %%"#mm"1 \n\t" \ 224 #mov" %%"#mm"0, %%"#mm"2 \n\t" \ 225 #mov" 32(%0), %%"#mm"3 \n\t" \ 226 "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ 227 #mov" 80(%0), %%"#mm"4 \n\t" \ 228 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ 229 #mov" (%0), %%"#mm"5 \n\t" \ 230 "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ 231 "paddsw 112(%0), %%"#mm"5 \n\t" \ 232 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ 233 #mov" %%"#mm"0, %%"#mm"6 \n\t" \ 234 "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ 235 #mov" 16(%1), %%"#mm"1 \n\t" \ 236 "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ 237 #mov" 48(%0), %%"#mm"7 \n\t" \ 238 "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ 239 "paddsw 64(%0), %%"#mm"7 \n\t" \ 240 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ 241 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ 242 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ 243 #mov" %%"#mm"5, %%"#mm"4 \n\t" \ 244 "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ 245 "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ 246 "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ 247 "por (%2), %%"#mm"1 \n\t" \ 248 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ 249 "pmulhw 16(%1), %%"#mm"5 \n\t" \ 250 #mov" %%"#mm"4, %%"#mm"7 \n\t" \ 251 "psubsw 80(%0), %%"#mm"3 \n\t" \ 252 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ 253 #mov" %%"#mm"1, 32(%3) \n\t" \ 254 "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ 255 #mov" 48(%0), %%"#mm"1 \n\t" \ 256 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ 257 "psubsw 64(%0), %%"#mm"1 \n\t" \ 258 #mov" %%"#mm"2, %%"#mm"6 \n\t" \ 259 #mov" %%"#mm"4, 64(%3) \n\t" \ 260 "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ 261 "pmulhw (%4), %%"#mm"2 \n\t" \ 262 "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ 263 "pmulhw (%4), %%"#mm"6 \n\t" \ 264 "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ 265 "por (%2), %%"#mm"5 \n\t" \ 266 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ 267 "por (%2), %%"#mm"2 \n\t" \ 268 #mov" %%"#mm"1, %%"#mm"4 \n\t" \ 269 #mov" (%0), %%"#mm"3 \n\t" \ 270 "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ 271 "psubsw 112(%0), %%"#mm"3 \n\t" \ 272 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ 273 #mov" (%1), %%"#mm"0 \n\t" \ 274 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ 275 #mov" 32(%1), %%"#mm"6 \n\t" \ 276 "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ 277 #mov" %%"#mm"7, (%3) \n\t" \ 278 "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ 279 #mov" %%"#mm"5, 96(%3) \n\t" \ 280 #mov" %%"#mm"3, %%"#mm"7 \n\t" \ 281 #mov" 32(%1), %%"#mm"5 \n\t" \ 282 "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ 283 "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ 284 "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ 285 "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ 286 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ 287 "pmulhw (%1), %%"#mm"3 \n\t" \ 288 "por (%2), %%"#mm"0 \n\t" \ 289 "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ 290 "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ 291 #mov" %%"#mm"0, 16(%3) \n\t" \ 292 "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ 293 #mov" %%"#mm"7, 48(%3) \n\t" \ 294 "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ 295 #mov" %%"#mm"5, 80(%3) \n\t" \ 296 #mov" %%"#mm"3, 112(%3) \n\t" \ 297 : \ 298 : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ 299 "r" (out + offset), "r" (ocos_4_16)); \ 300} 301 302FDCT_COL(sse2, xmm, movdqa) 303 304static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) 305{ 306 __asm__ volatile( 307#define FDCT_ROW_SSE2_H1(i,t) \ 308 "movq " #i "(%0), %%xmm2 \n\t" \ 309 "movq " #i "+8(%0), %%xmm0 \n\t" \ 310 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ 311 "movdqa " #t "+48(%1), %%xmm7 \n\t" \ 312 "movdqa " #t "(%1), %%xmm4 \n\t" \ 313 "movdqa " #t "+16(%1), %%xmm5 \n\t" 314 315#define FDCT_ROW_SSE2_H2(i,t) \ 316 "movq " #i "(%0), %%xmm2 \n\t" \ 317 "movq " #i "+8(%0), %%xmm0 \n\t" \ 318 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ 319 "movdqa " #t "+48(%1), %%xmm7 \n\t" 320 321#define FDCT_ROW_SSE2(i) \ 322 "movq %%xmm2, %%xmm1 \n\t" \ 323 "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ 324 "paddsw %%xmm0, %%xmm1 \n\t" \ 325 "psubsw %%xmm0, %%xmm2 \n\t" \ 326 "punpckldq %%xmm2, %%xmm1 \n\t" \ 327 "pshufd $78, %%xmm1, %%xmm2 \n\t" \ 328 "pmaddwd %%xmm2, %%xmm3 \n\t" \ 329 "pmaddwd %%xmm1, %%xmm7 \n\t" \ 330 "pmaddwd %%xmm5, %%xmm2 \n\t" \ 331 "pmaddwd %%xmm4, %%xmm1 \n\t" \ 332 "paddd %%xmm7, %%xmm3 \n\t" \ 333 "paddd %%xmm2, %%xmm1 \n\t" \ 334 "paddd %%xmm6, %%xmm3 \n\t" \ 335 "paddd %%xmm6, %%xmm1 \n\t" \ 336 "psrad %3, %%xmm3 \n\t" \ 337 "psrad %3, %%xmm1 \n\t" \ 338 "packssdw %%xmm3, %%xmm1 \n\t" \ 339 "movdqa %%xmm1, " #i "(%4) \n\t" 340 341 "movdqa (%2), %%xmm6 \n\t" 342 FDCT_ROW_SSE2_H1(0,0) 343 FDCT_ROW_SSE2(0) 344 FDCT_ROW_SSE2_H2(64,0) 345 FDCT_ROW_SSE2(64) 346 347 FDCT_ROW_SSE2_H1(16,64) 348 FDCT_ROW_SSE2(16) 349 FDCT_ROW_SSE2_H2(112,64) 350 FDCT_ROW_SSE2(112) 351 352 FDCT_ROW_SSE2_H1(32,128) 353 FDCT_ROW_SSE2(32) 354 FDCT_ROW_SSE2_H2(96,128) 355 FDCT_ROW_SSE2(96) 356 357 FDCT_ROW_SSE2_H1(48,192) 358 FDCT_ROW_SSE2(48) 359 FDCT_ROW_SSE2_H2(80,192) 360 FDCT_ROW_SSE2(80) 361 : 362 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), 363 "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) 364 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", 365 "%xmm4", "%xmm5", "%xmm6", "%xmm7") 366 ); 367} 368 369void ff_fdct_sse2(int16_t *block) 370{ 371 DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; 372 int16_t * const block1= (int16_t*)align_tmp; 373 374 fdct_col_sse2(block, block1, 0); 375 fdct_row_sse2(block1, block); 376} 377 378#endif /* HAVE_SSE2_INLINE */ 379