1/*
2 * Simple IDCT
3 *
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/**
24 * @file
25 * simpleidct in C.
26 */
27
28/* Based upon some commented-out C code from mpeg2dec (idct_mmx.c
29 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>). */
30
31#include "simple_idct.h"
32
33#include "bit_depth_template.c"
34
/*
 * Fixed-point parameters for the IDCT, selected by BIT_DEPTH.
 *
 * Any earlier definitions are removed first so this section can be evaluated
 * again with different settings (presumably the template is included once
 * per bit depth -- confirm against the including file).
 *
 *   W1..W7     integer cosine weights used by the row and column passes
 *   ROW_SHIFT  right shift applied to row-pass results (plus extra_shift)
 *   COL_SHIFT  right shift applied to column-pass results
 *   DC_SHIFT   scaling used by the DC-only row shortcut; may be negative
 *   MUL / MAC  multiply and multiply-accumulate primitives
 */
#undef W1
#undef W2
#undef W3
#undef W4
#undef W5
#undef W6
#undef W7
#undef ROW_SHIFT
#undef COL_SHIFT
#undef DC_SHIFT
#undef MUL
#undef MAC

#if BIT_DEPTH == 8

/* NOTE(review): for i = 4 the formula quoted on each line evaluates to 16384,
 * yet W4 is 16383. This looks intentional -- confirm before changing it. */
#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

#define ROW_SHIFT 11
#define COL_SHIFT 20
#define DC_SHIFT 3

/* 8-bit: products go through MUL16/MAC16, which are defined outside this file. */
#define MUL(a, b)    MUL16(a, b)
#define MAC(a, b, c) MAC16(a, b, c)

#elif BIT_DEPTH == 10 || BIT_DEPTH == 12

# if BIT_DEPTH == 10
#define W1 22725 // 90901
#define W2 21407 //  85627
#define W3 19265 //  77062
#define W4 16384 //  65535
#define W5 12873 //  51491
#define W6  8867 //  35468
#define W7  4520 //  18081

/* For 10-bit the shifts depend on whether EXTRA_SHIFT is defined and, if it
 * is not, on the width of the input coefficients (IN_IDCT_DEPTH). */
#   ifdef EXTRA_SHIFT
#define ROW_SHIFT 13
#define COL_SHIFT 18
#define DC_SHIFT  1
#   elif IN_IDCT_DEPTH == 32
#define ROW_SHIFT 13
#define COL_SHIFT 21
#define DC_SHIFT  2
#   else
#define ROW_SHIFT 12
#define COL_SHIFT 19
#define DC_SHIFT  2
#   endif

# else
/* BIT_DEPTH == 12 */
#define W1 45451
#define W2 42813
#define W3 38531
#define W4 32767
#define W5 25746
#define W6 17734
#define W7 9041

/* DC_SHIFT is negative here, so the DC-only row shortcut takes its rounding
 * right-shift branch whenever extra_shift >= 0. */
#define ROW_SHIFT 16
#define COL_SHIFT 17
#define DC_SHIFT -1
# endif

/* 10- and 12-bit: the product is formed in SUINT (defined outside this file);
 * MUL narrows it to int, MAC accumulates it into its first argument. */
#define MUL(a, b)    ((int)((SUINT)(a) * (b)))
#define MAC(a, b, c) ((a) += (SUINT)(b) * (c))

#else

#error "Unsupported bitdepth"

#endif
112
113#ifdef EXTRA_SHIFT
114static inline void FUNC(idctRowCondDC_extrashift)(int16_t *row, int extra_shift)
115#else
116static inline void FUNC6(idctRowCondDC)(idctin *row, int extra_shift)
117#endif
118{
119    SUINT a0, a1, a2, a3, b0, b1, b2, b3;
120
121// TODO: Add DC-only support for int32_t input
122#if IN_IDCT_DEPTH == 16
123#if HAVE_FAST_64BIT
124#define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN)
125    if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) {
126        uint64_t temp;
127        if (DC_SHIFT - extra_shift >= 0) {
128            temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
129        } else {
130            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
131        }
132        temp += temp * (1 << 16);
133        temp += temp * ((uint64_t) 1 << 32);
134        AV_WN64A(row, temp);
135        AV_WN64A(row + 4, temp);
136        return;
137    }
138#else
139    if (!(AV_RN32A(row+2) |
140          AV_RN32A(row+4) |
141          AV_RN32A(row+6) |
142          row[1])) {
143        uint32_t temp;
144        if (DC_SHIFT - extra_shift >= 0) {
145            temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
146        } else {
147            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
148        }
149        temp += temp * (1 << 16);
150        AV_WN32A(row, temp);
151        AV_WN32A(row+2, temp);
152        AV_WN32A(row+4, temp);
153        AV_WN32A(row+6, temp);
154        return;
155    }
156#endif
157#endif
158
159    a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1));
160    a1 = a0;
161    a2 = a0;
162    a3 = a0;
163
164    a0 += (SUINT)W2 * row[2];
165    a1 += (SUINT)W6 * row[2];
166    a2 -= (SUINT)W6 * row[2];
167    a3 -= (SUINT)W2 * row[2];
168
169    b0 = MUL(W1, row[1]);
170    MAC(b0, W3, row[3]);
171    b1 = MUL(W3, row[1]);
172    MAC(b1, -W7, row[3]);
173    b2 = MUL(W5, row[1]);
174    MAC(b2, -W1, row[3]);
175    b3 = MUL(W7, row[1]);
176    MAC(b3, -W5, row[3]);
177
178#if IN_IDCT_DEPTH == 32
179    if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) {
180#else
181    if (AV_RN64A(row + 4)) {
182#endif
183        a0 += (SUINT)  W4*row[4] + (SUINT)W6*row[6];
184        a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6];
185        a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6];
186        a3 += (SUINT)  W4*row[4] - (SUINT)W6*row[6];
187
188        MAC(b0,  W5, row[5]);
189        MAC(b0,  W7, row[7]);
190
191        MAC(b1, -W1, row[5]);
192        MAC(b1, -W5, row[7]);
193
194        MAC(b2,  W7, row[5]);
195        MAC(b2,  W3, row[7]);
196
197        MAC(b3,  W3, row[5]);
198        MAC(b3, -W1, row[7]);
199    }
200
201    row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift);
202    row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift);
203    row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift);
204    row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift);
205    row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift);
206    row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift);
207    row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift);
208    row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift);
209}
210
/*
 * Shared arithmetic for the column pass of the 8-point IDCT.
 *
 * The expanding function must provide:
 *   - col: pointer to the first element of a column; elements are read at
 *          col[8*0] .. col[8*7], i.e. with a stride of 8;
 *   - a0..a3, b0..b3: accumulators, all of which this macro assigns.
 *
 * On exit a0..a3 hold the sums built from the even-indexed inputs
 * (0, 2, 4, 6) and b0..b3 those built from the odd-indexed inputs
 * (1, 3, 5, 7). The functions in this file then combine them as aN + bN and
 * aN - bN and shift right by COL_SHIFT.
 *
 * The rounding term is folded into the DC input before the W4 multiply.
 * (1 << (COL_SHIFT - 1)) / W4 is an integer division, so the bias that ends
 * up in a0..a3 is W4 times that quotient rather than exactly
 * 1 << (COL_SHIFT - 1).
 *
 * Inputs 4..7 are each tested against zero so that their multiplies are
 * skipped when the coefficient is absent.
 */
#define IDCT_COLS do {                                  \
        a0 = (SUINT)W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); \
        a1 = a0;                                        \
        a2 = a0;                                        \
        a3 = a0;                                        \
                                                        \
        a0 += (SUINT) W2*col[8*2];                             \
        a1 += (SUINT) W6*col[8*2];                             \
        a2 += (SUINT)-W6*col[8*2];                             \
        a3 += (SUINT)-W2*col[8*2];                             \
                                                        \
        b0 = MUL(W1, col[8*1]);                         \
        b1 = MUL(W3, col[8*1]);                         \
        b2 = MUL(W5, col[8*1]);                         \
        b3 = MUL(W7, col[8*1]);                         \
                                                        \
        MAC(b0,  W3, col[8*3]);                         \
        MAC(b1, -W7, col[8*3]);                         \
        MAC(b2, -W1, col[8*3]);                         \
        MAC(b3, -W5, col[8*3]);                         \
                                                        \
        if (col[8*4]) {                                 \
            a0 += (SUINT) W4*col[8*4];                         \
            a1 += (SUINT)-W4*col[8*4];                         \
            a2 += (SUINT)-W4*col[8*4];                         \
            a3 += (SUINT) W4*col[8*4];                         \
        }                                               \
                                                        \
        if (col[8*5]) {                                 \
            MAC(b0,  W5, col[8*5]);                     \
            MAC(b1, -W1, col[8*5]);                     \
            MAC(b2,  W7, col[8*5]);                     \
            MAC(b3,  W3, col[8*5]);                     \
        }                                               \
                                                        \
        if (col[8*6]) {                                 \
            a0 += (SUINT) W6*col[8*6];                         \
            a1 += (SUINT)-W2*col[8*6];                         \
            a2 += (SUINT) W2*col[8*6];                         \
            a3 += (SUINT)-W6*col[8*6];                         \
        }                                               \
                                                        \
        if (col[8*7]) {                                 \
            MAC(b0,  W7, col[8*7]);                     \
            MAC(b1, -W5, col[8*7]);                     \
            MAC(b2,  W3, col[8*7]);                     \
            MAC(b3, -W1, col[8*7]);                     \
        }                                               \
    } while (0)
260
261#ifdef EXTRA_SHIFT
262static inline void FUNC(idctSparseCol_extrashift)(int16_t *col)
263#else
264static inline void FUNC6(idctSparseColPut)(pixel *dest, ptrdiff_t line_size,
265                                          idctin *col)
266{
267    SUINT a0, a1, a2, a3, b0, b1, b2, b3;
268
269    IDCT_COLS;
270
271    dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT);
272    dest += line_size;
273    dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT);
274    dest += line_size;
275    dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT);
276    dest += line_size;
277    dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT);
278    dest += line_size;
279    dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT);
280    dest += line_size;
281    dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT);
282    dest += line_size;
283    dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT);
284    dest += line_size;
285    dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT);
286}
287
288static inline void FUNC6(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size,
289                                          idctin *col)
290{
291    unsigned a0, a1, a2, a3, b0, b1, b2, b3;
292
293    IDCT_COLS;
294
295    dest[0] = av_clip_pixel(dest[0] + ((int)(a0 + b0) >> COL_SHIFT));
296    dest += line_size;
297    dest[0] = av_clip_pixel(dest[0] + ((int)(a1 + b1) >> COL_SHIFT));
298    dest += line_size;
299    dest[0] = av_clip_pixel(dest[0] + ((int)(a2 + b2) >> COL_SHIFT));
300    dest += line_size;
301    dest[0] = av_clip_pixel(dest[0] + ((int)(a3 + b3) >> COL_SHIFT));
302    dest += line_size;
303    dest[0] = av_clip_pixel(dest[0] + ((int)(a3 - b3) >> COL_SHIFT));
304    dest += line_size;
305    dest[0] = av_clip_pixel(dest[0] + ((int)(a2 - b2) >> COL_SHIFT));
306    dest += line_size;
307    dest[0] = av_clip_pixel(dest[0] + ((int)(a1 - b1) >> COL_SHIFT));
308    dest += line_size;
309    dest[0] = av_clip_pixel(dest[0] + ((int)(a0 - b0) >> COL_SHIFT));
310}
311
312static inline void FUNC6(idctSparseCol)(idctin *col)
313#endif
314{
315    unsigned a0, a1, a2, a3, b0, b1, b2, b3;
316
317    IDCT_COLS;
318
319    col[0 ] = ((int)(a0 + b0) >> COL_SHIFT);
320    col[8 ] = ((int)(a1 + b1) >> COL_SHIFT);
321    col[16] = ((int)(a2 + b2) >> COL_SHIFT);
322    col[24] = ((int)(a3 + b3) >> COL_SHIFT);
323    col[32] = ((int)(a3 - b3) >> COL_SHIFT);
324    col[40] = ((int)(a2 - b2) >> COL_SHIFT);
325    col[48] = ((int)(a1 - b1) >> COL_SHIFT);
326    col[56] = ((int)(a0 - b0) >> COL_SHIFT);
327}
328
329#ifndef EXTRA_SHIFT
330void FUNC6(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block_)
331{
332    idctin *block = (idctin *)block_;
333    pixel *dest = (pixel *)dest_;
334    int i;
335
336    line_size /= sizeof(pixel);
337
338    for (i = 0; i < 8; i++)
339        FUNC6(idctRowCondDC)(block + i*8, 0);
340
341    for (i = 0; i < 8; i++)
342        FUNC6(idctSparseColPut)(dest + i, line_size, block + i);
343}
344
345#if IN_IDCT_DEPTH == 16
346void FUNC6(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
347{
348    pixel *dest = (pixel *)dest_;
349    int i;
350
351    line_size /= sizeof(pixel);
352
353    for (i = 0; i < 8; i++)
354        FUNC6(idctRowCondDC)(block + i*8, 0);
355
356    for (i = 0; i < 8; i++)
357        FUNC6(idctSparseColAdd)(dest + i, line_size, block + i);
358}
359
360void FUNC6(ff_simple_idct)(int16_t *block)
361{
362    int i;
363
364    for (i = 0; i < 8; i++)
365        FUNC6(idctRowCondDC)(block + i*8, 0);
366
367    for (i = 0; i < 8; i++)
368        FUNC6(idctSparseCol)(block + i);
369}
370#endif
371#endif
372