1 /*
2 * Copyright (C) 2004 The FFmpeg project
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 /**
22 * @file
23 * Standard C DSP-oriented functions cribbed from the original VP3
24 * source code.
25 */
26
27 #include <string.h>
28
29 #include "config.h"
30 #include "libavutil/attributes.h"
31 #include "libavutil/common.h"
32 #include "libavutil/internal.h"
33 #include "libavutil/intreadwrite.h"
34 #include "libavutil/avassert.h"
35
36 #include "rnd_avg.h"
37 #include "vp3dsp.h"
38
/*
 * Fixed-point constants used by the inverse DCT.
 *
 * xCnSm is 65536 * cos(n * pi / 16) (equivalently 65536 * sin(m * pi / 16)),
 * rounded to the nearest integer. IdctAdjustBeforeShift is the rounding
 * term used by the DC-only paths, which add it shifted left by 16 before
 * shifting the product right by 20.
 */
enum {
    IdctAdjustBeforeShift = 8,
    xC1S7 = 64277,
    xC2S6 = 60547,
    xC3S5 = 54491,
    xC4S4 = 46341,
    xC5S3 = 36410,
    xC6S2 = 25080,
    xC7S1 = 12785
};

/*
 * Multiply b by the 16-bit-scaled constant a and drop the 16 fractional bits.
 * The product is formed in SUINT (declared outside this file; presumably an
 * unsigned type so the multiplication cannot overflow a signed int - confirm)
 * and converted back to int before the shift.
 */
#define M(a, b) ((int)((SUINT)(a) * (b)) >> 16)
49
idct(uint8_t *dst, ptrdiff_t stride, int16_t *input, int type)50 static av_always_inline void idct(uint8_t *dst, ptrdiff_t stride,
51 int16_t *input, int type)
52 {
53 int16_t *ip = input;
54
55 int A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
56 int Ed, Gd, Add, Bdd, Fd, Hd;
57
58 int i;
59
60 /* Inverse DCT on the rows now */
61 for (i = 0; i < 8; i++) {
62 /* Check for non-zero values */
63 if (ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
64 ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8]) {
65 A = M(xC1S7, ip[1 * 8]) + M(xC7S1, ip[7 * 8]);
66 B = M(xC7S1, ip[1 * 8]) - M(xC1S7, ip[7 * 8]);
67 C = M(xC3S5, ip[3 * 8]) + M(xC5S3, ip[5 * 8]);
68 D = M(xC3S5, ip[5 * 8]) - M(xC5S3, ip[3 * 8]);
69
70 Ad = M(xC4S4, (A - C));
71 Bd = M(xC4S4, (B - D));
72
73 Cd = A + C;
74 Dd = B + D;
75
76 E = M(xC4S4, (ip[0 * 8] + ip[4 * 8]));
77 F = M(xC4S4, (ip[0 * 8] - ip[4 * 8]));
78
79 G = M(xC2S6, ip[2 * 8]) + M(xC6S2, ip[6 * 8]);
80 H = M(xC6S2, ip[2 * 8]) - M(xC2S6, ip[6 * 8]);
81
82 Ed = E - G;
83 Gd = E + G;
84
85 Add = F + Ad;
86 Bdd = Bd - H;
87
88 Fd = F - Ad;
89 Hd = Bd + H;
90
91 /* Final sequence of operations over-write original inputs. */
92 ip[0 * 8] = Gd + Cd;
93 ip[7 * 8] = Gd - Cd;
94
95 ip[1 * 8] = Add + Hd;
96 ip[2 * 8] = Add - Hd;
97
98 ip[3 * 8] = Ed + Dd;
99 ip[4 * 8] = Ed - Dd;
100
101 ip[5 * 8] = Fd + Bdd;
102 ip[6 * 8] = Fd - Bdd;
103 }
104
105 ip += 1; /* next row */
106 }
107
108 ip = input;
109
110 for (i = 0; i < 8; i++) {
111 /* Check for non-zero values (bitwise or faster than ||) */
112 if (ip[1] | ip[2] | ip[3] |
113 ip[4] | ip[5] | ip[6] | ip[7]) {
114 A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]);
115 B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]);
116 C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]);
117 D = M(xC3S5, ip[5]) - M(xC5S3, ip[3]);
118
119 Ad = M(xC4S4, (A - C));
120 Bd = M(xC4S4, (B - D));
121
122 Cd = A + C;
123 Dd = B + D;
124
125 E = M(xC4S4, (ip[0] + ip[4])) + 8;
126 F = M(xC4S4, (ip[0] - ip[4])) + 8;
127
128 if (type == 1) { // HACK
129 E += 16 * 128;
130 F += 16 * 128;
131 }
132
133 G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]);
134 H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]);
135
136 Ed = E - G;
137 Gd = E + G;
138
139 Add = F + Ad;
140 Bdd = Bd - H;
141
142 Fd = F - Ad;
143 Hd = Bd + H;
144
145 /* Final sequence of operations over-write original inputs. */
146 if (type == 1) {
147 dst[0 * stride] = av_clip_uint8((Gd + Cd) >> 4);
148 dst[7 * stride] = av_clip_uint8((Gd - Cd) >> 4);
149
150 dst[1 * stride] = av_clip_uint8((Add + Hd) >> 4);
151 dst[2 * stride] = av_clip_uint8((Add - Hd) >> 4);
152
153 dst[3 * stride] = av_clip_uint8((Ed + Dd) >> 4);
154 dst[4 * stride] = av_clip_uint8((Ed - Dd) >> 4);
155
156 dst[5 * stride] = av_clip_uint8((Fd + Bdd) >> 4);
157 dst[6 * stride] = av_clip_uint8((Fd - Bdd) >> 4);
158 } else {
159 dst[0 * stride] = av_clip_uint8(dst[0 * stride] + ((Gd + Cd) >> 4));
160 dst[7 * stride] = av_clip_uint8(dst[7 * stride] + ((Gd - Cd) >> 4));
161
162 dst[1 * stride] = av_clip_uint8(dst[1 * stride] + ((Add + Hd) >> 4));
163 dst[2 * stride] = av_clip_uint8(dst[2 * stride] + ((Add - Hd) >> 4));
164
165 dst[3 * stride] = av_clip_uint8(dst[3 * stride] + ((Ed + Dd) >> 4));
166 dst[4 * stride] = av_clip_uint8(dst[4 * stride] + ((Ed - Dd) >> 4));
167
168 dst[5 * stride] = av_clip_uint8(dst[5 * stride] + ((Fd + Bdd) >> 4));
169 dst[6 * stride] = av_clip_uint8(dst[6 * stride] + ((Fd - Bdd) >> 4));
170 }
171 } else {
172 if (type == 1) {
173 dst[0*stride] =
174 dst[1*stride] =
175 dst[2*stride] =
176 dst[3*stride] =
177 dst[4*stride] =
178 dst[5*stride] =
179 dst[6*stride] =
180 dst[7*stride] = av_clip_uint8(128 + ((xC4S4 * ip[0] + (IdctAdjustBeforeShift << 16)) >> 20));
181 } else {
182 if (ip[0]) {
183 int v = (xC4S4 * ip[0] + (IdctAdjustBeforeShift << 16)) >> 20;
184 dst[0 * stride] = av_clip_uint8(dst[0 * stride] + v);
185 dst[1 * stride] = av_clip_uint8(dst[1 * stride] + v);
186 dst[2 * stride] = av_clip_uint8(dst[2 * stride] + v);
187 dst[3 * stride] = av_clip_uint8(dst[3 * stride] + v);
188 dst[4 * stride] = av_clip_uint8(dst[4 * stride] + v);
189 dst[5 * stride] = av_clip_uint8(dst[5 * stride] + v);
190 dst[6 * stride] = av_clip_uint8(dst[6 * stride] + v);
191 dst[7 * stride] = av_clip_uint8(dst[7 * stride] + v);
192 }
193 }
194 }
195
196 ip += 8; /* next column */
197 dst++;
198 }
199 }
200
idct10(uint8_t *dst, ptrdiff_t stride, int16_t *input, int type)201 static av_always_inline void idct10(uint8_t *dst, ptrdiff_t stride,
202 int16_t *input, int type)
203 {
204 int16_t *ip = input;
205
206 int A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
207 int Ed, Gd, Add, Bdd, Fd, Hd;
208
209 int i;
210
211 /* Inverse DCT on the rows now */
212 for (i = 0; i < 4; i++) {
213 /* Check for non-zero values */
214 if (ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8]) {
215 A = M(xC1S7, ip[1 * 8]);
216 B = M(xC7S1, ip[1 * 8]);
217 C = M(xC3S5, ip[3 * 8]);
218 D = -M(xC5S3, ip[3 * 8]);
219
220 Ad = M(xC4S4, (A - C));
221 Bd = M(xC4S4, (B - D));
222
223 Cd = A + C;
224 Dd = B + D;
225
226 E = M(xC4S4, ip[0 * 8]);
227 F = E;
228
229 G = M(xC2S6, ip[2 * 8]);
230 H = M(xC6S2, ip[2 * 8]);
231
232 Ed = E - G;
233 Gd = E + G;
234
235 Add = F + Ad;
236 Bdd = Bd - H;
237
238 Fd = F - Ad;
239 Hd = Bd + H;
240
241 /* Final sequence of operations over-write original inputs */
242 ip[0 * 8] = Gd + Cd;
243 ip[7 * 8] = Gd - Cd;
244
245 ip[1 * 8] = Add + Hd;
246 ip[2 * 8] = Add - Hd;
247
248 ip[3 * 8] = Ed + Dd;
249 ip[4 * 8] = Ed - Dd;
250
251 ip[5 * 8] = Fd + Bdd;
252 ip[6 * 8] = Fd - Bdd;
253
254 }
255
256 ip += 1;
257 }
258
259 ip = input;
260
261 for (i = 0; i < 8; i++) {
262 /* Check for non-zero values (bitwise or faster than ||) */
263 if (ip[0] | ip[1] | ip[2] | ip[3]) {
264 A = M(xC1S7, ip[1]);
265 B = M(xC7S1, ip[1]);
266 C = M(xC3S5, ip[3]);
267 D = -M(xC5S3, ip[3]);
268
269 Ad = M(xC4S4, (A - C));
270 Bd = M(xC4S4, (B - D));
271
272 Cd = A + C;
273 Dd = B + D;
274
275 E = M(xC4S4, ip[0]);
276 if (type == 1)
277 E += 16 * 128;
278 F = E;
279
280 G = M(xC2S6, ip[2]);
281 H = M(xC6S2, ip[2]);
282
283 Ed = E - G;
284 Gd = E + G;
285
286 Add = F + Ad;
287 Bdd = Bd - H;
288
289 Fd = F - Ad;
290 Hd = Bd + H;
291
292 Gd += 8;
293 Add += 8;
294 Ed += 8;
295 Fd += 8;
296
297 /* Final sequence of operations over-write original inputs. */
298 if (type == 1) {
299 dst[0 * stride] = av_clip_uint8((Gd + Cd) >> 4);
300 dst[7 * stride] = av_clip_uint8((Gd - Cd) >> 4);
301
302 dst[1 * stride] = av_clip_uint8((Add + Hd) >> 4);
303 dst[2 * stride] = av_clip_uint8((Add - Hd) >> 4);
304
305 dst[3 * stride] = av_clip_uint8((Ed + Dd) >> 4);
306 dst[4 * stride] = av_clip_uint8((Ed - Dd) >> 4);
307
308 dst[5 * stride] = av_clip_uint8((Fd + Bdd) >> 4);
309 dst[6 * stride] = av_clip_uint8((Fd - Bdd) >> 4);
310 } else {
311 dst[0 * stride] = av_clip_uint8(dst[0 * stride] + ((Gd + Cd) >> 4));
312 dst[7 * stride] = av_clip_uint8(dst[7 * stride] + ((Gd - Cd) >> 4));
313
314 dst[1 * stride] = av_clip_uint8(dst[1 * stride] + ((Add + Hd) >> 4));
315 dst[2 * stride] = av_clip_uint8(dst[2 * stride] + ((Add - Hd) >> 4));
316
317 dst[3 * stride] = av_clip_uint8(dst[3 * stride] + ((Ed + Dd) >> 4));
318 dst[4 * stride] = av_clip_uint8(dst[4 * stride] + ((Ed - Dd) >> 4));
319
320 dst[5 * stride] = av_clip_uint8(dst[5 * stride] + ((Fd + Bdd) >> 4));
321 dst[6 * stride] = av_clip_uint8(dst[6 * stride] + ((Fd - Bdd) >> 4));
322 }
323 } else {
324 if (type == 1) {
325 dst[0*stride] =
326 dst[1*stride] =
327 dst[2*stride] =
328 dst[3*stride] =
329 dst[4*stride] =
330 dst[5*stride] =
331 dst[6*stride] =
332 dst[7*stride] = 128;
333 }
334 }
335
336 ip += 8;
337 dst++;
338 }
339 }
340
/**
 * Run idct10() in put mode (type 1) and then zero the coefficient block.
 *
 * @param dest   first destination sample
 * @param stride offset passed through to idct10()
 * @param block  coefficient block; all 64 entries are zero on return
 */
void ff_vp3dsp_idct10_put(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    idct10(dest, stride, block, 1);

    /* Leave the block cleared */
    memset(block, 0, 64 * sizeof(block[0]));
}
346
/**
 * Run idct10() in add mode (type 2) and then zero the coefficient block.
 *
 * @param dest   first destination sample; the result is added to it
 * @param stride offset passed through to idct10()
 * @param block  coefficient block; all 64 entries are zero on return
 */
void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    idct10(dest, stride, block, 2);

    /* Leave the block cleared */
    memset(block, 0, 64 * sizeof(block[0]));
}
352
/**
 * Run the full idct() in put mode (type 1) and then zero the coefficient
 * block.
 *
 * @param dest   first destination sample (annotated "align 8" in the original)
 * @param stride offset passed through to idct()
 * @param block  64 coefficients (annotated "align 16"); all zero on return
 */
static void vp3_idct_put_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
                           int16_t *block /* align 16 */)
{
    idct(dest, stride, block, 1);

    /* Leave the block cleared */
    memset(block, 0, 64 * sizeof(block[0]));
}
359
/**
 * Run the full idct() in add mode (type 2) and then zero the coefficient
 * block.
 *
 * @param dest   first destination sample (annotated "align 8" in the original);
 *               the result is added to it
 * @param stride offset passed through to idct()
 * @param block  64 coefficients (annotated "align 16"); all zero on return
 */
static void vp3_idct_add_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
                           int16_t *block /* align 16 */)
{
    idct(dest, stride, block, 2);

    /* Leave the block cleared */
    memset(block, 0, 64 * sizeof(block[0]));
}
366
/**
 * DC-only shortcut: add one constant to an 8x8 area of samples.
 *
 * The constant is (block[0] + 15) >> 5. Each sum is clipped to 0..255, and
 * block[0] is reset to zero afterwards; no other coefficient is read or
 * written.
 *
 * @param dest   first sample of the 8x8 area (annotated "align 8")
 * @param stride offset between consecutive lines of the area
 * @param block  coefficient block (annotated "align 16"); only block[0] is used
 */
static void vp3_idct_dc_add_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
                              int16_t *block /* align 16 */)
{
    const int dc = (block[0] + 15) >> 5;
    int line, x;

    for (line = 0; line < 8; line++, dest += stride)
        for (x = 0; x < 8; x++)
            dest[x] = av_clip_uint8(dest[x] + dc);

    block[0] = 0;
}
385
vp3_v_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values, int count)386 static av_always_inline void vp3_v_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
387 int *bounding_values, int count)
388 {
389 unsigned char *end;
390 int filter_value;
391 const ptrdiff_t nstride = -stride;
392
393 for (end = first_pixel + count; first_pixel < end; first_pixel++) {
394 filter_value = (first_pixel[2 * nstride] - first_pixel[stride]) +
395 (first_pixel[0] - first_pixel[nstride]) * 3;
396 filter_value = bounding_values[(filter_value + 4) >> 3];
397
398 first_pixel[nstride] = av_clip_uint8(first_pixel[nstride] + filter_value);
399 first_pixel[0] = av_clip_uint8(first_pixel[0] - filter_value);
400 }
401 }
402
vp3_h_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values, int count)403 static av_always_inline void vp3_h_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
404 int *bounding_values, int count)
405 {
406 unsigned char *end;
407 int filter_value;
408
409 for (end = first_pixel + count * stride; first_pixel != end; first_pixel += stride) {
410 filter_value = (first_pixel[-2] - first_pixel[1]) +
411 (first_pixel[ 0] - first_pixel[-1]) * 3;
412 filter_value = bounding_values[(filter_value + 4) >> 3];
413
414 first_pixel[-1] = av_clip_uint8(first_pixel[-1] + filter_value);
415 first_pixel[ 0] = av_clip_uint8(first_pixel[ 0] - filter_value);
416 }
417 }
418
/*
 * Expands to the definition of <pfx>_<dir>_loop_filter_<len><sfx>(), a thin
 * wrapper that calls vp3_<dir>_loop_filter_c() with a fixed count of len.
 * Any storage class has to be written in front of the macro invocation.
 */
#define LOOP_FILTER(pfx, sfx, dir, len)                                     \
void pfx##_##dir##_loop_filter_##len##sfx(uint8_t *first_pixel,             \
                                          ptrdiff_t stride,                 \
                                          int *bounding_values)             \
{                                                                           \
    vp3_##dir##_loop_filter_c(first_pixel, stride, bounding_values, len);   \
}

/* Count 8: vp3_v_loop_filter_8_c() and vp3_h_loop_filter_8_c(), file-local */
static LOOP_FILTER(vp3, _c, v, 8)
static LOOP_FILTER(vp3, _c, h, 8)

/* Count 12: ff_vp3dsp_v_loop_filter_12() and ff_vp3dsp_h_loop_filter_12() */
LOOP_FILTER(ff_vp3dsp, , v, 12)
LOOP_FILTER(ff_vp3dsp, , h, 12)
430
/**
 * Combine two sources with no_rnd_avg32() and store the result, 8 bytes per
 * line for h lines.
 *
 * Each line is processed as two 32-bit words. The sources are read with
 * AV_RN32 and the destination is written with AV_WN32A, so dst is expected
 * to satisfy whatever alignment AV_WN32A requires.
 *
 * @param dst    destination
 * @param src1   first source
 * @param src2   second source
 * @param stride offset between lines, shared by all three buffers
 * @param h      number of lines
 */
static void put_no_rnd_pixels_l2(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, ptrdiff_t stride, int h)
{
    ptrdiff_t line_off = 0;
    int line, x;

    for (line = 0; line < h; line++, line_off += stride) {
        /* Two 4-byte halves per line */
        for (x = 0; x < 8; x += 4) {
            const uint32_t p = AV_RN32(&src1[line_off + x]);
            const uint32_t q = AV_RN32(&src2[line_off + x]);

            AV_WN32A(&dst[line_off + x], no_rnd_avg32(p, q));
        }
    }
}
447
ff_vp3dsp_init(VP3DSPContext *c, int flags)448 av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
449 {
450 c->put_no_rnd_pixels_l2 = put_no_rnd_pixels_l2;
451
452 c->idct_put = vp3_idct_put_c;
453 c->idct_add = vp3_idct_add_c;
454 c->idct_dc_add = vp3_idct_dc_add_c;
455 c->v_loop_filter = c->v_loop_filter_unaligned = vp3_v_loop_filter_8_c;
456 c->h_loop_filter = c->h_loop_filter_unaligned = vp3_h_loop_filter_8_c;
457
458 #if ARCH_ARM
459 ff_vp3dsp_init_arm(c, flags);
460 #elif ARCH_PPC
461 ff_vp3dsp_init_ppc(c, flags);
462 #elif ARCH_X86
463 ff_vp3dsp_init_x86(c, flags);
464 #elif ARCH_MIPS
465 ff_vp3dsp_init_mips(c, flags);
466 #endif
467 }
468
/*
 * This function initializes the loop filter boundary limits if the frame's
 * quality index is different from the previous frame's.
 *
 * bounding_values_array must have room for at least 258 ints: the first 256
 * entries are cleared and filled as a table centered on index 127, and
 * entries 256 and 257 are additionally written with filter_limit * 0x02020202.
 *
 * The filter_limit value may not be larger than 127.
 */
void ff_vp3dsp_set_bounding_values(int * bounding_values_array, int filter_limit)
{
    /* Table is addressed relative to element 127, so indices may be negative */
    int *const center = bounding_values_array + 127;
    int packed;
    int value;
    int x;

    /* The unsigned comparison rejects negative limits as well */
    av_assert0(filter_limit < 128U);

    /* Start from an all-zero table of 256 entries */
    memset(bounding_values_array, 0, 256 * sizeof(int));

    /* Identity part: |x| < filter_limit maps to x */
    for (x = 0; x < filter_limit; x++) {
        center[-x] = -x;
        center[x]  = x;
    }

    /* Falling part: from filter_limit back down towards zero, mirrored */
    x     = filter_limit;
    value = filter_limit;
    while (x < 128 && value) {
        center[x]  = value;
        center[-x] = -value;
        x++;
        value--;
    }

    /* The ramp was cut off at 127; index 128 exists on the positive side only */
    if (value)
        center[128] = value;

    /* filter_limit replicated (doubled) into each byte, stored twice */
    packed      = filter_limit * 0x02020202U;
    center[130] = packed;
    center[129] = packed;
}
499