1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use git log
75
76 #include "config.h"
77 #include "libavutil/avutil.h"
78 #include "libavutil/avassert.h"
79 #include "libavutil/cpu.h"
80 #include "libavutil/intreadwrite.h"
81 #include <inttypes.h>
82 #include <stdio.h>
83 #include <stdlib.h>
84 #include <string.h>
85 //#undef HAVE_MMXEXT_INLINE
86 //#define HAVE_AMD3DNOW_INLINE
87 //#undef HAVE_MMX_INLINE
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
92 #include "libavutil/avstring.h"
93 #include "libavutil/ppc/util_altivec.h"
94
/* Size in bytes of the scratch buffer pp_get_mode_by_name_and_quality() copies the mode string into. */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter array that collects option strings the generic parser did not recognise. */
#define OPTIONS_ARRAY_SIZE 10
/* Number of rows (and, for the vertical checks, columns) the C block filters in this file iterate over. */
#define BLOCK_SIZE 8
/* Not referenced in the code visible in this file; presumably used by the included templates -- confirm. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
100
#if ARCH_X86 && HAVE_INLINE_ASM
/*
 * 64-bit constants, only compiled for x86 builds with inline assembly.
 * The wNN values repeat a 16-bit word four times, the bNN values repeat a
 * byte eight times. They are not referenced by the C code in this file;
 * presumably the inline-assembly templates included further down use them
 * -- confirm against postprocess_template.c.
 */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* Declared for every architecture; presumably the threshold used by the dering filter in the templates -- confirm. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
113
114
/*
 * Filters understood by pp_get_mode_by_name_and_quality().
 *
 * Each entry is: short name, long name, three integers, mode bit mask.
 * The parser reads the members shortName, longName, chromDefault,
 * minLumQuality, minChromQuality and mask.
 * NOTE(review): the three integers presumably initialise chromDefault,
 * minLumQuality and minChromQuality in that order -- confirm against the
 * declaration of struct PPFilter.
 *
 * The table ends with an entry whose shortName is NULL.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering", 1, 5, 6, DERING},
    {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
    {"be", "bitexact", 1, 0, 0, BITEXACT},
    {"vi", "visualize", 1, 0, 0, VISUALIZE},
    {NULL, NULL,0,0,0,0} //End Marker
};
139
/*
 * Alias table used by pp_get_mode_by_name_and_quality(): a flat list of
 * (name, expansion) string pairs. When a filter name equals an even-indexed
 * entry, the following entry is spliced into the mode string in its place.
 * The list ends with a NULL name.
 */
static const char * const replaceTable[]=
{
    "default", "hb:a,vb:a,dr:a",
    "de", "hb:a,vb:a,dr:a",
    "fast", "h1:a,v1:a,dr:a",
    "fa", "h1:a,v1:a,dr:a",
    "ac", "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
149
150 /* The horizontal functions exist only in C because the MMX
151 * code is faster with vertical filters and transposing. */
152
153 /**
154 * Check if the given 8x8 Block is mostly "flat"
155 */
isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)156 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
157 {
158 int numEq= 0;
159 int y;
160 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
161 const int dcThreshold= dcOffset*2 + 1;
162
163 for(y=0; y<BLOCK_SIZE; y++){
164 numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
165 numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
166 numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
167 numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
168 numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
169 numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
170 numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
171 src+= stride;
172 }
173 return numEq > c->ppMode.flatnessThreshold;
174 }
175
176 /**
177 * Check if the middle 8x8 Block in the given 8x16 block is flat
178 */
isVertDC_C(const uint8_t src[], int stride, const PPContext *c)179 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
180 {
181 int numEq= 0;
182 int y;
183 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
184 const int dcThreshold= dcOffset*2 + 1;
185
186 src+= stride*4; // src points to begin of the 8x8 Block
187 for(y=0; y<BLOCK_SIZE-1; y++){
188 numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
189 numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
190 numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
191 numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
192 numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
193 numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
194 numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
195 numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
196 src+= stride;
197 }
198 return numEq > c->ppMode.flatnessThreshold;
199 }
200
/**
 * Sparse horizontal range check over 8 rows.
 *
 * One pair of pixels is sampled per row; the check fails as soon as a
 * sampled pair differs by more than 2*QP in either direction.
 *
 * @param src    first pixel of the first row
 * @param stride distance in bytes between two rows
 * @param QP     quantizer the tolerance is derived from
 * @return 1 if every sampled pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    /* columns compared on rows 0..3; the same pattern repeats on rows 4..7 */
    static const uint8_t columns[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;
        const int diff = line[columns[row & 3][0]] - line[columns[row & 3][1]];

        // the unsigned cast also rejects differences below -2*QP
        if ((unsigned)(diff + 2 * QP) > 4 * QP)
            return 0;
    }
    return 1;
}
216
isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)217 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
218 {
219 int x;
220 src+= stride*4;
221 for(x=0; x<BLOCK_SIZE; x+=4){
222 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
223 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
224 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
225 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
226 }
227 return 1;
228 }
229
horizClassify_C(const uint8_t src[], int stride, const PPContext *c)230 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
231 {
232 if( isHorizDC_C(src, stride, c) ){
233 return isHorizMinMaxOk_C(src, stride, c->QP);
234 }else{
235 return 2;
236 }
237 }
238
vertClassify_C(const uint8_t src[], int stride, const PPContext *c)239 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
240 {
241 if( isVertDC_C(src, stride, c) ){
242 return isVertMinMaxOk_C(src, stride, c->QP);
243 }else{
244 return 2;
245 }
246 }
247
doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)248 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
249 {
250 int y;
251 for(y=0; y<BLOCK_SIZE; y++){
252 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
253
254 if(FFABS(middleEnergy) < 8*c->QP){
255 const int q=(dst[3] - dst[4])/2;
256 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
257 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
258
259 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
260 d= FFMAX(d, 0);
261
262 d= (5*d + 32) >> 6;
263 d*= FFSIGN(-middleEnergy);
264
265 if(q>0)
266 {
267 d = FFMAX(d, 0);
268 d = FFMIN(d, q);
269 }
270 else
271 {
272 d = FFMIN(d, 0);
273 d = FFMAX(d, q);
274 }
275
276 dst[3]-= d;
277 dst[4]+= d;
278 }
279 dst+= stride;
280 }
281 }
282
283 /**
284 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
285 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
286 */
doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)287 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
288 {
289 int y;
290 for(y=0; y<BLOCK_SIZE; y++){
291 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
292 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
293
294 int sums[10];
295 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
296 sums[1] = sums[0] - first + dst[3];
297 sums[2] = sums[1] - first + dst[4];
298 sums[3] = sums[2] - first + dst[5];
299 sums[4] = sums[3] - first + dst[6];
300 sums[5] = sums[4] - dst[0] + dst[7];
301 sums[6] = sums[5] - dst[1] + last;
302 sums[7] = sums[6] - dst[2] + last;
303 sums[8] = sums[7] - dst[3] + last;
304 sums[9] = sums[8] - dst[4] + last;
305
306 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
307 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
308 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
309 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
310 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
311 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
312 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
313 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
314
315 dst+= stride;
316 }
317 }
318
319 /**
320 * Experimental Filter 1 (Horizontal)
321 * will not damage linear gradients
322 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
323 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
324 * MMX2 version does correct clipping C version does not
325 * not identical with the vertical one
326 */
horizX1Filter(uint8_t *src, int stride, int QP)327 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
328 {
329 int y;
330 static uint64_t lut[256];
331 if(!lut[255])
332 {
333 int i;
334 for(i=0; i<256; i++)
335 {
336 int v= i < 128 ? 2*i : 2*(i-256);
337 /*
338 //Simulate 112242211 9-Tap filter
339 uint64_t a= (v/16) & 0xFF;
340 uint64_t b= (v/8) & 0xFF;
341 uint64_t c= (v/4) & 0xFF;
342 uint64_t d= (3*v/8) & 0xFF;
343 */
344 //Simulate piecewise linear interpolation
345 uint64_t a= (v/16) & 0xFF;
346 uint64_t b= (v*3/16) & 0xFF;
347 uint64_t c= (v*5/16) & 0xFF;
348 uint64_t d= (7*v/16) & 0xFF;
349 uint64_t A= (0x100 - a)&0xFF;
350 uint64_t B= (0x100 - b)&0xFF;
351 uint64_t C= (0x100 - c)&0xFF;
352 uint64_t D= (0x100 - c)&0xFF;
353
354 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
355 (D<<24) | (C<<16) | (B<<8) | (A);
356 //lut[i] = (v<<32) | (v<<24);
357 }
358 }
359
360 for(y=0; y<BLOCK_SIZE; y++){
361 int a= src[1] - src[2];
362 int b= src[3] - src[4];
363 int c= src[5] - src[6];
364
365 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
366
367 if(d < QP){
368 int v = d * FFSIGN(-b);
369
370 src[1] +=v/8;
371 src[2] +=v/4;
372 src[3] +=3*v/8;
373 src[4] -=3*v/8;
374 src[5] -=v/4;
375 src[6] -=v/8;
376 }
377 src+=stride;
378 }
379 }
380
381 /**
382 * accurate deblock filter
383 */
do_a_deblock_C(uint8_t *src, int step, int stride, const PPContext *c, int mode)384 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
385 int stride, const PPContext *c, int mode)
386 {
387 int y;
388 const int QP= c->QP;
389 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
390 const int dcThreshold= dcOffset*2 + 1;
391
392 src+= step*4; // src points to begin of the 8x8 Block
393 for(y=0; y<8; y++){
394 int numEq= 0;
395
396 numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
397 numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
398 numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
399 numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
400 numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
401 numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
402 numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
403 numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
404 numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
405 if(numEq > c->ppMode.flatnessThreshold){
406 int min, max, x;
407
408 if(src[0] > src[step]){
409 max= src[0];
410 min= src[step];
411 }else{
412 max= src[step];
413 min= src[0];
414 }
415 for(x=2; x<8; x+=2){
416 if(src[x*step] > src[(x+1)*step]){
417 if(src[x *step] > max) max= src[ x *step];
418 if(src[(x+1)*step] < min) min= src[(x+1)*step];
419 }else{
420 if(src[(x+1)*step] > max) max= src[(x+1)*step];
421 if(src[ x *step] < min) min= src[ x *step];
422 }
423 }
424 if(max-min < 2*QP){
425 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
426 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
427
428 int sums[10];
429 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
430 sums[1] = sums[0] - first + src[3*step];
431 sums[2] = sums[1] - first + src[4*step];
432 sums[3] = sums[2] - first + src[5*step];
433 sums[4] = sums[3] - first + src[6*step];
434 sums[5] = sums[4] - src[0*step] + src[7*step];
435 sums[6] = sums[5] - src[1*step] + last;
436 sums[7] = sums[6] - src[2*step] + last;
437 sums[8] = sums[7] - src[3*step] + last;
438 sums[9] = sums[8] - src[4*step] + last;
439
440 if (mode & VISUALIZE) {
441 src[0*step] =
442 src[1*step] =
443 src[2*step] =
444 src[3*step] =
445 src[4*step] =
446 src[5*step] =
447 src[6*step] =
448 src[7*step] = 128;
449 }
450 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
451 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
452 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
453 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
454 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
455 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
456 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
457 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
458 }
459 }else{
460 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
461
462 if(FFABS(middleEnergy) < 8*QP){
463 const int q=(src[3*step] - src[4*step])/2;
464 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
465 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
466
467 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
468 d= FFMAX(d, 0);
469
470 d= (5*d + 32) >> 6;
471 d*= FFSIGN(-middleEnergy);
472
473 if(q>0){
474 d = FFMAX(d, 0);
475 d = FFMIN(d, q);
476 }else{
477 d = FFMIN(d, 0);
478 d = FFMAX(d, q);
479 }
480
481 if ((mode & VISUALIZE) && d) {
482 d= (d < 0) ? 32 : -32;
483 src[3*step]= av_clip_uint8(src[3*step] - d);
484 src[4*step]= av_clip_uint8(src[4*step] + d);
485 d = 0;
486 }
487
488 src[3*step]-= d;
489 src[4*step]+= d;
490 }
491 }
492
493 src += stride;
494 }
495 }
496
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
//Plain C versions
//we always compile C for testing which needs bitexactness
// postprocess_template.c is included once per variant, each time with a
// different TEMPLATE_PP_* macro defined. The postProcess_<variant> functions
// that postProcess() selects from are not defined in this file, so they are
// presumably produced by these inclusions -- confirm against the template.
#define TEMPLATE_PP_C 1
#include "postprocess_template.c"

#if HAVE_ALTIVEC
#   define TEMPLATE_PP_ALTIVEC 1
#   include "postprocess_altivec_template.c"
#   include "postprocess_template.c"
#endif

#if ARCH_X86 && HAVE_INLINE_ASM
#    if CONFIG_RUNTIME_CPUDETECT
// runtime detection: build every x86 variant
#        define TEMPLATE_PP_MMX 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_MMXEXT 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_3DNOW 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_SSE2 1
#        include "postprocess_template.c"
#    else
// no runtime detection: build only the first variant the build configuration enables
#        if HAVE_SSE2_INLINE
#            define TEMPLATE_PP_SSE2 1
#            include "postprocess_template.c"
#        elif HAVE_MMXEXT_INLINE
#            define TEMPLATE_PP_MMXEXT 1
#            include "postprocess_template.c"
#        elif HAVE_AMD3DNOW_INLINE
#            define TEMPLATE_PP_3DNOW 1
#            include "postprocess_template.c"
#        elif HAVE_MMX_INLINE
#            define TEMPLATE_PP_MMX 1
#            include "postprocess_template.c"
#        endif
#    endif
#endif
535
/* Signature shared by the postProcess_<variant> implementations that postProcess() dispatches to. */
typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                      const int8_t QPs[], int QPStride, int isColor, PPContext *c2);
538
/**
 * Filter one plane with the implementation suited to the current CPU.
 *
 * The mode is copied into the context, then one postProcess_<variant>
 * function is chosen and called. The plain C variant is used when the mode
 * requests BITEXACT output or when no other variant is available.
 *
 * @param vm mode, treated as a PPMode
 * @param vc context, treated as a PPContext
 * The remaining arguments are forwarded to the selected implementation unchanged.
 */
static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                               const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
{
    PPContext *ctx    = (PPContext *)vc;
    PPMode *requested = (PPMode *)vm;
    pp_fn impl        = postProcess_C;
    const int bitexact = requested->lumMode & BITEXACT;

    ctx->ppMode = *requested; //FIXME

    if (!bitexact) {
#if CONFIG_RUNTIME_CPUDETECT
#if ARCH_X86 && HAVE_INLINE_ASM
        // ordered per speed fastest first
        if      (ctx->cpuCaps & AV_CPU_FLAG_SSE2)    impl = postProcess_SSE2;
        else if (ctx->cpuCaps & AV_CPU_FLAG_MMXEXT)  impl = postProcess_MMX2;
        else if (ctx->cpuCaps & AV_CPU_FLAG_3DNOW)   impl = postProcess_3DNow;
        else if (ctx->cpuCaps & AV_CPU_FLAG_MMX)     impl = postProcess_MMX;
#elif HAVE_ALTIVEC
        if      (ctx->cpuCaps & AV_CPU_FLAG_ALTIVEC) impl = postProcess_altivec;
#endif
#else /* CONFIG_RUNTIME_CPUDETECT */
#if     HAVE_SSE2_INLINE
        impl = postProcess_SSE2;
#elif   HAVE_MMXEXT_INLINE
        impl = postProcess_MMX2;
#elif HAVE_AMD3DNOW_INLINE
        impl = postProcess_3DNow;
#elif HAVE_MMX_INLINE
        impl = postProcess_MMX;
#elif HAVE_ALTIVEC
        impl = postProcess_altivec;
#endif
#endif /* !CONFIG_RUNTIME_CPUDETECT */
    }

    impl(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, ctx);
}
575
/* -pp Command line Help
 *
 * Logged line by line by pp_get_mode_by_name_and_quality() when the mode
 * string is "help". The names listed here must match the entries of
 * filters[] and replaceTable[] exactly: the parser compares them with
 * strcmp(), so the comparison is case-sensitive.
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
621
pp_get_mode_by_name_and_quality(const char *name, int quality)622 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
623 {
624 char temp[GET_MODE_BUFFER_SIZE];
625 char *p= temp;
626 static const char filterDelimiters[] = ",/";
627 static const char optionDelimiters[] = ":|";
628 struct PPMode *ppMode;
629 char *filterToken;
630
631 if (!name) {
632 av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
633 return NULL;
634 }
635
636 if (!strcmp(name, "help")) {
637 const char *p;
638 for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
639 av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
640 av_log(NULL, AV_LOG_INFO, "%s", temp);
641 }
642 return NULL;
643 }
644
645 ppMode= av_malloc(sizeof(PPMode));
646 if (!ppMode)
647 return NULL;
648
649 ppMode->lumMode= 0;
650 ppMode->chromMode= 0;
651 ppMode->maxTmpNoise[0]= 700;
652 ppMode->maxTmpNoise[1]= 1500;
653 ppMode->maxTmpNoise[2]= 3000;
654 ppMode->maxAllowedY= 234;
655 ppMode->minAllowedY= 16;
656 ppMode->baseDcDiff= 256/8;
657 ppMode->flatnessThreshold= 56-16-1;
658 ppMode->maxClippedThreshold= (AVRational){1,100};
659 ppMode->error=0;
660
661 memset(temp, 0, GET_MODE_BUFFER_SIZE);
662 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
663
664 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
665
666 for(;;){
667 const char *filterName;
668 int q= 1000000; //PP_QUALITY_MAX;
669 int chrom=-1;
670 int luma=-1;
671 const char *option;
672 const char *options[OPTIONS_ARRAY_SIZE];
673 int i;
674 int filterNameOk=0;
675 int numOfUnknownOptions=0;
676 int enable=1; //does the user want us to enabled or disabled the filter
677 char *tokstate;
678
679 filterToken= av_strtok(p, filterDelimiters, &tokstate);
680 if(!filterToken) break;
681 p+= strlen(filterToken) + 1; // p points to next filterToken
682 filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
683 if (!filterName) {
684 ppMode->error++;
685 break;
686 }
687 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
688
689 if(*filterName == '-'){
690 enable=0;
691 filterName++;
692 }
693
694 for(;;){ //for all options
695 option= av_strtok(NULL, optionDelimiters, &tokstate);
696 if(!option) break;
697
698 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
699 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
700 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
701 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
702 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
703 else{
704 options[numOfUnknownOptions] = option;
705 numOfUnknownOptions++;
706 }
707 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
708 }
709 options[numOfUnknownOptions] = NULL;
710
711 /* replace stuff from the replace Table */
712 for(i=0; replaceTable[2*i]; i++){
713 if(!strcmp(replaceTable[2*i], filterName)){
714 size_t newlen = strlen(replaceTable[2*i + 1]);
715 int plen;
716 int spaceLeft;
717
718 p--, *p=',';
719
720 plen= strlen(p);
721 spaceLeft= p - temp + plen;
722 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
723 ppMode->error++;
724 break;
725 }
726 memmove(p + newlen, p, plen+1);
727 memcpy(p, replaceTable[2*i + 1], newlen);
728 filterNameOk=1;
729 }
730 }
731
732 for(i=0; filters[i].shortName; i++){
733 if( !strcmp(filters[i].longName, filterName)
734 || !strcmp(filters[i].shortName, filterName)){
735 ppMode->lumMode &= ~filters[i].mask;
736 ppMode->chromMode &= ~filters[i].mask;
737
738 filterNameOk=1;
739 if(!enable) break; // user wants to disable it
740
741 if(q >= filters[i].minLumQuality && luma)
742 ppMode->lumMode|= filters[i].mask;
743 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
744 if(q >= filters[i].minChromQuality)
745 ppMode->chromMode|= filters[i].mask;
746
747 if(filters[i].mask == LEVEL_FIX){
748 int o;
749 ppMode->minAllowedY= 16;
750 ppMode->maxAllowedY= 234;
751 for(o=0; options[o]; o++){
752 if( !strcmp(options[o],"fullyrange")
753 ||!strcmp(options[o],"f")){
754 ppMode->minAllowedY= 0;
755 ppMode->maxAllowedY= 255;
756 numOfUnknownOptions--;
757 }
758 }
759 }
760 else if(filters[i].mask == TEMP_NOISE_FILTER)
761 {
762 int o;
763 int numOfNoises=0;
764
765 for(o=0; options[o]; o++){
766 char *tail;
767 ppMode->maxTmpNoise[numOfNoises]=
768 strtol(options[o], &tail, 0);
769 if(tail!=options[o]){
770 numOfNoises++;
771 numOfUnknownOptions--;
772 if(numOfNoises >= 3) break;
773 }
774 }
775 }
776 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
777 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
778 int o;
779
780 for(o=0; options[o] && o<2; o++){
781 char *tail;
782 int val= strtol(options[o], &tail, 0);
783 if(tail==options[o]) break;
784
785 numOfUnknownOptions--;
786 if(o==0) ppMode->baseDcDiff= val;
787 else ppMode->flatnessThreshold= val;
788 }
789 }
790 else if(filters[i].mask == FORCE_QUANT){
791 int o;
792 ppMode->forcedQuant= 15;
793
794 for(o=0; options[o] && o<1; o++){
795 char *tail;
796 int val= strtol(options[o], &tail, 0);
797 if(tail==options[o]) break;
798
799 numOfUnknownOptions--;
800 ppMode->forcedQuant= val;
801 }
802 }
803 }
804 }
805 if(!filterNameOk) ppMode->error++;
806 ppMode->error += numOfUnknownOptions;
807 }
808
809 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
810 if(ppMode->error){
811 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
812 av_free(ppMode);
813 return NULL;
814 }
815 return ppMode;
816 }
817
/**
 * Release a mode obtained from pp_get_mode_by_name_and_quality().
 *
 * @param mode mode to free; handed to av_free() as is
 */
void pp_free_mode(pp_mode *mode)
{
    av_free(mode);
}
821
/**
 * Replace the buffer *p with a new zero-filled one of the given size.
 *
 * Despite the name, the old contents are not preserved: the old buffer is
 * freed before the new one is requested from av_mallocz().
 *
 * @param p    address of the buffer pointer to replace
 * @param size size in bytes of the new buffer
 */
static void reallocAlign(void **p, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
826
/**
 * (Re)allocate every work buffer of the context for the given geometry.
 *
 * All buffers are replaced by zero-filled ones (see reallocAlign()), so any
 * previous contents are lost. c->stride and c->qpStride are updated to the
 * values passed in.
 *
 * This function cannot report allocation failures: a buffer that could not
 * be allocated is left as a NULL pointer in the context.
 *
 * @param c        context whose buffers are replaced
 * @param width    picture width in pixels
 * @param height   picture height in pixels
 * @param stride   line size the buffers are dimensioned for
 * @param qpStride line size of the QP tables
 */
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
    int mbWidth = (width+15)>>4;
    int mbHeight= (height+15)>>4;
    int i;

    c->stride= stride;
    c->qpStride= qpStride;

    reallocAlign((void **)&c->tempDst, stride*24+32);
    reallocAlign((void **)&c->tempSrc, stride*24);
    reallocAlign((void **)&c->tempBlocks, 2*16*8);
    reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
    // the allocation may have failed; do not write through a NULL pointer
    if(c->yHistogram){
        // 64-bit arithmetic: width*height can exceed the range of int
        const int64_t initial= (int64_t)width*height/64*15/256;
        for(i=0; i<256; i++)
            c->yHistogram[i]= initial;
    }

    for(i=0; i<3; i++){
        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
        reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
        reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
    }

    reallocAlign((void **)&c->deintTemp, 2*width+32);
    reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(int8_t));
    reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(int8_t));
    reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(int8_t));
}
853
/**
 * Name callback installed in av_codec_context_class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    static const char name[] = "postproc";

    (void)ptr;
    return name;
}
857
/* Class descriptor stored in every PPContext by pp_get_context(); the
 * context is later passed to av_log(), which presumably uses this class to
 * label the messages -- confirm against the AVClass documentation. */
static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
859
pp_get_context(int width, int height, int cpuCaps)860 av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
861 PPContext *c= av_mallocz(sizeof(PPContext));
862 int stride= FFALIGN(width, 16); //assumed / will realloc if needed
863 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
864
865 if (!c)
866 return NULL;
867
868 c->av_class = &av_codec_context_class;
869 if(cpuCaps&PP_FORMAT){
870 c->hChromaSubSample= cpuCaps&0x3;
871 c->vChromaSubSample= (cpuCaps>>4)&0x3;
872 }else{
873 c->hChromaSubSample= 1;
874 c->vChromaSubSample= 1;
875 }
876 if (cpuCaps & PP_CPU_CAPS_AUTO) {
877 c->cpuCaps = av_get_cpu_flags();
878 } else {
879 c->cpuCaps = 0;
880 if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
881 if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
882 if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
883 if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
884 }
885
886 reallocBuffers(c, width, height, stride, qpStride);
887
888 c->frameNum=-1;
889
890 return c;
891 }
892
pp_free_context(void *vc)893 av_cold void pp_free_context(void *vc){
894 PPContext *c = (PPContext*)vc;
895 int i;
896
897 for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
898 av_free(c->tempBlurred[i]);
899 for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
900 av_free(c->tempBlurredPast[i]);
901
902 av_free(c->tempBlocks);
903 av_free(c->yHistogram);
904 av_free(c->tempDst);
905 av_free(c->tempSrc);
906 av_free(c->deintTemp);
907 av_free(c->stdQPTable);
908 av_free(c->nonBQPTable);
909 av_free(c->forcedQPTable);
910
911 memset(c, 0, sizeof(PPContext));
912
913 av_free(c);
914 }
915
/**
 * Postprocess one picture.
 *
 * Steps, in order:
 *  - grow the context's work buffers if the luma strides or the QP stride
 *    exceed what the context was sized for;
 *  - without a QP table, or when the mode forces a quantizer, use a one-row
 *    table filled with the forced quantizer (or 1) and a QP stride of 0;
 *  - with PP_PICT_TYPE_QP2 set in pict_type, halve the QP values into the
 *    context's own table and continue with that table;
 *  - unless (pict_type & 7) == 3, store the QP values masked with 0x3F in
 *    c->nonBQPTable;
 *  - filter the luma plane, then, if both chroma planes are present on input
 *    and output, either filter them (mode->chromMode != 0) or copy them.
 *
 * @param src        the three source planes
 * @param srcStride  line sizes of the source planes
 * @param dst        the three destination planes
 * @param dstStride  line sizes of the destination planes
 * @param width      luma width in pixels
 * @param height     luma height in pixels
 * @param QP_store   per-macroblock quantizers, may be NULL
 * @param QPStride   line size of QP_store, may be negative
 * @param vm         mode, treated as a PPMode
 * @param vc         context, treated as a PPContext
 * @param pict_type  picture type bits as described above
 */
void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
                    uint8_t * dst[3], const int dstStride[3],
                    int width, int height,
                    const int8_t *QP_store, int QPStride,
                    pp_mode *vm, void *vc, int pict_type)
{
    const int mbWidth   = (width  + 15) >> 4;
    const int mbHeight  = (height + 15) >> 4;
    PPMode *mode        = vm;
    PPContext *c        = vc;
    const int minStride = FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
    int absQPStride     = FFABS(QPStride);
    int i;

    // c->stride and c->QPStride are always positive
    if (c->stride < minStride || c->qpStride < absQPStride) {
        reallocBuffers(c, width, height,
                       FFMAX(minStride, c->stride),
                       FFMAX(c->qpStride, absQPStride));
    }

    if (!QP_store || (mode->lumMode & FORCE_QUANT)) {
        // a single row of identical values, reused for every row via stride 0
        const int fill = (mode->lumMode & FORCE_QUANT) ? mode->forcedQuant : 1;

        QP_store    = c->forcedQPTable;
        absQPStride = QPStride = 0;
        for (i = 0; i < mbWidth; i++)
            c->forcedQPTable[i] = fill;
    }

    if (pict_type & PP_PICT_TYPE_QP2) {
        const int count = FFMAX(mbHeight * absQPStride, mbWidth);

        // four values at a time, then the remainder one by one
        for (i = 0; i < (count >> 2); i++)
            AV_WN32(c->stdQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) >> 1 & 0x7F7F7F7F);
        for (i <<= 2; i < count; i++)
            c->stdQPTable[i] = QP_store[i] >> 1;

        QP_store = c->stdQPTable;
        QPStride = absQPStride;
    }

    if (0) {
        // disabled debug dump of the QP table
        int x, y;
        for (y = 0; y < mbHeight; y++) {
            for (x = 0; x < mbWidth; x++)
                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
            av_log(c, AV_LOG_INFO, "\n");
        }
        av_log(c, AV_LOG_INFO, "\n");
    }

    if ((pict_type & 7) != 3) {
        if (QPStride >= 0) {
            const int count = FFMAX(mbHeight * QPStride, mbWidth);

            // four values at a time, then the remainder one by one
            for (i = 0; i < (count >> 2); i++)
                AV_WN32(c->nonBQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) & 0x3F3F3F3F);
            for (i <<= 2; i < count; i++)
                c->nonBQPTable[i] = QP_store[i] & 0x3F;
        } else {
            // negative stride: copy row by row into a table with positive stride
            int j;
            for (i = 0; i < mbHeight; i++)
                for (j = 0; j < absQPStride; j++)
                    c->nonBQPTable[i*absQPStride + j] = QP_store[i*QPStride + j] & 0x3F;
        }
    }

    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
           mode->lumMode, mode->chromMode);

    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                width, height, QP_store, QPStride, 0, mode, c);

    // nothing more to do unless both chroma planes exist on both sides
    if (!(src[1] && src[2] && dst[1] && dst[2]))
        return;

    width  = (width ) >> c->hChromaSubSample;
    height = (height) >> c->vChromaSubSample;

    if (mode->chromMode) {
        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                    width, height, QP_store, QPStride, 1, mode, c);
        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                    width, height, QP_store, QPStride, 2, mode, c);
    } else if (srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]) {
        linecpy(dst[1], src[1], height, srcStride[1]);
        linecpy(dst[2], src[2], height, srcStride[2]);
    } else {
        // strides differ: copy the visible part of each row separately
        int y;
        for (y = 0; y < height; y++) {
            memcpy(dst[1] + y*dstStride[1], src[1] + y*srcStride[1], width);
            memcpy(dst[2] + y*dstStride[2], src[2] + y*srcStride[2], width);
        }
    }
}
1018