1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /**
24  * @file
25  * postprocessing.
26  */
27 
28 /*
29                         C       MMX     MMX2    3DNow   AltiVec
30 isVertDC                Ec      Ec                      Ec
31 isVertMinMaxOk          Ec      Ec                      Ec
32 doVertLowPass           E               e       e       Ec
33 doVertDefFilter         Ec      Ec      e       e       Ec
34 isHorizDC               Ec      Ec                      Ec
35 isHorizMinMaxOk         a       E                       Ec
36 doHorizLowPass          E               e       e       Ec
37 doHorizDefFilter        Ec      Ec      e       e       Ec
38 do_a_deblock            Ec      E       Ec      E
39 deRing                  E               e       e*      Ecp
40 Vertical RKAlgo1        E               a       a
41 Horizontal RKAlgo1                      a       a
42 Vertical X1#            a               E       E
43 Horizontal X1#          a               E       E
44 LinIpolDeinterlace      e               E       E*
45 CubicIpolDeinterlace    a               e       e*
46 LinBlendDeinterlace     e               E       E*
47 MedianDeinterlace#      E       Ec      Ec
48 TempDeNoiser#           E               e       e       Ec
49 
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58 
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66         (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73 
74 //Changelog: use git log
75 
76 #include "config.h"
77 #include "libavutil/avutil.h"
78 #include "libavutil/avassert.h"
79 #include "libavutil/cpu.h"
80 #include "libavutil/intreadwrite.h"
81 #include <inttypes.h>
82 #include <stdio.h>
83 #include <stdlib.h>
84 #include <string.h>
85 //#undef HAVE_MMXEXT_INLINE
86 //#define HAVE_AMD3DNOW_INLINE
87 //#undef HAVE_MMX_INLINE
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
92 #include "libavutil/avstring.h"
93 #include "libavutil/ppc/util_altivec.h"
94 
/* Capacity of the scratch buffer into which pp_get_mode_by_name_and_quality()
 * copies the filter string before tokenizing it. */
#define GET_MODE_BUFFER_SIZE    500

/* Number of slots in the per-filter array of not-yet-recognized option
 * strings; the parser stops collecting one short of this so the NULL
 * terminator always fits. */
#define OPTIONS_ARRAY_SIZE      10

/* Edge length, in pixels, of the square blocks the filters iterate over. */
#define BLOCK_SIZE              8

/* Not referenced in the visible part of this file; presumably consumed by
 * the included templates -- confirm there before changing it. */
#define TEMP_STRIDE             8
100 
101 #if ARCH_X86 && HAVE_INLINE_ASM
102 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
103 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
104 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
105 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
106 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
107 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
108 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
109 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
110 #endif
111 
112 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
113 
114 
115 static const struct PPFilter filters[]=
116 {
117     {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
118     {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
119 /*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
120     {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
121     {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
122     {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
123     {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
124     {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
125     {"dr", "dering",                1, 5, 6, DERING},
126     {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
127     {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
128     {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
129     {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
130     {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
131     {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
132     {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
133     {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
134     {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
135     {"be", "bitexact",              1, 0, 0, BITEXACT},
136     {"vi", "visualize",             1, 0, 0, VISUALIZE},
137     {NULL, NULL,0,0,0,0} //End Marker
138 };
139 
/*
 * Preset aliases, stored as consecutive (alias, expansion) string pairs.
 * When a filter name equals an alias, pp_get_mode_by_name_and_quality()
 * splices the expansion into the filter string it is parsing.
 * A single NULL in an alias position terminates the list.
 */
static const char * const replaceTable[]=
{
    /* alias */     /* expansion */
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",

    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",

    "ac",           "ha:a:128:7,va:a,dr:a",

    NULL            /* end marker */
};
149 
150 /* The horizontal functions exist only in C because the MMX
151  * code is faster with vertical filters and transposing. */
152 
153 /**
154  * Check if the given 8x8 Block is mostly "flat"
155  */
isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)156 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
157 {
158     int numEq= 0;
159     int y;
160     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
161     const int dcThreshold= dcOffset*2 + 1;
162 
163     for(y=0; y<BLOCK_SIZE; y++){
164         numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
165         numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
166         numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
167         numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
168         numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
169         numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
170         numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
171         src+= stride;
172     }
173     return numEq > c->ppMode.flatnessThreshold;
174 }
175 
176 /**
177  * Check if the middle 8x8 Block in the given 8x16 block is flat
178  */
isVertDC_C(const uint8_t src[], int stride, const PPContext *c)179 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
180 {
181     int numEq= 0;
182     int y;
183     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
184     const int dcThreshold= dcOffset*2 + 1;
185 
186     src+= stride*4; // src points to begin of the 8x8 Block
187     for(y=0; y<BLOCK_SIZE-1; y++){
188         numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
189         numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
190         numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
191         numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
192         numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
193         numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
194         numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
195         numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
196         src+= stride;
197     }
198     return numEq > c->ppMode.flatnessThreshold;
199 }
200 
/**
 * Cheap horizontal range check on an 8x8 block.
 *
 * Only one pair of pixels is sampled per row; the sampled columns follow a
 * pattern that repeats every four rows.  The block passes when every sampled
 * pair differs by no more than 2*QP.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer used to scale the tolerance
 * @return 1 if all sampled pairs are within tolerance, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    /* {minuend column, subtrahend column} for rows 0..3, reused for 4..7 */
    static const uint8_t samplePair[4][2] = {
        { 0, 5 }, { 2, 7 }, { 4, 1 }, { 6, 3 },
    };
    int row;

    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;
        const int p = line[samplePair[row & 3][0]];
        const int q = line[samplePair[row & 3][1]];

        /* unsigned compare folds the lower and upper bound into one test */
        if ((unsigned)(p - q + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
216 
isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)217 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
218 {
219     int x;
220     src+= stride*4;
221     for(x=0; x<BLOCK_SIZE; x+=4){
222         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
223         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
224         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
225         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
226     }
227     return 1;
228 }
229 
horizClassify_C(const uint8_t src[], int stride, const PPContext *c)230 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
231 {
232     if( isHorizDC_C(src, stride, c) ){
233         return isHorizMinMaxOk_C(src, stride, c->QP);
234     }else{
235         return 2;
236     }
237 }
238 
vertClassify_C(const uint8_t src[], int stride, const PPContext *c)239 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
240 {
241     if( isVertDC_C(src, stride, c) ){
242         return isVertMinMaxOk_C(src, stride, c->QP);
243     }else{
244         return 2;
245     }
246 }
247 
doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)248 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
249 {
250     int y;
251     for(y=0; y<BLOCK_SIZE; y++){
252         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
253 
254         if(FFABS(middleEnergy) < 8*c->QP){
255             const int q=(dst[3] - dst[4])/2;
256             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
257             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
258 
259             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
260             d= FFMAX(d, 0);
261 
262             d= (5*d + 32) >> 6;
263             d*= FFSIGN(-middleEnergy);
264 
265             if(q>0)
266             {
267                 d = FFMAX(d, 0);
268                 d = FFMIN(d, q);
269             }
270             else
271             {
272                 d = FFMIN(d, 0);
273                 d = FFMAX(d, q);
274             }
275 
276             dst[3]-= d;
277             dst[4]+= d;
278         }
279         dst+= stride;
280     }
281 }
282 
283 /**
284  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
285  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
286  */
doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)287 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
288 {
289     int y;
290     for(y=0; y<BLOCK_SIZE; y++){
291         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
292         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
293 
294         int sums[10];
295         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
296         sums[1] = sums[0] - first  + dst[3];
297         sums[2] = sums[1] - first  + dst[4];
298         sums[3] = sums[2] - first  + dst[5];
299         sums[4] = sums[3] - first  + dst[6];
300         sums[5] = sums[4] - dst[0] + dst[7];
301         sums[6] = sums[5] - dst[1] + last;
302         sums[7] = sums[6] - dst[2] + last;
303         sums[8] = sums[7] - dst[3] + last;
304         sums[9] = sums[8] - dst[4] + last;
305 
306         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
307         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
308         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
309         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
310         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
311         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
312         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
313         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
314 
315         dst+= stride;
316     }
317 }
318 
319 /**
320  * Experimental Filter 1 (Horizontal)
321  * will not damage linear gradients
322  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
323  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
324  * MMX2 version does correct clipping C version does not
325  * not identical with the vertical one
326  */
horizX1Filter(uint8_t *src, int stride, int QP)327 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
328 {
329     int y;
330     static uint64_t lut[256];
331     if(!lut[255])
332     {
333         int i;
334         for(i=0; i<256; i++)
335         {
336             int v= i < 128 ? 2*i : 2*(i-256);
337 /*
338 //Simulate 112242211 9-Tap filter
339             uint64_t a= (v/16)  & 0xFF;
340             uint64_t b= (v/8)   & 0xFF;
341             uint64_t c= (v/4)   & 0xFF;
342             uint64_t d= (3*v/8) & 0xFF;
343 */
344 //Simulate piecewise linear interpolation
345             uint64_t a= (v/16)   & 0xFF;
346             uint64_t b= (v*3/16) & 0xFF;
347             uint64_t c= (v*5/16) & 0xFF;
348             uint64_t d= (7*v/16) & 0xFF;
349             uint64_t A= (0x100 - a)&0xFF;
350             uint64_t B= (0x100 - b)&0xFF;
351             uint64_t C= (0x100 - c)&0xFF;
352             uint64_t D= (0x100 - c)&0xFF;
353 
354             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
355                        (D<<24) | (C<<16) | (B<<8)  | (A);
356             //lut[i] = (v<<32) | (v<<24);
357         }
358     }
359 
360     for(y=0; y<BLOCK_SIZE; y++){
361         int a= src[1] - src[2];
362         int b= src[3] - src[4];
363         int c= src[5] - src[6];
364 
365         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
366 
367         if(d < QP){
368             int v = d * FFSIGN(-b);
369 
370             src[1] +=v/8;
371             src[2] +=v/4;
372             src[3] +=3*v/8;
373             src[4] -=3*v/8;
374             src[5] -=v/4;
375             src[6] -=v/8;
376         }
377         src+=stride;
378     }
379 }
380 
381 /**
382  * accurate deblock filter
383  */
do_a_deblock_C(uint8_t *src, int step, int stride, const PPContext *c, int mode)384 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
385                                             int stride, const PPContext *c, int mode)
386 {
387     int y;
388     const int QP= c->QP;
389     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
390     const int dcThreshold= dcOffset*2 + 1;
391 
392     src+= step*4; // src points to begin of the 8x8 Block
393     for(y=0; y<8; y++){
394         int numEq= 0;
395 
396         numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
397         numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
398         numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
399         numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
400         numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
401         numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
402         numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
403         numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
404         numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
405         if(numEq > c->ppMode.flatnessThreshold){
406             int min, max, x;
407 
408             if(src[0] > src[step]){
409                 max= src[0];
410                 min= src[step];
411             }else{
412                 max= src[step];
413                 min= src[0];
414             }
415             for(x=2; x<8; x+=2){
416                 if(src[x*step] > src[(x+1)*step]){
417                         if(src[x    *step] > max) max= src[ x   *step];
418                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
419                 }else{
420                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
421                         if(src[ x   *step] < min) min= src[ x   *step];
422                 }
423             }
424             if(max-min < 2*QP){
425                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
426                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
427 
428                 int sums[10];
429                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
430                 sums[1] = sums[0] - first       + src[3*step];
431                 sums[2] = sums[1] - first       + src[4*step];
432                 sums[3] = sums[2] - first       + src[5*step];
433                 sums[4] = sums[3] - first       + src[6*step];
434                 sums[5] = sums[4] - src[0*step] + src[7*step];
435                 sums[6] = sums[5] - src[1*step] + last;
436                 sums[7] = sums[6] - src[2*step] + last;
437                 sums[8] = sums[7] - src[3*step] + last;
438                 sums[9] = sums[8] - src[4*step] + last;
439 
440                 if (mode & VISUALIZE) {
441                     src[0*step] =
442                     src[1*step] =
443                     src[2*step] =
444                     src[3*step] =
445                     src[4*step] =
446                     src[5*step] =
447                     src[6*step] =
448                     src[7*step] = 128;
449                 }
450                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
451                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
452                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
453                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
454                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
455                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
456                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
457                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
458             }
459         }else{
460             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
461 
462             if(FFABS(middleEnergy) < 8*QP){
463                 const int q=(src[3*step] - src[4*step])/2;
464                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
465                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
466 
467                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
468                 d= FFMAX(d, 0);
469 
470                 d= (5*d + 32) >> 6;
471                 d*= FFSIGN(-middleEnergy);
472 
473                 if(q>0){
474                     d = FFMAX(d, 0);
475                     d = FFMIN(d, q);
476                 }else{
477                     d = FFMIN(d, 0);
478                     d = FFMAX(d, q);
479                 }
480 
481                 if ((mode & VISUALIZE) && d) {
482                     d= (d < 0) ? 32 : -32;
483                     src[3*step]= av_clip_uint8(src[3*step] - d);
484                     src[4*step]= av_clip_uint8(src[4*step] + d);
485                     d = 0;
486                 }
487 
488                 src[3*step]-= d;
489                 src[4*step]+= d;
490             }
491         }
492 
493         src += stride;
494     }
495 }
496 
// Note: the template below is instantiated for C, AltiVec, MMX, MMX2, 3DNow
// and SSE2; there is no combined 3DNow+MMX2 version.
// Plain C version: always compiled, because testing needs bit-exact output.
500 #define TEMPLATE_PP_C 1
501 #include "postprocess_template.c"
502 
503 #if HAVE_ALTIVEC
504 #   define TEMPLATE_PP_ALTIVEC 1
505 #   include "postprocess_altivec_template.c"
506 #   include "postprocess_template.c"
507 #endif
508 
509 #if ARCH_X86 && HAVE_INLINE_ASM
510 #    if CONFIG_RUNTIME_CPUDETECT
511 #        define TEMPLATE_PP_MMX 1
512 #        include "postprocess_template.c"
513 #        define TEMPLATE_PP_MMXEXT 1
514 #        include "postprocess_template.c"
515 #        define TEMPLATE_PP_3DNOW 1
516 #        include "postprocess_template.c"
517 #        define TEMPLATE_PP_SSE2 1
518 #        include "postprocess_template.c"
519 #    else
520 #        if HAVE_SSE2_INLINE
521 #            define TEMPLATE_PP_SSE2 1
522 #            include "postprocess_template.c"
523 #        elif HAVE_MMXEXT_INLINE
524 #            define TEMPLATE_PP_MMXEXT 1
525 #            include "postprocess_template.c"
526 #        elif HAVE_AMD3DNOW_INLINE
527 #            define TEMPLATE_PP_3DNOW 1
528 #            include "postprocess_template.c"
529 #        elif HAVE_MMX_INLINE
530 #            define TEMPLATE_PP_MMX 1
531 #            include "postprocess_template.c"
532 #        endif
533 #    endif
534 #endif
535 
536 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
537                       const int8_t QPs[], int QPStride, int isColor, PPContext *c2);
538 
postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)539 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
540         const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
541 {
542     pp_fn pp = postProcess_C;
543     PPContext *c= (PPContext *)vc;
544     PPMode *ppMode= (PPMode *)vm;
545     c->ppMode= *ppMode; //FIXME
546 
547     if (!(ppMode->lumMode & BITEXACT)) {
548 #if CONFIG_RUNTIME_CPUDETECT
549 #if ARCH_X86 && HAVE_INLINE_ASM
550         // ordered per speed fastest first
551         if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
552         else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
553         else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
554         else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
555 #elif HAVE_ALTIVEC
556         if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
557 #endif
558 #else /* CONFIG_RUNTIME_CPUDETECT */
559 #if     HAVE_SSE2_INLINE
560         pp = postProcess_SSE2;
561 #elif   HAVE_MMXEXT_INLINE
562         pp = postProcess_MMX2;
563 #elif HAVE_AMD3DNOW_INLINE
564         pp = postProcess_3DNow;
565 #elif HAVE_MMX_INLINE
566         pp = postProcess_MMX;
567 #elif HAVE_ALTIVEC
568         pp = postProcess_altivec;
569 #endif
570 #endif /* !CONFIG_RUNTIME_CPUDETECT */
571     }
572 
573     pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
574 }
575 
/* -pp command line help.
 * The filter names listed here must be spelled exactly like the entries of
 * filters[], because the parser compares names with strcmp(): the accurate
 * deblockers are "ahdeblock"/"avdeblock" and the quantizer override is
 * "forcequant".
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     ahdeblock       (2 threshold)           horizontal deblocking filter\n"
"va     avdeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f        fullyrange     stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
"fq     forcequant      <quantizer>             force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
621 
pp_get_mode_by_name_and_quality(const char *name, int quality)622 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
623 {
624     char temp[GET_MODE_BUFFER_SIZE];
625     char *p= temp;
626     static const char filterDelimiters[] = ",/";
627     static const char optionDelimiters[] = ":|";
628     struct PPMode *ppMode;
629     char *filterToken;
630 
631     if (!name)  {
632         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
633         return NULL;
634     }
635 
636     if (!strcmp(name, "help")) {
637         const char *p;
638         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
639             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
640             av_log(NULL, AV_LOG_INFO, "%s", temp);
641         }
642         return NULL;
643     }
644 
645     ppMode= av_malloc(sizeof(PPMode));
646     if (!ppMode)
647         return NULL;
648 
649     ppMode->lumMode= 0;
650     ppMode->chromMode= 0;
651     ppMode->maxTmpNoise[0]= 700;
652     ppMode->maxTmpNoise[1]= 1500;
653     ppMode->maxTmpNoise[2]= 3000;
654     ppMode->maxAllowedY= 234;
655     ppMode->minAllowedY= 16;
656     ppMode->baseDcDiff= 256/8;
657     ppMode->flatnessThreshold= 56-16-1;
658     ppMode->maxClippedThreshold= (AVRational){1,100};
659     ppMode->error=0;
660 
661     memset(temp, 0, GET_MODE_BUFFER_SIZE);
662     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
663 
664     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
665 
666     for(;;){
667         const char *filterName;
668         int q= 1000000; //PP_QUALITY_MAX;
669         int chrom=-1;
670         int luma=-1;
671         const char *option;
672         const char *options[OPTIONS_ARRAY_SIZE];
673         int i;
674         int filterNameOk=0;
675         int numOfUnknownOptions=0;
676         int enable=1; //does the user want us to enabled or disabled the filter
677         char *tokstate;
678 
679         filterToken= av_strtok(p, filterDelimiters, &tokstate);
680         if(!filterToken) break;
681         p+= strlen(filterToken) + 1; // p points to next filterToken
682         filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
683         if (!filterName) {
684             ppMode->error++;
685             break;
686         }
687         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
688 
689         if(*filterName == '-'){
690             enable=0;
691             filterName++;
692         }
693 
694         for(;;){ //for all options
695             option= av_strtok(NULL, optionDelimiters, &tokstate);
696             if(!option) break;
697 
698             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
699             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
700             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
701             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
702             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
703             else{
704                 options[numOfUnknownOptions] = option;
705                 numOfUnknownOptions++;
706             }
707             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
708         }
709         options[numOfUnknownOptions] = NULL;
710 
711         /* replace stuff from the replace Table */
712         for(i=0; replaceTable[2*i]; i++){
713             if(!strcmp(replaceTable[2*i], filterName)){
714                 size_t newlen = strlen(replaceTable[2*i + 1]);
715                 int plen;
716                 int spaceLeft;
717 
718                 p--, *p=',';
719 
720                 plen= strlen(p);
721                 spaceLeft= p - temp + plen;
722                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
723                     ppMode->error++;
724                     break;
725                 }
726                 memmove(p + newlen, p, plen+1);
727                 memcpy(p, replaceTable[2*i + 1], newlen);
728                 filterNameOk=1;
729             }
730         }
731 
732         for(i=0; filters[i].shortName; i++){
733             if(   !strcmp(filters[i].longName, filterName)
734                || !strcmp(filters[i].shortName, filterName)){
735                 ppMode->lumMode &= ~filters[i].mask;
736                 ppMode->chromMode &= ~filters[i].mask;
737 
738                 filterNameOk=1;
739                 if(!enable) break; // user wants to disable it
740 
741                 if(q >= filters[i].minLumQuality && luma)
742                     ppMode->lumMode|= filters[i].mask;
743                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
744                     if(q >= filters[i].minChromQuality)
745                             ppMode->chromMode|= filters[i].mask;
746 
747                 if(filters[i].mask == LEVEL_FIX){
748                     int o;
749                     ppMode->minAllowedY= 16;
750                     ppMode->maxAllowedY= 234;
751                     for(o=0; options[o]; o++){
752                         if(  !strcmp(options[o],"fullyrange")
753                            ||!strcmp(options[o],"f")){
754                             ppMode->minAllowedY= 0;
755                             ppMode->maxAllowedY= 255;
756                             numOfUnknownOptions--;
757                         }
758                     }
759                 }
760                 else if(filters[i].mask == TEMP_NOISE_FILTER)
761                 {
762                     int o;
763                     int numOfNoises=0;
764 
765                     for(o=0; options[o]; o++){
766                         char *tail;
767                         ppMode->maxTmpNoise[numOfNoises]=
768                             strtol(options[o], &tail, 0);
769                         if(tail!=options[o]){
770                             numOfNoises++;
771                             numOfUnknownOptions--;
772                             if(numOfNoises >= 3) break;
773                         }
774                     }
775                 }
776                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
777                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
778                     int o;
779 
780                     for(o=0; options[o] && o<2; o++){
781                         char *tail;
782                         int val= strtol(options[o], &tail, 0);
783                         if(tail==options[o]) break;
784 
785                         numOfUnknownOptions--;
786                         if(o==0) ppMode->baseDcDiff= val;
787                         else ppMode->flatnessThreshold= val;
788                     }
789                 }
790                 else if(filters[i].mask == FORCE_QUANT){
791                     int o;
792                     ppMode->forcedQuant= 15;
793 
794                     for(o=0; options[o] && o<1; o++){
795                         char *tail;
796                         int val= strtol(options[o], &tail, 0);
797                         if(tail==options[o]) break;
798 
799                         numOfUnknownOptions--;
800                         ppMode->forcedQuant= val;
801                     }
802                 }
803             }
804         }
805         if(!filterNameOk) ppMode->error++;
806         ppMode->error += numOfUnknownOptions;
807     }
808 
809     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
810     if(ppMode->error){
811         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
812         av_free(ppMode);
813         return NULL;
814     }
815     return ppMode;
816 }
817 
/**
 * Release a postprocessing mode object.
 *
 * @param mode mode to release; the pointer is handed straight to av_free()
 */
void pp_free_mode(pp_mode *mode)
{
    av_free(mode);
}
821 
/**
 * Replace the buffer behind *p with a fresh zero-filled one.
 *
 * The old buffer is released first and its contents are NOT carried over.
 *
 * @param p    address of the buffer pointer to replace
 * @param size number of bytes requested from av_mallocz()
 */
static void reallocAlign(void **p, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
826 
/**
 * (Re)allocate every scratch buffer of a context for the given geometry.
 *
 * All buffers are released and replaced by zero-filled ones, so previous
 * contents are lost. The function cannot report failure: after an
 * out-of-memory condition the affected pointers in the context are NULL.
 *
 * @param c        context whose buffers are replaced
 * @param width    picture width in pixels
 * @param height   picture height in pixels
 * @param stride   line size used to dimension the temporary planes; stored in c->stride
 * @param qpStride line size of the QP tables; stored in c->qpStride
 */
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
    int mbWidth = (width+15)>>4;
    int mbHeight= (height+15)>>4;
    int i;

    c->stride= stride;
    c->qpStride= qpStride;

    reallocAlign((void **)&c->tempDst, stride*24+32);
    reallocAlign((void **)&c->tempSrc, stride*24);
    reallocAlign((void **)&c->tempBlocks, 2*16*8);
    reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
    /* Seed every histogram bin with the same value. Skip this when the
     * allocation failed, and do the arithmetic in 64 bits so width*height
     * cannot overflow int for very large pictures. */
    if(c->yHistogram){
        const int64_t initialCount= (int64_t)width*height/64*15/256;
        for(i=0; i<256; i++)
            c->yHistogram[i]= initialCount;
    }

    for(i=0; i<3; i++){
        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
        reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
        reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
    }

    reallocAlign((void **)&c->deintTemp, 2*width+32);
    reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(int8_t));
    reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(int8_t));
    /* One row of forced QP values, one entry per macroblock column. */
    reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(int8_t));
}
853 
/**
 * Name callback: returns the fixed string "postproc" for any context.
 *
 * @param ptr context pointer; not inspected
 * @return pointer to a static, NUL-terminated name
 */
static const char * context_to_name(void * ptr)
{
    static const char name[] = "postproc";

    (void)ptr;
    return name;
}
857 
/* Class descriptor whose address pp_get_context() stores in c->av_class.
 * Initialised with the name "Postproc", the context_to_name callback and a
 * NULL third member (presumably the options table -- confirm against the
 * AVClass declaration). */
static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
859 
pp_get_context(int width, int height, int cpuCaps)860 av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
861     PPContext *c= av_mallocz(sizeof(PPContext));
862     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
863     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
864 
865     if (!c)
866         return NULL;
867 
868     c->av_class = &av_codec_context_class;
869     if(cpuCaps&PP_FORMAT){
870         c->hChromaSubSample= cpuCaps&0x3;
871         c->vChromaSubSample= (cpuCaps>>4)&0x3;
872     }else{
873         c->hChromaSubSample= 1;
874         c->vChromaSubSample= 1;
875     }
876     if (cpuCaps & PP_CPU_CAPS_AUTO) {
877         c->cpuCaps = av_get_cpu_flags();
878     } else {
879         c->cpuCaps = 0;
880         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
881         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
882         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
883         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
884     }
885 
886     reallocBuffers(c, width, height, stride, qpStride);
887 
888     c->frameNum=-1;
889 
890     return c;
891 }
892 
pp_free_context(void *vc)893 av_cold void pp_free_context(void *vc){
894     PPContext *c = (PPContext*)vc;
895     int i;
896 
897     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
898         av_free(c->tempBlurred[i]);
899     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
900         av_free(c->tempBlurredPast[i]);
901 
902     av_free(c->tempBlocks);
903     av_free(c->yHistogram);
904     av_free(c->tempDst);
905     av_free(c->tempSrc);
906     av_free(c->deintTemp);
907     av_free(c->stdQPTable);
908     av_free(c->nonBQPTable);
909     av_free(c->forcedQPTable);
910 
911     memset(c, 0, sizeof(PPContext));
912 
913     av_free(c);
914 }
915 
/**
 * Postprocess one picture.
 *
 * The luma plane is always filtered. The chroma planes are handled only when
 * all of src[1], src[2], dst[1] and dst[2] are non-NULL: they are filtered if
 * the mode has chroma filters enabled, otherwise copied from src to dst.
 *
 * @param src       source planes
 * @param srcStride line sizes of the source planes
 * @param dst       destination planes
 * @param dstStride line sizes of the destination planes
 * @param width     luma width in pixels
 * @param height    luma height in pixels
 * @param QP_store  per-macroblock quantiser table, or NULL to use a constant table
 * @param QPStride  line size of QP_store; may be negative
 * @param vm        mode (a PPMode passed as pp_mode pointer)
 * @param vc        context (a PPContext passed as void pointer)
 * @param pict_type picture type bits; PP_PICT_TYPE_QP2 requests halving of the QP values
 */
void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
                     uint8_t * dst[3], const int dstStride[3],
                     int width, int height,
                     const int8_t *QP_store,  int QPStride,
                     pp_mode *vm,  void *vc, int pict_type)
{
    PPContext *ctx  = vc;
    PPMode *ppMode  = vm;
    const int mbCols       = (width  + 15) >> 4;
    const int mbRows       = (height + 15) >> 4;
    const int strideNeeded = FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
    int qpStrideAbs        = FFABS(QPStride);
    int k;

    /* Grow the scratch buffers when the caller's strides exceed the ones the
     * context was sized for (the strides kept in the context are positive). */
    if (ctx->stride < strideNeeded || ctx->qpStride < qpStrideAbs) {
        reallocBuffers(ctx, width, height,
                       FFMAX(strideNeeded, ctx->stride),
                       FFMAX(ctx->qpStride, qpStrideAbs));
    }

    /* No QP table supplied, or a forced quantiser requested: substitute a
     * single constant row that every macroblock row shares (stride 0). */
    if (!QP_store || (ppMode->lumMode & FORCE_QUANT)) {
        const int fill = (ppMode->lumMode & FORCE_QUANT) ? ppMode->forcedQuant : 1;

        QP_store    = ctx->forcedQPTable;
        QPStride    = 0;
        qpStrideAbs = 0;
        for (k = 0; k < mbCols; k++)
            ctx->forcedQPTable[k] = fill;
    }

    /* Halve all QP values into stdQPTable, four bytes at a time, then
     * finish the remaining 0-3 entries one by one. */
    if (pict_type & PP_PICT_TYPE_QP2) {
        const int total = FFMAX(mbRows * qpStrideAbs, mbCols);
        const int words = total >> 2;

        for (k = 0; k < words; k++) {
            const int off = k << 2;
            AV_WN32(ctx->stdQPTable + off, (AV_RN32(QP_store + off) >> 1) & 0x7F7F7F7F);
        }
        for (k <<= 2; k < total; k++)
            ctx->stdQPTable[k] = QP_store[k] >> 1;

        QP_store = ctx->stdQPTable;
        QPStride = qpStrideAbs;
    }

    /* Disabled debug aid: dumps the QP table. */
    if (0) {
        int col, row;
        for (row = 0; row < mbRows; row++) {
            for (col = 0; col < mbCols; col++)
                av_log(ctx, AV_LOG_INFO, "%2d ", QP_store[col + row*QPStride]);
            av_log(ctx, AV_LOG_INFO, "\n");
        }
        av_log(ctx, AV_LOG_INFO, "\n");
    }

    /* For every picture type other than 3, save the QP values (masked to
     * 6 bits) in nonBQPTable. */
    if ((pict_type & 7) != 3) {
        if (QPStride >= 0) {
            const int total = FFMAX(mbRows * QPStride, mbCols);
            const int words = total >> 2;

            for (k = 0; k < words; k++) {
                const int off = k << 2;
                AV_WN32(ctx->nonBQPTable + off, AV_RN32(QP_store + off) & 0x3F3F3F3F);
            }
            for (k <<= 2; k < total; k++)
                ctx->nonBQPTable[k] = QP_store[k] & 0x3F;
        } else {
            /* Negative stride: copy row by row into a positively-strided table. */
            int row, col;
            for (row = 0; row < mbRows; row++)
                for (col = 0; col < qpStrideAbs; col++)
                    ctx->nonBQPTable[row*qpStrideAbs + col] = QP_store[row*QPStride + col] & 0x3F;
        }
    }

    av_log(ctx, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
           ppMode->lumMode, ppMode->chromMode);

    /* Luma plane. */
    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                width, height, QP_store, QPStride, 0, ppMode, ctx);

    /* Stop here unless both chroma planes exist on both sides. */
    if (!src[1] || !src[2] || !dst[1] || !dst[2])
        return;

    width  >>= ctx->hChromaSubSample;
    height >>= ctx->vChromaSubSample;

    if (ppMode->chromMode) {
        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                    width, height, QP_store, QPStride, 1, ppMode, ctx);
        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                    width, height, QP_store, QPStride, 2, ppMode, ctx);
    } else if (srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]) {
        /* Identical layouts: copy each plane with linecpy(). */
        linecpy(dst[1], src[1], height, srcStride[1]);
        linecpy(dst[2], src[2], height, srcStride[2]);
    } else {
        /* Different strides: copy the visible part of each line. */
        for (k = 0; k < height; k++) {
            memcpy(dst[1] + k*dstStride[1], src[1] + k*srcStride[1], width);
            memcpy(dst[2] + k*dstStride[2], src[2] + k*srcStride[2], width);
        }
    }
}
1018