/*
 * This file is part of the Independent JPEG Group's software.
 *
 * The authors make NO WARRANTY or representation, either express or implied,
 * with respect to this software, its quality, accuracy, merchantability, or
 * fitness for a particular purpose. This software is provided "AS IS", and
 * you, its user, assume the entire risk as to its quality and accuracy.
 *
 * This software is copyright (C) 1991, 1992, Thomas G. Lane.
 * All Rights Reserved except as specified below.
 *
 * Permission is hereby granted to use, copy, modify, and distribute this
 * software (or portions thereof) for any purpose, without fee, subject to
 * these conditions:
 * (1) If any part of the source code for this software is distributed, then
 * this README file must be included, with this copyright and no-warranty
 * notice unaltered; and any additions, deletions, or changes to the original
 * files must be clearly indicated in accompanying documentation.
 * (2) If only executable code is distributed, then the accompanying
 * documentation must state that "this software is based in part on the work
 * of the Independent JPEG Group".
 * (3) Permission for use of this software is granted only if the user accepts
 * full responsibility for any undesirable consequences; the authors accept
 * NO LIABILITY for damages of any kind.
 *
 * These conditions apply to any software derived from or based on the IJG
 * code, not just to the unmodified library. If you use our work, you ought
 * to acknowledge us.
 *
 * Permission is NOT granted for the use of any IJG author's name or company
 * name in advertising or publicity relating to this software or products
 * derived from it. This software may be referred to only as "the Independent
 * JPEG Group's software".
 *
 * We specifically permit and encourage the use of this software as the basis
 * of commercial products, provided that all warranty or liability claims are
 * assumed by the product vendor.
 *
 * This file contains the basic inverse-DCT transformation subroutine.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 *
 * I've made lots of modifications to attempt to take advantage of the
 * sparse nature of the DCT matrices we're getting. Although the logic
 * is cumbersome, it's straightforward and the resulting code is much
 * faster.
 *
 * A better way to do this would be to pass in the DCT block as a sparse
 * matrix, perhaps with the difference cases encoded.
 */

/**
 * @file
 * Independent JPEG Group's LLM idct.
 */

#include <stddef.h>
#include <stdint.h>

#include "libavutil/intreadwrite.h"

#include "dct.h"
#include "idctdsp.h"

/* Selects the 8-bit-sample settings (PASS1_BITS, optional MULTIPLY forms). */
#define EIGHT_BIT_SAMPLES

#define DCTSIZE 8
#define DCTSIZE2 64

#define GLOBAL

#define RIGHT_SHIFT(x, n) ((x) >> (n))

/* One block of coefficients: DCTSIZE2 signed 16-bit values. */
typedef int16_t DCTBLOCK[DCTSIZE2];

/* Number of fractional bits kept in the fixed-point constants. */
#define CONST_BITS 13

/*
 * This routine is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif


/*
 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
 * on each column. Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
 * larger than the true IDCT outputs. The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm. The advantage of
 * this arrangement is that we save two multiplications per 1-D IDCT,
 * because the y0 and y4 inputs need not be divided by sqrt(N).
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic. We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants). After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output. This division can be done
 * cheaply as a right shift of CONST_BITS bits. We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision. These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling. (To scale up 12-bit sample data further, an
 * intermediate int32 array would be needed.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
 * shows that the values given below are the most effective.
 */

#ifdef EIGHT_BIT_SAMPLES
#define PASS1_BITS 2
#else
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
#endif

#define ONE ((int32_t) 1)

#define CONST_SCALE (ONE << CONST_BITS)

/* Convert a positive real constant to an integer scaled by CONST_SCALE.
 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
 * you will pay a significant penalty in run time. In that case, figure
 * the correct integer constant values and insert them by hand.
 */

/* Actually FIX is no longer used, we precomputed them all */
#define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5))

/* Descale and correctly round an int32_t value that's scaled by N bits.
 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
 * the fudge factor is correct for either sign of X.
 */

#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)

/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
 * this provides a useful speedup on many machines.
 * There is no way to specify a 16x16->32 multiply in portable C, but
 * some C compilers will do the right thing if you provide the correct
 * combination of casts.
 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
166 */ 167 168#ifdef EIGHT_BIT_SAMPLES 169#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ 170#define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const))) 171#endif 172#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ 173#define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const))) 174#endif 175#endif 176 177#ifndef MULTIPLY /* default definition */ 178#define MULTIPLY(var,const) ((var) * (const)) 179#endif 180 181 182/* 183 Unlike our decoder where we approximate the FIXes, we need to use exact 184ones here or successive P-frames will drift too much with Reference frame coding 185*/ 186#define FIX_0_211164243 1730 187#define FIX_0_275899380 2260 188#define FIX_0_298631336 2446 189#define FIX_0_390180644 3196 190#define FIX_0_509795579 4176 191#define FIX_0_541196100 4433 192#define FIX_0_601344887 4926 193#define FIX_0_765366865 6270 194#define FIX_0_785694958 6436 195#define FIX_0_899976223 7373 196#define FIX_1_061594337 8697 197#define FIX_1_111140466 9102 198#define FIX_1_175875602 9633 199#define FIX_1_306562965 10703 200#define FIX_1_387039845 11363 201#define FIX_1_451774981 11893 202#define FIX_1_501321110 12299 203#define FIX_1_662939225 13623 204#define FIX_1_847759065 15137 205#define FIX_1_961570560 16069 206#define FIX_2_053119869 16819 207#define FIX_2_172734803 17799 208#define FIX_2_562915447 20995 209#define FIX_3_072711026 25172 210 211/* 212 * Perform the inverse DCT on one block of coefficients. 213 */ 214 215void ff_j_rev_dct(DCTBLOCK data) 216{ 217 int32_t tmp0, tmp1, tmp2, tmp3; 218 int32_t tmp10, tmp11, tmp12, tmp13; 219 int32_t z1, z2, z3, z4, z5; 220 int32_t d0, d1, d2, d3, d4, d5, d6, d7; 221 register int16_t *dataptr; 222 int rowctr; 223 224 /* Pass 1: process rows. */ 225 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ 226 /* furthermore, we scale the results by 2**PASS1_BITS. 
*/ 227 228 dataptr = data; 229 230 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { 231 /* Due to quantization, we will usually find that many of the input 232 * coefficients are zero, especially the AC terms. We can exploit this 233 * by short-circuiting the IDCT calculation for any row in which all 234 * the AC terms are zero. In that case each output is equal to the 235 * DC coefficient (with scale factor as needed). 236 * With typical images and quantization tables, half or more of the 237 * row DCT calculations can be simplified this way. 238 */ 239 240 register uint8_t *idataptr = (uint8_t*)dataptr; 241 242 /* WARNING: we do the same permutation as MMX idct to simplify the 243 video core */ 244 d0 = dataptr[0]; 245 d2 = dataptr[1]; 246 d4 = dataptr[2]; 247 d6 = dataptr[3]; 248 d1 = dataptr[4]; 249 d3 = dataptr[5]; 250 d5 = dataptr[6]; 251 d7 = dataptr[7]; 252 253 if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) { 254 /* AC terms all zero */ 255 if (d0) { 256 /* Compute a 32 bit value to assign. */ 257 int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS)); 258 register int v = (dcval & 0xffff) | ((dcval * (1 << 16)) & 0xffff0000); 259 260 AV_WN32A(&idataptr[ 0], v); 261 AV_WN32A(&idataptr[ 4], v); 262 AV_WN32A(&idataptr[ 8], v); 263 AV_WN32A(&idataptr[12], v); 264 } 265 266 dataptr += DCTSIZE; /* advance pointer to next row */ 267 continue; 268 } 269 270 /* Even part: reverse the even part of the forward DCT. */ 271 /* The rotator is sqrt(2)*c(-6). 
*/ 272{ 273 if (d6) { 274 if (d2) { 275 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ 276 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 277 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); 278 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 279 280 tmp0 = (d0 + d4) * CONST_SCALE; 281 tmp1 = (d0 - d4) * CONST_SCALE; 282 283 tmp10 = tmp0 + tmp3; 284 tmp13 = tmp0 - tmp3; 285 tmp11 = tmp1 + tmp2; 286 tmp12 = tmp1 - tmp2; 287 } else { 288 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ 289 tmp2 = MULTIPLY(-d6, FIX_1_306562965); 290 tmp3 = MULTIPLY(d6, FIX_0_541196100); 291 292 tmp0 = (d0 + d4) * CONST_SCALE; 293 tmp1 = (d0 - d4) * CONST_SCALE; 294 295 tmp10 = tmp0 + tmp3; 296 tmp13 = tmp0 - tmp3; 297 tmp11 = tmp1 + tmp2; 298 tmp12 = tmp1 - tmp2; 299 } 300 } else { 301 if (d2) { 302 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ 303 tmp2 = MULTIPLY(d2, FIX_0_541196100); 304 tmp3 = MULTIPLY(d2, FIX_1_306562965); 305 306 tmp0 = (d0 + d4) * CONST_SCALE; 307 tmp1 = (d0 - d4) * CONST_SCALE; 308 309 tmp10 = tmp0 + tmp3; 310 tmp13 = tmp0 - tmp3; 311 tmp11 = tmp1 + tmp2; 312 tmp12 = tmp1 - tmp2; 313 } else { 314 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ 315 tmp10 = tmp13 = (d0 + d4) * CONST_SCALE; 316 tmp11 = tmp12 = (d0 - d4) * CONST_SCALE; 317 } 318 } 319 320 /* Odd part per figure 8; the matrix is unitary and hence its 321 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
322 */ 323 324 if (d7) { 325 if (d5) { 326 if (d3) { 327 if (d1) { 328 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ 329 z1 = d7 + d1; 330 z2 = d5 + d3; 331 z3 = d7 + d3; 332 z4 = d5 + d1; 333 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); 334 335 tmp0 = MULTIPLY(d7, FIX_0_298631336); 336 tmp1 = MULTIPLY(d5, FIX_2_053119869); 337 tmp2 = MULTIPLY(d3, FIX_3_072711026); 338 tmp3 = MULTIPLY(d1, FIX_1_501321110); 339 z1 = MULTIPLY(-z1, FIX_0_899976223); 340 z2 = MULTIPLY(-z2, FIX_2_562915447); 341 z3 = MULTIPLY(-z3, FIX_1_961570560); 342 z4 = MULTIPLY(-z4, FIX_0_390180644); 343 344 z3 += z5; 345 z4 += z5; 346 347 tmp0 += z1 + z3; 348 tmp1 += z2 + z4; 349 tmp2 += z2 + z3; 350 tmp3 += z1 + z4; 351 } else { 352 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ 353 z2 = d5 + d3; 354 z3 = d7 + d3; 355 z5 = MULTIPLY(z3 + d5, FIX_1_175875602); 356 357 tmp0 = MULTIPLY(d7, FIX_0_298631336); 358 tmp1 = MULTIPLY(d5, FIX_2_053119869); 359 tmp2 = MULTIPLY(d3, FIX_3_072711026); 360 z1 = MULTIPLY(-d7, FIX_0_899976223); 361 z2 = MULTIPLY(-z2, FIX_2_562915447); 362 z3 = MULTIPLY(-z3, FIX_1_961570560); 363 z4 = MULTIPLY(-d5, FIX_0_390180644); 364 365 z3 += z5; 366 z4 += z5; 367 368 tmp0 += z1 + z3; 369 tmp1 += z2 + z4; 370 tmp2 += z2 + z3; 371 tmp3 = z1 + z4; 372 } 373 } else { 374 if (d1) { 375 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ 376 z1 = d7 + d1; 377 z4 = d5 + d1; 378 z5 = MULTIPLY(d7 + z4, FIX_1_175875602); 379 380 tmp0 = MULTIPLY(d7, FIX_0_298631336); 381 tmp1 = MULTIPLY(d5, FIX_2_053119869); 382 tmp3 = MULTIPLY(d1, FIX_1_501321110); 383 z1 = MULTIPLY(-z1, FIX_0_899976223); 384 z2 = MULTIPLY(-d5, FIX_2_562915447); 385 z3 = MULTIPLY(-d7, FIX_1_961570560); 386 z4 = MULTIPLY(-z4, FIX_0_390180644); 387 388 z3 += z5; 389 z4 += z5; 390 391 tmp0 += z1 + z3; 392 tmp1 += z2 + z4; 393 tmp2 = z2 + z3; 394 tmp3 += z1 + z4; 395 } else { 396 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ 397 tmp0 = MULTIPLY(-d7, FIX_0_601344887); 398 z1 = MULTIPLY(-d7, FIX_0_899976223); 399 z3 = MULTIPLY(-d7, FIX_1_961570560); 400 
tmp1 = MULTIPLY(-d5, FIX_0_509795579); 401 z2 = MULTIPLY(-d5, FIX_2_562915447); 402 z4 = MULTIPLY(-d5, FIX_0_390180644); 403 z5 = MULTIPLY(d5 + d7, FIX_1_175875602); 404 405 z3 += z5; 406 z4 += z5; 407 408 tmp0 += z3; 409 tmp1 += z4; 410 tmp2 = z2 + z3; 411 tmp3 = z1 + z4; 412 } 413 } 414 } else { 415 if (d3) { 416 if (d1) { 417 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ 418 z1 = d7 + d1; 419 z3 = d7 + d3; 420 z5 = MULTIPLY(z3 + d1, FIX_1_175875602); 421 422 tmp0 = MULTIPLY(d7, FIX_0_298631336); 423 tmp2 = MULTIPLY(d3, FIX_3_072711026); 424 tmp3 = MULTIPLY(d1, FIX_1_501321110); 425 z1 = MULTIPLY(-z1, FIX_0_899976223); 426 z2 = MULTIPLY(-d3, FIX_2_562915447); 427 z3 = MULTIPLY(-z3, FIX_1_961570560); 428 z4 = MULTIPLY(-d1, FIX_0_390180644); 429 430 z3 += z5; 431 z4 += z5; 432 433 tmp0 += z1 + z3; 434 tmp1 = z2 + z4; 435 tmp2 += z2 + z3; 436 tmp3 += z1 + z4; 437 } else { 438 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ 439 z3 = d7 + d3; 440 441 tmp0 = MULTIPLY(-d7, FIX_0_601344887); 442 z1 = MULTIPLY(-d7, FIX_0_899976223); 443 tmp2 = MULTIPLY(d3, FIX_0_509795579); 444 z2 = MULTIPLY(-d3, FIX_2_562915447); 445 z5 = MULTIPLY(z3, FIX_1_175875602); 446 z3 = MULTIPLY(-z3, FIX_0_785694958); 447 448 tmp0 += z3; 449 tmp1 = z2 + z5; 450 tmp2 += z3; 451 tmp3 = z1 + z5; 452 } 453 } else { 454 if (d1) { 455 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ 456 z1 = d7 + d1; 457 z5 = MULTIPLY(z1, FIX_1_175875602); 458 459 z1 = MULTIPLY(z1, FIX_0_275899380); 460 z3 = MULTIPLY(-d7, FIX_1_961570560); 461 tmp0 = MULTIPLY(-d7, FIX_1_662939225); 462 z4 = MULTIPLY(-d1, FIX_0_390180644); 463 tmp3 = MULTIPLY(d1, FIX_1_111140466); 464 465 tmp0 += z1; 466 tmp1 = z4 + z5; 467 tmp2 = z3 + z5; 468 tmp3 += z1; 469 } else { 470 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ 471 tmp0 = MULTIPLY(-d7, FIX_1_387039845); 472 tmp1 = MULTIPLY(d7, FIX_1_175875602); 473 tmp2 = MULTIPLY(-d7, FIX_0_785694958); 474 tmp3 = MULTIPLY(d7, FIX_0_275899380); 475 } 476 } 477 } 478 } else { 479 if (d5) { 480 if (d3) { 481 if (d1) { 482 
/* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ 483 z2 = d5 + d3; 484 z4 = d5 + d1; 485 z5 = MULTIPLY(d3 + z4, FIX_1_175875602); 486 487 tmp1 = MULTIPLY(d5, FIX_2_053119869); 488 tmp2 = MULTIPLY(d3, FIX_3_072711026); 489 tmp3 = MULTIPLY(d1, FIX_1_501321110); 490 z1 = MULTIPLY(-d1, FIX_0_899976223); 491 z2 = MULTIPLY(-z2, FIX_2_562915447); 492 z3 = MULTIPLY(-d3, FIX_1_961570560); 493 z4 = MULTIPLY(-z4, FIX_0_390180644); 494 495 z3 += z5; 496 z4 += z5; 497 498 tmp0 = z1 + z3; 499 tmp1 += z2 + z4; 500 tmp2 += z2 + z3; 501 tmp3 += z1 + z4; 502 } else { 503 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ 504 z2 = d5 + d3; 505 506 z5 = MULTIPLY(z2, FIX_1_175875602); 507 tmp1 = MULTIPLY(d5, FIX_1_662939225); 508 z4 = MULTIPLY(-d5, FIX_0_390180644); 509 z2 = MULTIPLY(-z2, FIX_1_387039845); 510 tmp2 = MULTIPLY(d3, FIX_1_111140466); 511 z3 = MULTIPLY(-d3, FIX_1_961570560); 512 513 tmp0 = z3 + z5; 514 tmp1 += z2; 515 tmp2 += z2; 516 tmp3 = z4 + z5; 517 } 518 } else { 519 if (d1) { 520 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ 521 z4 = d5 + d1; 522 523 z5 = MULTIPLY(z4, FIX_1_175875602); 524 z1 = MULTIPLY(-d1, FIX_0_899976223); 525 tmp3 = MULTIPLY(d1, FIX_0_601344887); 526 tmp1 = MULTIPLY(-d5, FIX_0_509795579); 527 z2 = MULTIPLY(-d5, FIX_2_562915447); 528 z4 = MULTIPLY(z4, FIX_0_785694958); 529 530 tmp0 = z1 + z5; 531 tmp1 += z4; 532 tmp2 = z2 + z5; 533 tmp3 += z4; 534 } else { 535 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ 536 tmp0 = MULTIPLY(d5, FIX_1_175875602); 537 tmp1 = MULTIPLY(d5, FIX_0_275899380); 538 tmp2 = MULTIPLY(-d5, FIX_1_387039845); 539 tmp3 = MULTIPLY(d5, FIX_0_785694958); 540 } 541 } 542 } else { 543 if (d3) { 544 if (d1) { 545 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ 546 z5 = d1 + d3; 547 tmp3 = MULTIPLY(d1, FIX_0_211164243); 548 tmp2 = MULTIPLY(-d3, FIX_1_451774981); 549 z1 = MULTIPLY(d1, FIX_1_061594337); 550 z2 = MULTIPLY(-d3, FIX_2_172734803); 551 z4 = MULTIPLY(z5, FIX_0_785694958); 552 z5 = MULTIPLY(z5, FIX_1_175875602); 553 554 tmp0 = z1 - z4; 555 tmp1 = z2 + z4; 556 
tmp2 += z5; 557 tmp3 += z5; 558 } else { 559 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ 560 tmp0 = MULTIPLY(-d3, FIX_0_785694958); 561 tmp1 = MULTIPLY(-d3, FIX_1_387039845); 562 tmp2 = MULTIPLY(-d3, FIX_0_275899380); 563 tmp3 = MULTIPLY(d3, FIX_1_175875602); 564 } 565 } else { 566 if (d1) { 567 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ 568 tmp0 = MULTIPLY(d1, FIX_0_275899380); 569 tmp1 = MULTIPLY(d1, FIX_0_785694958); 570 tmp2 = MULTIPLY(d1, FIX_1_175875602); 571 tmp3 = MULTIPLY(d1, FIX_1_387039845); 572 } else { 573 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ 574 tmp0 = tmp1 = tmp2 = tmp3 = 0; 575 } 576 } 577 } 578 } 579} 580 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 581 582 dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); 583 dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); 584 dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); 585 dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); 586 dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); 587 dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); 588 dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); 589 dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); 590 591 dataptr += DCTSIZE; /* advance pointer to next row */ 592 } 593 594 /* Pass 2: process columns. */ 595 /* Note that we must descale the results by a factor of 8 == 2**3, */ 596 /* and also undo the PASS1_BITS scaling. */ 597 598 dataptr = data; 599 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { 600 /* Columns of zeroes can be exploited in the same way as we did with rows. 601 * However, the row calculation has created many nonzero AC terms, so the 602 * simplification applies less often (typically 5% to 10% of the time). 603 * On machines with very fast multiplication, it's possible that the 604 * test takes more time than it's worth. In that case this section 605 * may be commented out. 
606 */ 607 608 d0 = dataptr[DCTSIZE*0]; 609 d1 = dataptr[DCTSIZE*1]; 610 d2 = dataptr[DCTSIZE*2]; 611 d3 = dataptr[DCTSIZE*3]; 612 d4 = dataptr[DCTSIZE*4]; 613 d5 = dataptr[DCTSIZE*5]; 614 d6 = dataptr[DCTSIZE*6]; 615 d7 = dataptr[DCTSIZE*7]; 616 617 /* Even part: reverse the even part of the forward DCT. */ 618 /* The rotator is sqrt(2)*c(-6). */ 619 if (d6) { 620 if (d2) { 621 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ 622 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 623 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); 624 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 625 626 tmp0 = (d0 + d4) * CONST_SCALE; 627 tmp1 = (d0 - d4) * CONST_SCALE; 628 629 tmp10 = tmp0 + tmp3; 630 tmp13 = tmp0 - tmp3; 631 tmp11 = tmp1 + tmp2; 632 tmp12 = tmp1 - tmp2; 633 } else { 634 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ 635 tmp2 = MULTIPLY(-d6, FIX_1_306562965); 636 tmp3 = MULTIPLY(d6, FIX_0_541196100); 637 638 tmp0 = (d0 + d4) * CONST_SCALE; 639 tmp1 = (d0 - d4) * CONST_SCALE; 640 641 tmp10 = tmp0 + tmp3; 642 tmp13 = tmp0 - tmp3; 643 tmp11 = tmp1 + tmp2; 644 tmp12 = tmp1 - tmp2; 645 } 646 } else { 647 if (d2) { 648 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ 649 tmp2 = MULTIPLY(d2, FIX_0_541196100); 650 tmp3 = MULTIPLY(d2, FIX_1_306562965); 651 652 tmp0 = (d0 + d4) * CONST_SCALE; 653 tmp1 = (d0 - d4) * CONST_SCALE; 654 655 tmp10 = tmp0 + tmp3; 656 tmp13 = tmp0 - tmp3; 657 tmp11 = tmp1 + tmp2; 658 tmp12 = tmp1 - tmp2; 659 } else { 660 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ 661 tmp10 = tmp13 = (d0 + d4) * CONST_SCALE; 662 tmp11 = tmp12 = (d0 - d4) * CONST_SCALE; 663 } 664 } 665 666 /* Odd part per figure 8; the matrix is unitary and hence its 667 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
668 */ 669 if (d7) { 670 if (d5) { 671 if (d3) { 672 if (d1) { 673 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ 674 z1 = d7 + d1; 675 z2 = d5 + d3; 676 z3 = d7 + d3; 677 z4 = d5 + d1; 678 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); 679 680 tmp0 = MULTIPLY(d7, FIX_0_298631336); 681 tmp1 = MULTIPLY(d5, FIX_2_053119869); 682 tmp2 = MULTIPLY(d3, FIX_3_072711026); 683 tmp3 = MULTIPLY(d1, FIX_1_501321110); 684 z1 = MULTIPLY(-z1, FIX_0_899976223); 685 z2 = MULTIPLY(-z2, FIX_2_562915447); 686 z3 = MULTIPLY(-z3, FIX_1_961570560); 687 z4 = MULTIPLY(-z4, FIX_0_390180644); 688 689 z3 += z5; 690 z4 += z5; 691 692 tmp0 += z1 + z3; 693 tmp1 += z2 + z4; 694 tmp2 += z2 + z3; 695 tmp3 += z1 + z4; 696 } else { 697 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ 698 z2 = d5 + d3; 699 z3 = d7 + d3; 700 z5 = MULTIPLY(z3 + d5, FIX_1_175875602); 701 702 tmp0 = MULTIPLY(d7, FIX_0_298631336); 703 tmp1 = MULTIPLY(d5, FIX_2_053119869); 704 tmp2 = MULTIPLY(d3, FIX_3_072711026); 705 z1 = MULTIPLY(-d7, FIX_0_899976223); 706 z2 = MULTIPLY(-z2, FIX_2_562915447); 707 z3 = MULTIPLY(-z3, FIX_1_961570560); 708 z4 = MULTIPLY(-d5, FIX_0_390180644); 709 710 z3 += z5; 711 z4 += z5; 712 713 tmp0 += z1 + z3; 714 tmp1 += z2 + z4; 715 tmp2 += z2 + z3; 716 tmp3 = z1 + z4; 717 } 718 } else { 719 if (d1) { 720 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ 721 z1 = d7 + d1; 722 z3 = d7; 723 z4 = d5 + d1; 724 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); 725 726 tmp0 = MULTIPLY(d7, FIX_0_298631336); 727 tmp1 = MULTIPLY(d5, FIX_2_053119869); 728 tmp3 = MULTIPLY(d1, FIX_1_501321110); 729 z1 = MULTIPLY(-z1, FIX_0_899976223); 730 z2 = MULTIPLY(-d5, FIX_2_562915447); 731 z3 = MULTIPLY(-d7, FIX_1_961570560); 732 z4 = MULTIPLY(-z4, FIX_0_390180644); 733 734 z3 += z5; 735 z4 += z5; 736 737 tmp0 += z1 + z3; 738 tmp1 += z2 + z4; 739 tmp2 = z2 + z3; 740 tmp3 += z1 + z4; 741 } else { 742 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ 743 tmp0 = MULTIPLY(-d7, FIX_0_601344887); 744 z1 = MULTIPLY(-d7, FIX_0_899976223); 745 z3 = MULTIPLY(-d7, 
FIX_1_961570560); 746 tmp1 = MULTIPLY(-d5, FIX_0_509795579); 747 z2 = MULTIPLY(-d5, FIX_2_562915447); 748 z4 = MULTIPLY(-d5, FIX_0_390180644); 749 z5 = MULTIPLY(d5 + d7, FIX_1_175875602); 750 751 z3 += z5; 752 z4 += z5; 753 754 tmp0 += z3; 755 tmp1 += z4; 756 tmp2 = z2 + z3; 757 tmp3 = z1 + z4; 758 } 759 } 760 } else { 761 if (d3) { 762 if (d1) { 763 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ 764 z1 = d7 + d1; 765 z3 = d7 + d3; 766 z5 = MULTIPLY(z3 + d1, FIX_1_175875602); 767 768 tmp0 = MULTIPLY(d7, FIX_0_298631336); 769 tmp2 = MULTIPLY(d3, FIX_3_072711026); 770 tmp3 = MULTIPLY(d1, FIX_1_501321110); 771 z1 = MULTIPLY(-z1, FIX_0_899976223); 772 z2 = MULTIPLY(-d3, FIX_2_562915447); 773 z3 = MULTIPLY(-z3, FIX_1_961570560); 774 z4 = MULTIPLY(-d1, FIX_0_390180644); 775 776 z3 += z5; 777 z4 += z5; 778 779 tmp0 += z1 + z3; 780 tmp1 = z2 + z4; 781 tmp2 += z2 + z3; 782 tmp3 += z1 + z4; 783 } else { 784 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ 785 z3 = d7 + d3; 786 787 tmp0 = MULTIPLY(-d7, FIX_0_601344887); 788 z1 = MULTIPLY(-d7, FIX_0_899976223); 789 tmp2 = MULTIPLY(d3, FIX_0_509795579); 790 z2 = MULTIPLY(-d3, FIX_2_562915447); 791 z5 = MULTIPLY(z3, FIX_1_175875602); 792 z3 = MULTIPLY(-z3, FIX_0_785694958); 793 794 tmp0 += z3; 795 tmp1 = z2 + z5; 796 tmp2 += z3; 797 tmp3 = z1 + z5; 798 } 799 } else { 800 if (d1) { 801 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ 802 z1 = d7 + d1; 803 z5 = MULTIPLY(z1, FIX_1_175875602); 804 805 z1 = MULTIPLY(z1, FIX_0_275899380); 806 z3 = MULTIPLY(-d7, FIX_1_961570560); 807 tmp0 = MULTIPLY(-d7, FIX_1_662939225); 808 z4 = MULTIPLY(-d1, FIX_0_390180644); 809 tmp3 = MULTIPLY(d1, FIX_1_111140466); 810 811 tmp0 += z1; 812 tmp1 = z4 + z5; 813 tmp2 = z3 + z5; 814 tmp3 += z1; 815 } else { 816 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ 817 tmp0 = MULTIPLY(-d7, FIX_1_387039845); 818 tmp1 = MULTIPLY(d7, FIX_1_175875602); 819 tmp2 = MULTIPLY(-d7, FIX_0_785694958); 820 tmp3 = MULTIPLY(d7, FIX_0_275899380); 821 } 822 } 823 } 824 } else { 825 if (d5) { 826 if (d3) 
{ 827 if (d1) { 828 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ 829 z2 = d5 + d3; 830 z4 = d5 + d1; 831 z5 = MULTIPLY(d3 + z4, FIX_1_175875602); 832 833 tmp1 = MULTIPLY(d5, FIX_2_053119869); 834 tmp2 = MULTIPLY(d3, FIX_3_072711026); 835 tmp3 = MULTIPLY(d1, FIX_1_501321110); 836 z1 = MULTIPLY(-d1, FIX_0_899976223); 837 z2 = MULTIPLY(-z2, FIX_2_562915447); 838 z3 = MULTIPLY(-d3, FIX_1_961570560); 839 z4 = MULTIPLY(-z4, FIX_0_390180644); 840 841 z3 += z5; 842 z4 += z5; 843 844 tmp0 = z1 + z3; 845 tmp1 += z2 + z4; 846 tmp2 += z2 + z3; 847 tmp3 += z1 + z4; 848 } else { 849 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ 850 z2 = d5 + d3; 851 852 z5 = MULTIPLY(z2, FIX_1_175875602); 853 tmp1 = MULTIPLY(d5, FIX_1_662939225); 854 z4 = MULTIPLY(-d5, FIX_0_390180644); 855 z2 = MULTIPLY(-z2, FIX_1_387039845); 856 tmp2 = MULTIPLY(d3, FIX_1_111140466); 857 z3 = MULTIPLY(-d3, FIX_1_961570560); 858 859 tmp0 = z3 + z5; 860 tmp1 += z2; 861 tmp2 += z2; 862 tmp3 = z4 + z5; 863 } 864 } else { 865 if (d1) { 866 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ 867 z4 = d5 + d1; 868 869 z5 = MULTIPLY(z4, FIX_1_175875602); 870 z1 = MULTIPLY(-d1, FIX_0_899976223); 871 tmp3 = MULTIPLY(d1, FIX_0_601344887); 872 tmp1 = MULTIPLY(-d5, FIX_0_509795579); 873 z2 = MULTIPLY(-d5, FIX_2_562915447); 874 z4 = MULTIPLY(z4, FIX_0_785694958); 875 876 tmp0 = z1 + z5; 877 tmp1 += z4; 878 tmp2 = z2 + z5; 879 tmp3 += z4; 880 } else { 881 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ 882 tmp0 = MULTIPLY(d5, FIX_1_175875602); 883 tmp1 = MULTIPLY(d5, FIX_0_275899380); 884 tmp2 = MULTIPLY(-d5, FIX_1_387039845); 885 tmp3 = MULTIPLY(d5, FIX_0_785694958); 886 } 887 } 888 } else { 889 if (d3) { 890 if (d1) { 891 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ 892 z5 = d1 + d3; 893 tmp3 = MULTIPLY(d1, FIX_0_211164243); 894 tmp2 = MULTIPLY(-d3, FIX_1_451774981); 895 z1 = MULTIPLY(d1, FIX_1_061594337); 896 z2 = MULTIPLY(-d3, FIX_2_172734803); 897 z4 = MULTIPLY(z5, FIX_0_785694958); 898 z5 = MULTIPLY(z5, FIX_1_175875602); 899 900 tmp0 = z1 - z4; 901 
tmp1 = z2 + z4; 902 tmp2 += z5; 903 tmp3 += z5; 904 } else { 905 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ 906 tmp0 = MULTIPLY(-d3, FIX_0_785694958); 907 tmp1 = MULTIPLY(-d3, FIX_1_387039845); 908 tmp2 = MULTIPLY(-d3, FIX_0_275899380); 909 tmp3 = MULTIPLY(d3, FIX_1_175875602); 910 } 911 } else { 912 if (d1) { 913 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ 914 tmp0 = MULTIPLY(d1, FIX_0_275899380); 915 tmp1 = MULTIPLY(d1, FIX_0_785694958); 916 tmp2 = MULTIPLY(d1, FIX_1_175875602); 917 tmp3 = MULTIPLY(d1, FIX_1_387039845); 918 } else { 919 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ 920 tmp0 = tmp1 = tmp2 = tmp3 = 0; 921 } 922 } 923 } 924 } 925 926 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 927 928 dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3, 929 CONST_BITS+PASS1_BITS+3); 930 dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3, 931 CONST_BITS+PASS1_BITS+3); 932 dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2, 933 CONST_BITS+PASS1_BITS+3); 934 dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2, 935 CONST_BITS+PASS1_BITS+3); 936 dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1, 937 CONST_BITS+PASS1_BITS+3); 938 dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1, 939 CONST_BITS+PASS1_BITS+3); 940 dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0, 941 CONST_BITS+PASS1_BITS+3); 942 dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0, 943 CONST_BITS+PASS1_BITS+3); 944 945 dataptr++; /* advance pointer to next column */ 946 } 947} 948 949#undef DCTSIZE 950#define DCTSIZE 4 951#define DCTSTRIDE 8 952 953void ff_j_rev_dct4(DCTBLOCK data) 954{ 955 int32_t tmp0, tmp1, tmp2, tmp3; 956 int32_t tmp10, tmp11, tmp12, tmp13; 957 int32_t z1; 958 int32_t d0, d2, d4, d6; 959 register int16_t *dataptr; 960 int rowctr; 961 962 /* Pass 1: process rows. */ 963 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ 964 /* furthermore, we scale the results by 2**PASS1_BITS. 
*/ 965 966 data[0] += 4; 967 968 dataptr = data; 969 970 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { 971 /* Due to quantization, we will usually find that many of the input 972 * coefficients are zero, especially the AC terms. We can exploit this 973 * by short-circuiting the IDCT calculation for any row in which all 974 * the AC terms are zero. In that case each output is equal to the 975 * DC coefficient (with scale factor as needed). 976 * With typical images and quantization tables, half or more of the 977 * row DCT calculations can be simplified this way. 978 */ 979 980 register uint8_t *idataptr = (uint8_t*)dataptr; 981 982 d0 = dataptr[0]; 983 d2 = dataptr[1]; 984 d4 = dataptr[2]; 985 d6 = dataptr[3]; 986 987 if ((d2 | d4 | d6) == 0) { 988 /* AC terms all zero */ 989 if (d0) { 990 /* Compute a 32 bit value to assign. */ 991 int16_t dcval = (int16_t) (d0 << PASS1_BITS); 992 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000); 993 994 AV_WN32A(&idataptr[0], v); 995 AV_WN32A(&idataptr[4], v); 996 } 997 998 dataptr += DCTSTRIDE; /* advance pointer to next row */ 999 continue; 1000 } 1001 1002 /* Even part: reverse the even part of the forward DCT. */ 1003 /* The rotator is sqrt(2)*c(-6). 
*/ 1004 if (d6) { 1005 if (d2) { 1006 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ 1007 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 1008 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); 1009 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 1010 1011 tmp0 = (d0 + d4) << CONST_BITS; 1012 tmp1 = (d0 - d4) << CONST_BITS; 1013 1014 tmp10 = tmp0 + tmp3; 1015 tmp13 = tmp0 - tmp3; 1016 tmp11 = tmp1 + tmp2; 1017 tmp12 = tmp1 - tmp2; 1018 } else { 1019 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ 1020 tmp2 = MULTIPLY(-d6, FIX_1_306562965); 1021 tmp3 = MULTIPLY(d6, FIX_0_541196100); 1022 1023 tmp0 = (d0 + d4) << CONST_BITS; 1024 tmp1 = (d0 - d4) << CONST_BITS; 1025 1026 tmp10 = tmp0 + tmp3; 1027 tmp13 = tmp0 - tmp3; 1028 tmp11 = tmp1 + tmp2; 1029 tmp12 = tmp1 - tmp2; 1030 } 1031 } else { 1032 if (d2) { 1033 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ 1034 tmp2 = MULTIPLY(d2, FIX_0_541196100); 1035 tmp3 = MULTIPLY(d2, FIX_1_306562965); 1036 1037 tmp0 = (d0 + d4) << CONST_BITS; 1038 tmp1 = (d0 - d4) << CONST_BITS; 1039 1040 tmp10 = tmp0 + tmp3; 1041 tmp13 = tmp0 - tmp3; 1042 tmp11 = tmp1 + tmp2; 1043 tmp12 = tmp1 - tmp2; 1044 } else { 1045 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ 1046 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; 1047 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; 1048 } 1049 } 1050 1051 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 1052 1053 dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS); 1054 dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS); 1055 dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS); 1056 dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS); 1057 1058 dataptr += DCTSTRIDE; /* advance pointer to next row */ 1059 } 1060 1061 /* Pass 2: process columns. */ 1062 /* Note that we must descale the results by a factor of 8 == 2**3, */ 1063 /* and also undo the PASS1_BITS scaling. 
*/ 1064 1065 dataptr = data; 1066 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { 1067 /* Columns of zeroes can be exploited in the same way as we did with rows. 1068 * However, the row calculation has created many nonzero AC terms, so the 1069 * simplification applies less often (typically 5% to 10% of the time). 1070 * On machines with very fast multiplication, it's possible that the 1071 * test takes more time than it's worth. In that case this section 1072 * may be commented out. 1073 */ 1074 1075 d0 = dataptr[DCTSTRIDE*0]; 1076 d2 = dataptr[DCTSTRIDE*1]; 1077 d4 = dataptr[DCTSTRIDE*2]; 1078 d6 = dataptr[DCTSTRIDE*3]; 1079 1080 /* Even part: reverse the even part of the forward DCT. */ 1081 /* The rotator is sqrt(2)*c(-6). */ 1082 if (d6) { 1083 if (d2) { 1084 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ 1085 z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 1086 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); 1087 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 1088 1089 tmp0 = (d0 + d4) << CONST_BITS; 1090 tmp1 = (d0 - d4) << CONST_BITS; 1091 1092 tmp10 = tmp0 + tmp3; 1093 tmp13 = tmp0 - tmp3; 1094 tmp11 = tmp1 + tmp2; 1095 tmp12 = tmp1 - tmp2; 1096 } else { 1097 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ 1098 tmp2 = MULTIPLY(-d6, FIX_1_306562965); 1099 tmp3 = MULTIPLY(d6, FIX_0_541196100); 1100 1101 tmp0 = (d0 + d4) << CONST_BITS; 1102 tmp1 = (d0 - d4) << CONST_BITS; 1103 1104 tmp10 = tmp0 + tmp3; 1105 tmp13 = tmp0 - tmp3; 1106 tmp11 = tmp1 + tmp2; 1107 tmp12 = tmp1 - tmp2; 1108 } 1109 } else { 1110 if (d2) { 1111 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ 1112 tmp2 = MULTIPLY(d2, FIX_0_541196100); 1113 tmp3 = MULTIPLY(d2, FIX_1_306562965); 1114 1115 tmp0 = (d0 + d4) << CONST_BITS; 1116 tmp1 = (d0 - d4) << CONST_BITS; 1117 1118 tmp10 = tmp0 + tmp3; 1119 tmp13 = tmp0 - tmp3; 1120 tmp11 = tmp1 + tmp2; 1121 tmp12 = tmp1 - tmp2; 1122 } else { 1123 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ 1124 tmp10 = tmp13 = (d0 + d4) << CONST_BITS; 1125 tmp11 = tmp12 = (d0 - d4) << CONST_BITS; 1126 } 
1127 } 1128 1129 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 1130 1131 dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3); 1132 dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3); 1133 dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3); 1134 dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3); 1135 1136 dataptr++; /* advance pointer to next column */ 1137 } 1138} 1139 1140void ff_j_rev_dct2(DCTBLOCK data){ 1141 int d00, d01, d10, d11; 1142 1143 data[0] += 4; 1144 d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE]; 1145 d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE]; 1146 d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE]; 1147 d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE]; 1148 1149 data[0+0*DCTSTRIDE]= (d00 + d10)>>3; 1150 data[1+0*DCTSTRIDE]= (d01 + d11)>>3; 1151 data[0+1*DCTSTRIDE]= (d00 - d10)>>3; 1152 data[1+1*DCTSTRIDE]= (d01 - d11)>>3; 1153} 1154 1155void ff_j_rev_dct1(DCTBLOCK data){ 1156 data[0] = (data[0] + 4)>>3; 1157} 1158 1159#undef FIX 1160#undef CONST_BITS 1161 1162void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block) 1163{ 1164 ff_j_rev_dct(block); 1165 ff_put_pixels_clamped_c(block, dest, line_size); 1166} 1167 1168void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block) 1169{ 1170 ff_j_rev_dct(block); 1171 ff_add_pixels_clamped_c(block, dest, line_size); 1172} 1173