/*
 * VC-1 and WMV3 decoder - DSP functions
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * VC-1 and WMV3 decoder
 */

#include "config_components.h"

#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "h264chroma.h"
#include "qpeldsp.h"
#include "rnd_avg.h"
#include "vc1dsp.h"
#include "startcode.h"
#include "vc1_common.h"

/* Apply overlap transform to horizontal edge */
static void vc1_v_overlap_c(uint8_t *src, ptrdiff_t stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2 * stride];
        b  = src[-stride];
        c  = src[0];
        d  = src[stride];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2 * stride] = a - d1;
        src[-stride]     = av_clip_uint8(b - d2);
        src[0]           = av_clip_uint8(c + d2);
        src[stride]      = d + d1;
        src++;
        rnd = !rnd;
    }
}

/* Apply overlap transform to vertical edge */
static void vc1_h_overlap_c(uint8_t *src, ptrdiff_t stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2];
        b  = src[-1];
        c  = src[0];
        d  = src[1];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2] = a - d1;
        src[-1] = av_clip_uint8(b - d2);
        src[0]  = av_clip_uint8(c + d2);
        src[1]  = d + d1;
        src    += stride;
        rnd     = !rnd;
    }
}

static void vc1_v_s_overlap_c(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        top[48]   = ((a * 8) - d1 + rnd1) >> 3;
        top[56]   = ((b * 8) - d2 + rnd2) >> 3;
        bottom[0] = ((c * 8) + d2 + rnd1) >> 3;
        bottom[8] = ((d * 8) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}

static void vc1_h_s_overlap_c(int16_t *left, int16_t *right, ptrdiff_t left_stride, ptrdiff_t right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        left[6]  = ((a * 8) - d1 + rnd1) >> 3;
        left[7]  = ((b * 8) - d2 + rnd2) >> 3;
        right[0] = ((c * 8) + d2 + rnd1) >> 3;
        right[1] = ((d * 8) + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2 = 7 - rnd2;
            rnd1 = 7 - rnd1;
        }
    }
}
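
/*
 * Viewed as a matrix (and ignoring the alternating rounding offsets), the
 * four values written back by each of the overlap-smoothing functions above
 * are
 *
 *           [ 7  0  0  1 ]   [a]
 *     1/8 * [-1  7  1  1 ] * [b]
 *           [ 1  1  7 -1 ]   [c]
 *           [ 1  0  0  7 ]   [d]
 *
 * i.e. only the two pixels (or coefficients) on either side of the block
 * boundary are modified.  This is a derivation from the code, offered as a
 * reading aid rather than a statement from the specification.
 */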

/**
 * VC-1 in-loop deblocking filter for one line
 * @param src pointer to the first pixel past the edge being filtered
 * @param stride distance between pixels across the edge (perpendicular to it)
 * @param pq block quantizer
 * @return whether the other 3 pixel pairs of the segment should be filtered
 * @see 8.6
 */
static av_always_inline int vc1_filter_line(uint8_t *src, ptrdiff_t stride, int pq)
{
    int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
              5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
    int a0_sign = a0 >> 31;        /* Store sign */

    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
    if (a0 < pq) {
        int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
                        5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
        int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
                        5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
        if (a1 < a0 || a2 < a0) {
            int clip      = src[-1 * stride] - src[0 * stride];
            int clip_sign = clip >> 31;

            clip = ((clip ^ clip_sign) - clip_sign) >> 1;
            if (clip) {
                int a3     = FFMIN(a1, a2);
                int d      = 5 * (a3 - a0);
                int d_sign = (d >> 31);

                d       = ((d ^ d_sign) - d_sign) >> 3;
                d_sign ^= a0_sign;

                if (d_sign ^ clip_sign)
                    d = 0;
                else {
                    d = FFMIN(d, clip);
                    d = (d ^ d_sign) - d_sign; /* Restore sign */
                    src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
                    src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
                }
                return 1;
            }
        }
    }
    return 0;
}

/**
 * VC-1 in-loop deblocking filter
 * @param src pointer to the first pixel of the edge segment to filter
 * @param step distance between horizontally adjacent elements
 * @param stride distance between vertically adjacent elements
 * @param len edge length to filter (4, 8 or 16 pixels)
 * @param pq block quantizer
 * @see 8.6
 */
static inline void vc1_loop_filter(uint8_t *src, int step, ptrdiff_t stride,
                                   int len, int pq)
{
    int i;
    int filt3;

    for (i = 0; i < len; i += 4) {
        filt3 = vc1_filter_line(src + 2 * step, stride, pq);
        if (filt3) {
            vc1_filter_line(src + 0 * step, stride, pq);
            vc1_filter_line(src + 1 * step, stride, pq);
            vc1_filter_line(src + 3 * step, stride, pq);
        }
        src += step * 4;
    }
}

static void vc1_v_loop_filter4_c(uint8_t *src, ptrdiff_t stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 4, pq);
}

static void vc1_h_loop_filter4_c(uint8_t *src, ptrdiff_t stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 4, pq);
}

static void vc1_v_loop_filter8_c(uint8_t *src, ptrdiff_t stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 8, pq);
}

static void vc1_h_loop_filter8_c(uint8_t *src, ptrdiff_t stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 8, pq);
}

static void vc1_v_loop_filter16_c(uint8_t *src, ptrdiff_t stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 16, pq);
}

static void vc1_h_loop_filter16_c(uint8_t *src, ptrdiff_t stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 16, pq);
}

/* Do inverse transform on 8x8 block */
static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int i;
    int dc = block[0];

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    for (i = 0; i < 8; i++) {
        dest[0] = av_clip_uint8(dest[0] + dc);
        dest[1] = av_clip_uint8(dest[1] + dc);
        dest[2] = av_clip_uint8(dest[2] + dc);
        dest[3] = av_clip_uint8(dest[3] + dc);
        dest[4] = av_clip_uint8(dest[4] + dc);
        dest[5] = av_clip_uint8(dest[5] + dc);
        dest[6] = av_clip_uint8(dest[6] + dc);
        dest[7] = av_clip_uint8(dest[7] + dc);
        dest += stride;
    }
}

static void vc1_inv_trans_8x8_c(int16_t block[64])
{
    int i;
    register int t1, t2, t3, t4, t5, t6, t7, t8;
    int16_t *src, *dst, temp[64];

    src = block;
    dst = temp;
    for (i = 0; i < 8; i++) {
        t1 = 12 * (src[ 0] + src[32]) + 4;
        t2 = 12 * (src[ 0] - src[32]) + 4;
        t3 = 16 * src[16] +  6 * src[48];
        t4 =  6 * src[16] - 16 * src[48];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
        t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
        t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
        t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];

        dst[0] = (t5 + t1) >> 3;
        dst[1] = (t6 + t2) >> 3;
        dst[2] = (t7 + t3) >> 3;
        dst[3] = (t8 + t4) >> 3;
        dst[4] = (t8 - t4) >> 3;
        dst[5] = (t7 - t3) >> 3;
        dst[6] = (t6 - t2) >> 3;
        dst[7] = (t5 - t1) >> 3;

        src += 1;
        dst += 8;
    }

    src = temp;
    dst = block;
    for (i = 0; i < 8; i++) {
        t1 = 12 * (src[ 0] + src[32]) + 64;
        t2 = 12 * (src[ 0] - src[32]) + 64;
        t3 = 16 * src[16] +  6 * src[48];
        t4 =  6 * src[16] - 16 * src[48];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
        t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
        t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
        t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];

        dst[ 0] = (t5 + t1) >> 7;
        dst[ 8] = (t6 + t2) >> 7;
        dst[16] = (t7 + t3) >> 7;
        dst[24] = (t8 + t4) >> 7;
        dst[32] = (t8 - t4 + 1) >> 7;
        dst[40] = (t7 - t3 + 1) >> 7;
        dst[48] = (t6 - t2 + 1) >> 7;
        dst[56] = (t5 - t1 + 1) >> 7;

        src++;
        dst++;
    }
}

/* Do inverse transform on 8x4 part of block */
static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int i;
    int dc = block[0];

    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    for (i = 0; i < 4; i++) {
        dest[0] = av_clip_uint8(dest[0] + dc);
        dest[1] = av_clip_uint8(dest[1] + dc);
        dest[2] = av_clip_uint8(dest[2] + dc);
        dest[3] = av_clip_uint8(dest[3] + dc);
        dest[4] = av_clip_uint8(dest[4] + dc);
        dest[5] = av_clip_uint8(dest[5] + dc);
        dest[6] = av_clip_uint8(dest[6] + dc);
        dest[7] = av_clip_uint8(dest[7] + dc);
        dest += stride;
    }
}

static void vc1_inv_trans_8x4_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int i;
    register int t1, t2, t3, t4, t5, t6, t7, t8;
    int16_t *src, *dst;

    src = block;
    dst = block;

    for (i = 0; i < 4; i++) {
        t1 = 12 * (src[0] + src[4]) + 4;
        t2 = 12 * (src[0] - src[4]) + 4;
        t3 = 16 * src[2] +  6 * src[6];
        t4 =  6 * src[2] - 16 * src[6];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7];
        t2 = 15 * src[1] -  4 * src[3] - 16 * src[5] -  9 * src[7];
        t3 =  9 * src[1] - 16 * src[3] +  4 * src[5] + 15 * src[7];
        t4 =  4 * src[1] -  9 * src[3] + 15 * src[5] - 16 * src[7];

        dst[0] = (t5 + t1) >> 3;
        dst[1] = (t6 + t2) >> 3;
        dst[2] = (t7 + t3) >> 3;
        dst[3] = (t8 + t4) >> 3;
        dst[4] = (t8 - t4) >> 3;
        dst[5] = (t7 - t3) >> 3;
        dst[6] = (t6 - t2) >> 3;
        dst[7] = (t5 - t1) >> 3;

        src += 8;
        dst += 8;
    }

    src = block;
    for (i = 0; i < 8; i++) {
        t1 = 17 * (src[ 0] + src[16]) + 64;
        t2 = 17 * (src[ 0] - src[16]) + 64;
        t3 = 22 * src[ 8] + 10 * src[24];
        t4 = 22 * src[24] - 10 * src[ 8];

        dest[0 * stride] = av_clip_uint8(dest[0 * stride] + ((t1 + t3) >> 7));
        dest[1 * stride] = av_clip_uint8(dest[1 * stride] + ((t2 - t4) >> 7));
        dest[2 * stride] = av_clip_uint8(dest[2 * stride] + ((t2 + t4) >> 7));
        dest[3 * stride] = av_clip_uint8(dest[3 * stride] + ((t1 - t3) >> 7));

        src++;
        dest++;
    }
}

/* Do inverse transform on 4x8 parts of block */
static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int i;
    int dc = block[0];

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    for (i = 0; i < 8; i++) {
        dest[0] = av_clip_uint8(dest[0] + dc);
        dest[1] = av_clip_uint8(dest[1] + dc);
        dest[2] = av_clip_uint8(dest[2] + dc);
        dest[3] = av_clip_uint8(dest[3] + dc);
        dest += stride;
    }
}

static void vc1_inv_trans_4x8_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int i;
    register int t1, t2, t3, t4, t5, t6, t7, t8;
    int16_t *src, *dst;

    src = block;
    dst = block;

    for (i = 0; i < 8; i++) {
        t1 = 17 * (src[0] + src[2]) + 4;
        t2 = 17 * (src[0] - src[2]) + 4;
        t3 = 22 * src[1] + 10 * src[3];
        t4 = 22 * src[3] - 10 * src[1];

        dst[0] = (t1 + t3) >> 3;
        dst[1] = (t2 - t4) >> 3;
        dst[2] = (t2 + t4) >> 3;
        dst[3] = (t1 - t3) >> 3;

        src += 8;
        dst += 8;
    }

    src = block;
    for (i = 0; i < 4; i++) {
        t1 = 12 * (src[ 0] + src[32]) + 64;
        t2 = 12 * (src[ 0] - src[32]) + 64;
        t3 = 16 * src[16] +  6 * src[48];
        t4 =  6 * src[16] - 16 * src[48];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
        t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
        t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
        t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];

        dest[0 * stride] = av_clip_uint8(dest[0 * stride] + ((t5 + t1) >> 7));
        dest[1 * stride] = av_clip_uint8(dest[1 * stride] + ((t6 + t2) >> 7));
        dest[2 * stride] = av_clip_uint8(dest[2 * stride] + ((t7 + t3) >> 7));
        dest[3 * stride] = av_clip_uint8(dest[3 * stride] + ((t8 + t4) >> 7));
        dest[4 * stride] = av_clip_uint8(dest[4 * stride] + ((t8 - t4 + 1) >> 7));
        dest[5 * stride] = av_clip_uint8(dest[5 * stride] + ((t7 - t3 + 1) >> 7));
        dest[6 * stride] = av_clip_uint8(dest[6 * stride] + ((t6 - t2 + 1) >> 7));
        dest[7 * stride] = av_clip_uint8(dest[7 * stride] + ((t5 - t1 + 1) >> 7));

        src++;
        dest++;
    }
}

/* Do inverse transform on 4x4 part of block */
static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int i;
    int dc = block[0];

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    for (i = 0; i < 4; i++) {
        dest[0] = av_clip_uint8(dest[0] + dc);
        dest[1] = av_clip_uint8(dest[1] + dc);
        dest[2] = av_clip_uint8(dest[2] + dc);
        dest[3] = av_clip_uint8(dest[3] + dc);
        dest += stride;
    }
}

static void vc1_inv_trans_4x4_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int i;
    register int t1, t2, t3, t4;
    int16_t *src, *dst;

    src = block;
    dst = block;
    for (i = 0; i < 4; i++) {
        t1 = 17 * (src[0] + src[2]) + 4;
        t2 = 17 * (src[0] - src[2]) + 4;
        t3 = 22 * src[1] + 10 * src[3];
        t4 = 22 * src[3] - 10 * src[1];

        dst[0] = (t1 + t3) >> 3;
        dst[1] = (t2 - t4) >> 3;
        dst[2] = (t2 + t4) >> 3;
        dst[3] = (t1 - t3) >> 3;

        src += 8;
        dst += 8;
    }

    src = block;
    for (i = 0; i < 4; i++) {
        t1 = 17 * (src[0] + src[16]) + 64;
        t2 = 17 * (src[0] - src[16]) + 64;
        t3 = 22 * src[8] + 10 * src[24];
        t4 = 22 * src[24] - 10 * src[8];

        dest[0 * stride] = av_clip_uint8(dest[0 * stride] + ((t1 + t3) >> 7));
        dest[1 * stride] = av_clip_uint8(dest[1 * stride] + ((t2 - t4) >> 7));
        dest[2 * stride] = av_clip_uint8(dest[2 * stride] + ((t2 + t4) >> 7));
        dest[3 * stride] = av_clip_uint8(dest[3 * stride] + ((t1 - t3) >> 7));

        src++;
        dest++;
    }
}

/* motion compensation functions */

/* Filter in case of 2 filters */
#define VC1_MSPEL_FILTER_16B(DIR, TYPE)                                       \
static av_always_inline int vc1_mspel_ ## DIR ## _filter_16bits(const TYPE *src, \
                                                                 int stride,  \
                                                                 int mode)    \
{                                                                             \
    switch (mode) {                                                           \
    case 0: /* no shift - should not occur */                                 \
        return 0;                                                             \
    case 1: /* 1/4 shift */                                                   \
        return -4 * src[-stride] + 53 * src[0] +                              \
               18 * src[stride] - 3 * src[stride * 2];                        \
    case 2: /* 1/2 shift */                                                   \
        return -1 * src[-stride] + 9 * src[0] +                               \
                9 * src[stride] - 1 * src[stride * 2];                        \
    case 3: /* 3/4 shift */                                                   \
        return -3 * src[-stride] + 18 * src[0] +                              \
               53 * src[stride] - 4 * src[stride * 2];                        \
    }                                                                         \
    return 0; /* should not occur */                                          \
}

VC1_MSPEL_FILTER_16B(ver, uint8_t)
VC1_MSPEL_FILTER_16B(hor, int16_t)

/* Filter used to interpolate fractional pel values */
static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride,
                                             int mode, int r)
{
    switch (mode) {
    case 0: // no shift
        return src[0];
    case 1: // 1/4 shift
        return (-4 * src[-stride] + 53 * src[0] +
                18 * src[stride] - 3 * src[stride * 2] + 32 - r) >> 6;
    case 2: // 1/2 shift
        return (-1 * src[-stride] + 9 * src[0] +
                 9 * src[stride] - 1 * src[stride * 2] + 8 - r) >> 4;
    case 3: // 3/4 shift
        return (-3 * src[-stride] + 18 * src[0] +
                53 * src[stride] - 4 * src[stride * 2] + 32 - r) >> 6;
    }
    return 0; // should not occur
}
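
/*
 * Normalization in the separable (hmode && vmode) path of VC1_MSPEL_MC below,
 * spelled out as a reading aid (derived from the taps above, not stated in
 * the original comments): the 1/4- and 3/4-pel taps (-4, 53, 18, -3) sum to
 * 64 and the 1/2-pel taps (-1, 9, 9, -1) sum to 16.  shift_value[] is chosen
 * so that the weight left after the vertical pass, multiplied by the
 * horizontal tap sum, is always 128, which the final '>> 7' removes.
 * E.g. vmode = 1, hmode = 2: shift = (5 + 1) >> 1 = 3 and (64 >> 3) * 16 = 128.
 */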

/* Function used to do motion compensation with bicubic interpolation */
#define VC1_MSPEL_MC(OP, OP4, OPNAME)                                         \
static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,            \
                                                    const uint8_t *src,      \
                                                    ptrdiff_t stride,        \
                                                    int hmode,               \
                                                    int vmode,               \
                                                    int rnd)                 \
{                                                                             \
    int i, j;                                                                 \
                                                                              \
    if (vmode) { /* Vertical filter to apply */                              \
        int r;                                                                \
                                                                              \
        if (hmode) { /* Horizontal filter too: vertical pass goes to tmp */  \
            static const int shift_value[] = { 0, 5, 1, 5 };                  \
            int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;       \
            int16_t tmp[11 * 8], *tptr = tmp;                                 \
                                                                              \
            r = (1 << (shift - 1)) + rnd - 1;                                 \
                                                                              \
            src -= 1;                                                         \
            for (j = 0; j < 8; j++) {                                         \
                for (i = 0; i < 11; i++)                                      \
                    tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode) + r) >> shift; \
                src  += stride;                                               \
                tptr += 11;                                                   \
            }                                                                 \
                                                                              \
            r    = 64 - rnd;                                                  \
            tptr = tmp + 1;                                                   \
            for (j = 0; j < 8; j++) {                                         \
                for (i = 0; i < 8; i++)                                       \
                    OP(dst[i], (vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode) + r) >> 7); \
                dst  += stride;                                               \
                tptr += 11;                                                   \
            }                                                                 \
                                                                              \
            return;                                                           \
        } else { /* No horizontal filter, output 8 lines to dst */           \
            r = 1 - rnd;                                                      \
                                                                              \
            for (j = 0; j < 8; j++) {                                         \
                for (i = 0; i < 8; i++)                                       \
                    OP(dst[i], vc1_mspel_filter(src + i, stride, vmode, r));  \
                src += stride;                                                \
                dst += stride;                                                \
            }                                                                 \
            return;                                                           \
        }                                                                     \
    }                                                                         \
                                                                              \
    /* Horizontal mode with no vertical mode */                               \
    for (j = 0; j < 8; j++) {                                                 \
        for (i = 0; i < 8; i++)                                               \
            OP(dst[i], vc1_mspel_filter(src + i, 1, hmode, rnd));             \
        dst += stride;                                                        \
        src += stride;                                                        \
    }                                                                         \
}                                                                             \
static av_always_inline void OPNAME ## vc1_mspel_mc_16(uint8_t *dst,         \
                                                       const uint8_t *src,   \
                                                       ptrdiff_t stride,     \
                                                       int hmode,            \
                                                       int vmode,            \
                                                       int rnd)              \
{                                                                             \
    int i, j;                                                                 \
                                                                              \
    if (vmode) { /* Vertical filter to apply */                              \
        int r;                                                                \
                                                                              \
        if (hmode) { /* Horizontal filter too: vertical pass goes to tmp */  \
            static const int shift_value[] = { 0, 5, 1, 5 };                  \
            int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;       \
            int16_t tmp[19 * 16], *tptr = tmp;                                \
                                                                              \
            r = (1 << (shift - 1)) + rnd - 1;                                 \
                                                                              \
            src -= 1;                                                         \
            for (j = 0; j < 16; j++) {                                        \
                for (i = 0; i < 19; i++)                                      \
                    tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode) + r) >> shift; \
                src  += stride;                                               \
                tptr += 19;                                                   \
            }                                                                 \
                                                                              \
            r    = 64 - rnd;                                                  \
            tptr = tmp + 1;                                                   \
            for (j = 0; j < 16; j++) {                                        \
                for (i = 0; i < 16; i++)                                      \
                    OP(dst[i], (vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode) + r) >> 7); \
                dst  += stride;                                               \
                tptr += 19;                                                   \
            }                                                                 \
                                                                              \
            return;                                                           \
        } else { /* No horizontal filter, output 16 lines to dst */          \
            r = 1 - rnd;                                                      \
                                                                              \
            for (j = 0; j < 16; j++) {                                        \
                for (i = 0; i < 16; i++)                                      \
                    OP(dst[i], vc1_mspel_filter(src + i, stride, vmode, r));  \
                src += stride;                                                \
                dst += stride;                                                \
            }                                                                 \
            return;                                                           \
        }                                                                     \
    }                                                                         \
                                                                              \
    /* Horizontal mode with no vertical mode */                               \
    for (j = 0; j < 16; j++) {                                                \
        for (i = 0; i < 16; i++)                                              \
            OP(dst[i], vc1_mspel_filter(src + i, 1, hmode, rnd));             \
        dst += stride;                                                        \
        src += stride;                                                        \
    }                                                                         \
}                                                                             \
static void OPNAME ## pixels8x8_c(uint8_t *block, const uint8_t *pixels,     \
                                  ptrdiff_t line_size, int rnd)              \
{                                                                             \
    int i;                                                                    \
    for (i = 0; i < 8; i++) {                                                 \
        OP4(*(uint32_t *)(block),     AV_RN32(pixels));                       \
        OP4(*(uint32_t *)(block + 4), AV_RN32(pixels + 4));                   \
        pixels += line_size;                                                  \
        block  += line_size;                                                  \
    }                                                                         \
}                                                                             \
static void OPNAME ## pixels16x16_c(uint8_t *block, const uint8_t *pixels,   \
                                    ptrdiff_t line_size, int rnd)            \
{                                                                             \
    int i;                                                                    \
    for (i = 0; i < 16; i++) {                                                \
        OP4(*(uint32_t *)(block),      AV_RN32(pixels));                      \
        OP4(*(uint32_t *)(block +  4), AV_RN32(pixels +  4));                 \
        OP4(*(uint32_t *)(block +  8), AV_RN32(pixels +  8));                 \
        OP4(*(uint32_t *)(block + 12), AV_RN32(pixels + 12));                 \
        pixels += line_size;                                                  \
        block  += line_size;                                                  \
    }                                                                         \
}

#define op_put(a, b)  (a) = av_clip_uint8(b)
#define op_avg(a, b)  (a) = ((a) + av_clip_uint8(b) + 1) >> 1
#define op4_avg(a, b) (a) = rnd_avg32(a, b)
#define op4_put(a, b) (a) = (b)

VC1_MSPEL_MC(op_put, op4_put, put_)
VC1_MSPEL_MC(op_avg, op4_avg, avg_)

/* pixel functions - really are entry points to vc1_mspel_mc */

#define PUT_VC1_MSPEL(a, b)                                                   \
static void put_vc1_mspel_mc ## a ## b ## _c(uint8_t *dst,                   \
                                             const uint8_t *src,             \
                                             ptrdiff_t stride, int rnd)      \
{                                                                             \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                            \
}                                                                             \
static void avg_vc1_mspel_mc ## a ## b ## _c(uint8_t *dst,                   \
                                             const uint8_t *src,             \
                                             ptrdiff_t stride, int rnd)      \
{                                                                             \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                            \
}                                                                             \
static void put_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                \
                                                const uint8_t *src,          \
                                                ptrdiff_t stride, int rnd)   \
{                                                                             \
    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
}                                                                             \
static void avg_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                \
                                                const uint8_t *src,          \
                                                ptrdiff_t stride, int rnd)   \
{                                                                             \
    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
}

PUT_VC1_MSPEL(1, 0)
PUT_VC1_MSPEL(2, 0)
PUT_VC1_MSPEL(3, 0)

PUT_VC1_MSPEL(0, 1)
PUT_VC1_MSPEL(1, 1)
PUT_VC1_MSPEL(2, 1)
PUT_VC1_MSPEL(3, 1)

PUT_VC1_MSPEL(0, 2)
PUT_VC1_MSPEL(1, 2)
PUT_VC1_MSPEL(2, 2)
PUT_VC1_MSPEL(3, 2)

PUT_VC1_MSPEL(0, 3)
PUT_VC1_MSPEL(1, 3)
PUT_VC1_MSPEL(2, 3)
PUT_VC1_MSPEL(3, 3)

#define chroma_mc(a) \
    ((A * src[a] + B * src[a + 1] + \
      C * src[stride + a] + D * src[stride + a + 1] + 32 - 4) >> 6)
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
                                        uint8_t *src /* align 1 */,
                                        ptrdiff_t stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    for (i = 0; i < h; i++) {
        dst[0] = chroma_mc(0);
        dst[1] = chroma_mc(1);
        dst[2] = chroma_mc(2);
        dst[3] = chroma_mc(3);
        dst[4] = chroma_mc(4);
        dst[5] = chroma_mc(5);
        dst[6] = chroma_mc(6);
        dst[7] = chroma_mc(7);
        dst += stride;
        src += stride;
    }
}

static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
                                        ptrdiff_t stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    for (i = 0; i < h; i++) {
        dst[0] = chroma_mc(0);
        dst[1] = chroma_mc(1);
        dst[2] = chroma_mc(2);
        dst[3] = chroma_mc(3);
        dst += stride;
        src += stride;
    }
}

#define avg2(a, b) (((a) + (b) + 1) >> 1)
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
                                        uint8_t *src /* align 1 */,
                                        ptrdiff_t stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    for (i = 0; i < h; i++) {
        dst[0] = avg2(dst[0], chroma_mc(0));
        dst[1] = avg2(dst[1], chroma_mc(1));
        dst[2] = avg2(dst[2], chroma_mc(2));
        dst[3] = avg2(dst[3], chroma_mc(3));
        dst[4] = avg2(dst[4], chroma_mc(4));
        dst[5] = avg2(dst[5], chroma_mc(5));
        dst[6] = avg2(dst[6], chroma_mc(6));
        dst[7] = avg2(dst[7], chroma_mc(7));
        dst += stride;
        src += stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc4_c(uint8_t *dst /* align 8 */,
                                        uint8_t *src /* align 1 */,
                                        ptrdiff_t stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    for (i = 0; i < h; i++) {
        dst[0] = avg2(dst[0], chroma_mc(0));
        dst[1] = avg2(dst[1], chroma_mc(1));
        dst[2] = avg2(dst[2], chroma_mc(2));
        dst[3] = avg2(dst[3], chroma_mc(3));
        dst += stride;
        src += stride;
    }
}

#if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER

static void sprite_h_c(uint8_t *dst, const uint8_t *src, int offset,
                       int advance, int count)
{
    while (count--) {
        int a = src[(offset >> 16)];
        int b = src[(offset >> 16) + 1];
        *dst++  = a + ((b - a) * (offset & 0xFFFF) >> 16);
        offset += advance;
    }
}

static av_always_inline void sprite_v_template(uint8_t *dst,
                                               const uint8_t *src1a,
                                               const uint8_t *src1b,
                                               int offset1,
                                               int two_sprites,
                                               const uint8_t *src2a,
                                               const uint8_t *src2b,
                                               int offset2,
                                               int alpha, int scaled,
                                               int width)
{
    int a1, b1, a2, b2;
    while (width--) {
        a1 = *src1a++;
        if (scaled) {
            b1 = *src1b++;
            a1 = a1 + ((b1 - a1) * offset1 >> 16);
        }
        if (two_sprites) {
            a2 = *src2a++;
            if (scaled > 1) {
                b2 = *src2b++;
                a2 = a2 + ((b2 - a2) * offset2 >> 16);
            }
            a1 = a1 + ((a2 - a1) * alpha >> 16);
        }
        *dst++ = a1;
    }
}

static void sprite_v_single_c(uint8_t *dst, const uint8_t *src1a,
                              const uint8_t *src1b,
                              int offset, int width)
{
    sprite_v_template(dst, src1a, src1b, offset, 0, NULL, NULL, 0, 0, 1, width);
}

static void sprite_v_double_noscale_c(uint8_t *dst, const uint8_t *src1a,
                                      const uint8_t *src2a,
                                      int alpha, int width)
{
    sprite_v_template(dst, src1a, NULL, 0, 1, src2a, NULL, 0, alpha, 0, width);
}

static void sprite_v_double_onescale_c(uint8_t *dst,
                                       const uint8_t *src1a,
                                       const uint8_t *src1b,
                                       int offset1,
                                       const uint8_t *src2a,
                                       int alpha, int width)
{
    sprite_v_template(dst, src1a, src1b, offset1, 1, src2a, NULL, 0, alpha, 1,
                      width);
}

static void sprite_v_double_twoscale_c(uint8_t *dst,
                                       const uint8_t *src1a,
                                       const uint8_t *src1b,
                                       int offset1,
                                       const uint8_t *src2a,
                                       const uint8_t *src2b,
                                       int offset2,
                                       int alpha,
                                       int width)
{
    sprite_v_template(dst, src1a, src1b, offset1, 1, src2a, src2b, offset2,
                      alpha, 2, width);
}

#endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */

#define FN_ASSIGN(X, Y) \
    dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = put_vc1_mspel_mc##X##Y##_c; \
    dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = put_vc1_mspel_mc##X##Y##_16_c; \
    dsp->avg_vc1_mspel_pixels_tab[1][X+4*Y] = avg_vc1_mspel_mc##X##Y##_c; \
    dsp->avg_vc1_mspel_pixels_tab[0][X+4*Y] = avg_vc1_mspel_mc##X##Y##_16_c

av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
{
    dsp->vc1_inv_trans_8x8    = vc1_inv_trans_8x8_c;
    dsp->vc1_inv_trans_4x8    = vc1_inv_trans_4x8_c;
    dsp->vc1_inv_trans_8x4    = vc1_inv_trans_8x4_c;
    dsp->vc1_inv_trans_4x4    = vc1_inv_trans_4x4_c;
    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_c;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_c;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_c;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c;

    dsp->vc1_h_overlap   = vc1_h_overlap_c;
    dsp->vc1_v_overlap   = vc1_v_overlap_c;
    dsp->vc1_h_s_overlap = vc1_h_s_overlap_c;
    dsp->vc1_v_s_overlap = vc1_v_s_overlap_c;

    dsp->vc1_v_loop_filter4  = vc1_v_loop_filter4_c;
    dsp->vc1_h_loop_filter4  = vc1_h_loop_filter4_c;
    dsp->vc1_v_loop_filter8  = vc1_v_loop_filter8_c;
    dsp->vc1_h_loop_filter8  = vc1_h_loop_filter8_c;
    dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_c;
    dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_c;

    dsp->put_vc1_mspel_pixels_tab[0][0] = put_pixels16x16_c;
    dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_pixels16x16_c;
    dsp->put_vc1_mspel_pixels_tab[1][0] = put_pixels8x8_c;
    dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_pixels8x8_c;
    FN_ASSIGN(0, 1);
    FN_ASSIGN(0, 2);
    FN_ASSIGN(0, 3);

    FN_ASSIGN(1, 0);
    FN_ASSIGN(1, 1);
    FN_ASSIGN(1, 2);
    FN_ASSIGN(1, 3);

    FN_ASSIGN(2, 0);
    FN_ASSIGN(2, 1);
    FN_ASSIGN(2, 2);
    FN_ASSIGN(2, 3);

    FN_ASSIGN(3, 0);
    FN_ASSIGN(3, 1);
    FN_ASSIGN(3, 2);
    FN_ASSIGN(3, 3);

    dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_c;
    dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_c;
    dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = put_no_rnd_vc1_chroma_mc4_c;
    dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = avg_no_rnd_vc1_chroma_mc4_c;

#if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
    dsp->sprite_h                 = sprite_h_c;
    dsp->sprite_v_single          = sprite_v_single_c;
    dsp->sprite_v_double_noscale  = sprite_v_double_noscale_c;
    dsp->sprite_v_double_onescale = sprite_v_double_onescale_c;
    dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c;
#endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */

    dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
    dsp->vc1_unescape_buffer      = vc1_unescape_buffer;

#if ARCH_AARCH64
    ff_vc1dsp_init_aarch64(dsp);
#elif ARCH_ARM
    ff_vc1dsp_init_arm(dsp);
#elif ARCH_PPC
    ff_vc1dsp_init_ppc(dsp);
#elif ARCH_X86
    ff_vc1dsp_init_x86(dsp);
#elif ARCH_MIPS
    ff_vc1dsp_init_mips(dsp);
#elif ARCH_LOONGARCH
    ff_vc1dsp_init_loongarch(dsp);
#endif
}
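
/*
 * Minimal usage sketch (illustration only; how callers wire this up is an
 * assumption, only ff_vc1dsp_init() itself is defined in this file): a
 * decoder embeds a VC1DSPContext, initializes it once, then calls through
 * the function pointers so the per-architecture init functions above can
 * override the C fallbacks transparently:
 *
 *     VC1DSPContext dsp;
 *     ff_vc1dsp_init(&dsp);
 *     dsp.vc1_inv_trans_8x8(block);             // in-place 8x8 inverse transform
 *     dsp.vc1_v_loop_filter8(dst, stride, pq);  // deblock one 8-pixel edge
 */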