1/* 2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 3 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22/** 23 * @file 24 * H.264 / AVC / MPEG-4 part10 prediction functions. 25 * @author Michael Niedermayer <michaelni@gmx.at> 26 */ 27 28#include "libavutil/intreadwrite.h" 29 30#include "mathops.h" 31 32#include "bit_depth_template.c" 33 34static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, 35 ptrdiff_t _stride) 36{ 37 pixel *src = (pixel*)_src; 38 int stride = _stride>>(sizeof(pixel)-1); 39 const pixel4 a= AV_RN4PA(src-stride); 40 41 AV_WN4PA(src+0*stride, a); 42 AV_WN4PA(src+1*stride, a); 43 AV_WN4PA(src+2*stride, a); 44 AV_WN4PA(src+3*stride, a); 45} 46 47static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, 48 ptrdiff_t _stride) 49{ 50 pixel *src = (pixel*)_src; 51 int stride = _stride>>(sizeof(pixel)-1); 52 AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride])); 53 AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride])); 54 AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride])); 55 AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride])); 56} 57 58static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t 
*topright, 59 ptrdiff_t _stride) 60{ 61 pixel *src = (pixel*)_src; 62 int stride = _stride>>(sizeof(pixel)-1); 63 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] 64 + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; 65 const pixel4 a = PIXEL_SPLAT_X4(dc); 66 67 AV_WN4PA(src+0*stride, a); 68 AV_WN4PA(src+1*stride, a); 69 AV_WN4PA(src+2*stride, a); 70 AV_WN4PA(src+3*stride, a); 71} 72 73static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, 74 ptrdiff_t _stride) 75{ 76 pixel *src = (pixel*)_src; 77 int stride = _stride>>(sizeof(pixel)-1); 78 const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; 79 const pixel4 a = PIXEL_SPLAT_X4(dc); 80 81 AV_WN4PA(src+0*stride, a); 82 AV_WN4PA(src+1*stride, a); 83 AV_WN4PA(src+2*stride, a); 84 AV_WN4PA(src+3*stride, a); 85} 86 87static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, 88 ptrdiff_t _stride) 89{ 90 pixel *src = (pixel*)_src; 91 int stride = _stride>>(sizeof(pixel)-1); 92 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; 93 const pixel4 a = PIXEL_SPLAT_X4(dc); 94 95 AV_WN4PA(src+0*stride, a); 96 AV_WN4PA(src+1*stride, a); 97 AV_WN4PA(src+2*stride, a); 98 AV_WN4PA(src+3*stride, a); 99} 100 101static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, 102 ptrdiff_t _stride) 103{ 104 pixel *src = (pixel*)_src; 105 int stride = _stride>>(sizeof(pixel)-1); 106 const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)); 107 108 AV_WN4PA(src+0*stride, a); 109 AV_WN4PA(src+1*stride, a); 110 AV_WN4PA(src+2*stride, a); 111 AV_WN4PA(src+3*stride, a); 112} 113 114 115#define LOAD_TOP_RIGHT_EDGE\ 116 const unsigned av_unused t4 = topright[0];\ 117 const unsigned av_unused t5 = topright[1];\ 118 const unsigned av_unused t6 = topright[2];\ 119 const unsigned av_unused t7 = topright[3];\ 120 121#define LOAD_DOWN_LEFT_EDGE\ 122 const unsigned 
av_unused l4 = src[-1+4*stride];\ 123 const unsigned av_unused l5 = src[-1+5*stride];\ 124 const unsigned av_unused l6 = src[-1+6*stride];\ 125 const unsigned av_unused l7 = src[-1+7*stride];\ 126 127#define LOAD_LEFT_EDGE\ 128 const unsigned av_unused l0 = src[-1+0*stride];\ 129 const unsigned av_unused l1 = src[-1+1*stride];\ 130 const unsigned av_unused l2 = src[-1+2*stride];\ 131 const unsigned av_unused l3 = src[-1+3*stride];\ 132 133#define LOAD_TOP_EDGE\ 134 const unsigned av_unused t0 = src[ 0-1*stride];\ 135 const unsigned av_unused t1 = src[ 1-1*stride];\ 136 const unsigned av_unused t2 = src[ 2-1*stride];\ 137 const unsigned av_unused t3 = src[ 3-1*stride];\ 138 139static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, 140 ptrdiff_t _stride) 141{ 142 pixel *src = (pixel*)_src; 143 int stride = _stride>>(sizeof(pixel)-1); 144 const int lt= src[-1-1*stride]; 145 LOAD_TOP_EDGE 146 LOAD_LEFT_EDGE 147 148 src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; 149 src[0+2*stride]= 150 src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; 151 src[0+1*stride]= 152 src[1+2*stride]= 153 src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; 154 src[0+0*stride]= 155 src[1+1*stride]= 156 src[2+2*stride]= 157 src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; 158 src[1+0*stride]= 159 src[2+1*stride]= 160 src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; 161 src[2+0*stride]= 162 src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; 163 src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; 164} 165 166static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, 167 ptrdiff_t _stride) 168{ 169 pixel *src = (pixel*)_src; 170 const pixel *topright = (const pixel*)_topright; 171 int stride = _stride>>(sizeof(pixel)-1); 172 LOAD_TOP_EDGE 173 LOAD_TOP_RIGHT_EDGE 174// LOAD_LEFT_EDGE 175 176 src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; 177 src[1+0*stride]= 178 src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; 179 src[2+0*stride]= 180 src[1+1*stride]= 181 src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; 182 src[3+0*stride]= 183 
src[2+1*stride]= 184 src[1+2*stride]= 185 src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; 186 src[3+1*stride]= 187 src[2+2*stride]= 188 src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; 189 src[3+2*stride]= 190 src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; 191 src[3+3*stride]=(t6 + 3*t7 + 2)>>2; 192} 193 194static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, 195 const uint8_t *topright, 196 ptrdiff_t _stride) 197{ 198 pixel *src = (pixel*)_src; 199 int stride = _stride>>(sizeof(pixel)-1); 200 const int lt= src[-1-1*stride]; 201 LOAD_TOP_EDGE 202 LOAD_LEFT_EDGE 203 204 src[0+0*stride]= 205 src[1+2*stride]=(lt + t0 + 1)>>1; 206 src[1+0*stride]= 207 src[2+2*stride]=(t0 + t1 + 1)>>1; 208 src[2+0*stride]= 209 src[3+2*stride]=(t1 + t2 + 1)>>1; 210 src[3+0*stride]=(t2 + t3 + 1)>>1; 211 src[0+1*stride]= 212 src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; 213 src[1+1*stride]= 214 src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; 215 src[2+1*stride]= 216 src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; 217 src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; 218 src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; 219 src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; 220} 221 222static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, 223 const uint8_t *_topright, 224 ptrdiff_t _stride) 225{ 226 pixel *src = (pixel*)_src; 227 const pixel *topright = (const pixel*)_topright; 228 int stride = _stride>>(sizeof(pixel)-1); 229 LOAD_TOP_EDGE 230 LOAD_TOP_RIGHT_EDGE 231 232 src[0+0*stride]=(t0 + t1 + 1)>>1; 233 src[1+0*stride]= 234 src[0+2*stride]=(t1 + t2 + 1)>>1; 235 src[2+0*stride]= 236 src[1+2*stride]=(t2 + t3 + 1)>>1; 237 src[3+0*stride]= 238 src[2+2*stride]=(t3 + t4+ 1)>>1; 239 src[3+2*stride]=(t4 + t5+ 1)>>1; 240 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; 241 src[1+1*stride]= 242 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; 243 src[2+1*stride]= 244 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; 245 src[3+1*stride]= 246 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; 247 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; 248} 249 250static void 
FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, 251 ptrdiff_t _stride) 252{ 253 pixel *src = (pixel*)_src; 254 int stride = _stride>>(sizeof(pixel)-1); 255 LOAD_LEFT_EDGE 256 257 src[0+0*stride]=(l0 + l1 + 1)>>1; 258 src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; 259 src[2+0*stride]= 260 src[0+1*stride]=(l1 + l2 + 1)>>1; 261 src[3+0*stride]= 262 src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; 263 src[2+1*stride]= 264 src[0+2*stride]=(l2 + l3 + 1)>>1; 265 src[3+1*stride]= 266 src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; 267 src[3+2*stride]= 268 src[1+3*stride]= 269 src[0+3*stride]= 270 src[2+2*stride]= 271 src[2+3*stride]= 272 src[3+3*stride]=l3; 273} 274 275static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, 276 const uint8_t *topright, 277 ptrdiff_t _stride) 278{ 279 pixel *src = (pixel*)_src; 280 int stride = _stride>>(sizeof(pixel)-1); 281 const int lt= src[-1-1*stride]; 282 LOAD_TOP_EDGE 283 LOAD_LEFT_EDGE 284 285 src[0+0*stride]= 286 src[2+1*stride]=(lt + l0 + 1)>>1; 287 src[1+0*stride]= 288 src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; 289 src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; 290 src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; 291 src[0+1*stride]= 292 src[2+2*stride]=(l0 + l1 + 1)>>1; 293 src[1+1*stride]= 294 src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; 295 src[0+2*stride]= 296 src[2+3*stride]=(l1 + l2+ 1)>>1; 297 src[1+2*stride]= 298 src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; 299 src[0+3*stride]=(l2 + l3 + 1)>>1; 300 src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; 301} 302 303static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride) 304{ 305 int i; 306 pixel *src = (pixel*)_src; 307 int stride = _stride>>(sizeof(pixel)-1); 308 const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0); 309 const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1); 310 const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2); 311 const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3); 312 313 for(i=0; i<16; i++){ 314 AV_WN4PA(((pixel4*)(src+i*stride))+0, a); 315 
AV_WN4PA(((pixel4*)(src+i*stride))+1, b); 316 AV_WN4PA(((pixel4*)(src+i*stride))+2, c); 317 AV_WN4PA(((pixel4*)(src+i*stride))+3, d); 318 } 319} 320 321static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride) 322{ 323 int i; 324 pixel *src = (pixel*)_src; 325 stride >>= sizeof(pixel)-1; 326 327 for(i=0; i<16; i++){ 328 const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); 329 330 AV_WN4PA(((pixel4*)(src+i*stride))+0, a); 331 AV_WN4PA(((pixel4*)(src+i*stride))+1, a); 332 AV_WN4PA(((pixel4*)(src+i*stride))+2, a); 333 AV_WN4PA(((pixel4*)(src+i*stride))+3, a); 334 } 335} 336 337#define PREDICT_16x16_DC(v)\ 338 for(i=0; i<16; i++){\ 339 AV_WN4PA(src+ 0, v);\ 340 AV_WN4PA(src+ 4, v);\ 341 AV_WN4PA(src+ 8, v);\ 342 AV_WN4PA(src+12, v);\ 343 src += stride;\ 344 } 345 346static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride) 347{ 348 int i, dc=0; 349 pixel *src = (pixel*)_src; 350 pixel4 dcsplat; 351 stride >>= sizeof(pixel)-1; 352 353 for(i=0;i<16; i++){ 354 dc+= src[-1+i*stride]; 355 } 356 357 for(i=0;i<16; i++){ 358 dc+= src[i-stride]; 359 } 360 361 dcsplat = PIXEL_SPLAT_X4((dc+16)>>5); 362 PREDICT_16x16_DC(dcsplat); 363} 364 365static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride) 366{ 367 int i, dc=0; 368 pixel *src = (pixel*)_src; 369 pixel4 dcsplat; 370 stride >>= sizeof(pixel)-1; 371 372 for(i=0;i<16; i++){ 373 dc+= src[-1+i*stride]; 374 } 375 376 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); 377 PREDICT_16x16_DC(dcsplat); 378} 379 380static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride) 381{ 382 int i, dc=0; 383 pixel *src = (pixel*)_src; 384 pixel4 dcsplat; 385 stride >>= sizeof(pixel)-1; 386 387 for(i=0;i<16; i++){ 388 dc+= src[i-stride]; 389 } 390 391 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); 392 PREDICT_16x16_DC(dcsplat); 393} 394 395#define PRED16x16_X(n, v) \ 396static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\ 397{\ 398 int i;\ 399 pixel *src = (pixel*)_src;\ 400 stride >>= sizeof(pixel)-1;\ 401 
PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\ 402} 403 404PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0) 405#if BIT_DEPTH == 8 406PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1) 407PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1) 408#endif 409 410static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, 411 ptrdiff_t _stride, 412 const int svq3, 413 const int rv40) 414{ 415 int i, j, k; 416 int a; 417 INIT_CLIP 418 pixel *src = (pixel*)_src; 419 int stride = _stride>>(sizeof(pixel)-1); 420 const pixel * const src0 = src +7-stride; 421 const pixel * src1 = src +8*stride-1; 422 const pixel * src2 = src1-2*stride; // == src+6*stride-1; 423 int H = src0[1] - src0[-1]; 424 int V = src1[0] - src2[ 0]; 425 for(k=2; k<=8; ++k) { 426 src1 += stride; src2 -= stride; 427 H += k*(src0[k] - src0[-k]); 428 V += k*(src1[0] - src2[ 0]); 429 } 430 if(svq3){ 431 H = ( 5*(H/4) ) / 16; 432 V = ( 5*(V/4) ) / 16; 433 434 /* required for 100% accuracy */ 435 i = H; H = V; V = i; 436 }else if(rv40){ 437 H = ( H + (H>>2) ) >> 4; 438 V = ( V + (V>>2) ) >> 4; 439 }else{ 440 H = ( 5*H+32 ) >> 6; 441 V = ( 5*V+32 ) >> 6; 442 } 443 444 a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); 445 for(j=16; j>0; --j) { 446 int b = a; 447 a += V; 448 for(i=-16; i<0; i+=4) { 449 src[16+i] = CLIP((b ) >> 5); 450 src[17+i] = CLIP((b+ H) >> 5); 451 src[18+i] = CLIP((b+2*H) >> 5); 452 src[19+i] = CLIP((b+3*H) >> 5); 453 b += 4*H; 454 } 455 src += stride; 456 } 457} 458 459static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride) 460{ 461 FUNCC(pred16x16_plane_compat)(src, stride, 0, 0); 462} 463 464static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride) 465{ 466 int i; 467 pixel *src = (pixel*)_src; 468 int stride = _stride>>(sizeof(pixel)-1); 469 const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0); 470 const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1); 471 472 for(i=0; i<8; i++){ 473 AV_WN4PA(((pixel4*)(src+i*stride))+0, a); 474 AV_WN4PA(((pixel4*)(src+i*stride))+1, b); 475 } 476} 477 478static void 
FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride) 479{ 480 int i; 481 pixel *src = (pixel*)_src; 482 int stride = _stride>>(sizeof(pixel)-1); 483 const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0); 484 const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1); 485 486 for(i=0; i<16; i++){ 487 AV_WN4PA(((pixel4*)(src+i*stride))+0, a); 488 AV_WN4PA(((pixel4*)(src+i*stride))+1, b); 489 } 490} 491 492static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride) 493{ 494 int i; 495 pixel *src = (pixel*)_src; 496 stride >>= sizeof(pixel)-1; 497 498 for(i=0; i<8; i++){ 499 const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); 500 AV_WN4PA(((pixel4*)(src+i*stride))+0, a); 501 AV_WN4PA(((pixel4*)(src+i*stride))+1, a); 502 } 503} 504 505static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride) 506{ 507 int i; 508 pixel *src = (pixel*)_src; 509 stride >>= sizeof(pixel)-1; 510 for(i=0; i<16; i++){ 511 const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); 512 AV_WN4PA(((pixel4*)(src+i*stride))+0, a); 513 AV_WN4PA(((pixel4*)(src+i*stride))+1, a); 514 } 515} 516 517#define PRED8x8_X(n, v)\ 518static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\ 519{\ 520 int i;\ 521 const pixel4 a = PIXEL_SPLAT_X4(v);\ 522 pixel *src = (pixel*)_src;\ 523 stride >>= sizeof(pixel)-1;\ 524 for(i=0; i<8; i++){\ 525 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\ 526 AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\ 527 }\ 528} 529 530PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0) 531#if BIT_DEPTH == 8 532PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1) 533PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1) 534#endif 535 536static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride) 537{ 538 FUNCC(pred8x8_128_dc)(_src, stride); 539 FUNCC(pred8x8_128_dc)(_src+8*stride, stride); 540} 541 542static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride) 543{ 544 int i; 545 int dc0, dc2; 546 pixel4 dc0splat, dc2splat; 547 pixel *src = (pixel*)_src; 548 stride >>= sizeof(pixel)-1; 549 550 dc0=dc2=0; 551 
for(i=0;i<4; i++){ 552 dc0+= src[-1+i*stride]; 553 dc2+= src[-1+(i+4)*stride]; 554 } 555 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); 556 dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); 557 558 for(i=0; i<4; i++){ 559 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); 560 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat); 561 } 562 for(i=4; i<8; i++){ 563 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); 564 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat); 565 } 566} 567 568static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride) 569{ 570 FUNCC(pred8x8_left_dc)(_src, stride); 571 FUNCC(pred8x8_left_dc)(_src+8*stride, stride); 572} 573 574static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride) 575{ 576 int i; 577 int dc0, dc1; 578 pixel4 dc0splat, dc1splat; 579 pixel *src = (pixel*)_src; 580 stride >>= sizeof(pixel)-1; 581 582 dc0=dc1=0; 583 for(i=0;i<4; i++){ 584 dc0+= src[i-stride]; 585 dc1+= src[4+i-stride]; 586 } 587 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); 588 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); 589 590 for(i=0; i<4; i++){ 591 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); 592 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); 593 } 594 for(i=4; i<8; i++){ 595 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); 596 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); 597 } 598} 599 600static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride) 601{ 602 int i; 603 int dc0, dc1; 604 pixel4 dc0splat, dc1splat; 605 pixel *src = (pixel*)_src; 606 stride >>= sizeof(pixel)-1; 607 608 dc0=dc1=0; 609 for(i=0;i<4; i++){ 610 dc0+= src[i-stride]; 611 dc1+= src[4+i-stride]; 612 } 613 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); 614 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); 615 616 for(i=0; i<16; i++){ 617 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); 618 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); 619 } 620} 621 622static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride) 623{ 624 int i; 625 int dc0, dc1, dc2; 626 pixel4 dc0splat, dc1splat, dc2splat, 
dc3splat; 627 pixel *src = (pixel*)_src; 628 stride >>= sizeof(pixel)-1; 629 630 dc0=dc1=dc2=0; 631 for(i=0;i<4; i++){ 632 dc0+= src[-1+i*stride] + src[i-stride]; 633 dc1+= src[4+i-stride]; 634 dc2+= src[-1+(i+4)*stride]; 635 } 636 dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3); 637 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); 638 dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); 639 dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); 640 641 for(i=0; i<4; i++){ 642 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); 643 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); 644 } 645 for(i=4; i<8; i++){ 646 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); 647 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat); 648 } 649} 650 651static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride) 652{ 653 int i; 654 int dc0, dc1, dc2, dc3, dc4; 655 pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat; 656 pixel *src = (pixel*)_src; 657 stride >>= sizeof(pixel)-1; 658 659 dc0=dc1=dc2=dc3=dc4=0; 660 for(i=0;i<4; i++){ 661 dc0+= src[-1+i*stride] + src[i-stride]; 662 dc1+= src[4+i-stride]; 663 dc2+= src[-1+(i+4)*stride]; 664 dc3+= src[-1+(i+8)*stride]; 665 dc4+= src[-1+(i+12)*stride]; 666 } 667 dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3); 668 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); 669 dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); 670 dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); 671 dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2); 672 dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3); 673 dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2); 674 dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3); 675 676 for(i=0; i<4; i++){ 677 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); 678 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); 679 } 680 for(i=4; i<8; i++){ 681 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); 682 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat); 683 } 684 for(i=8; i<12; i++){ 685 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat); 686 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat); 687 } 688 
for(i=12; i<16; i++){ 689 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat); 690 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat); 691 } 692} 693 694//the following 4 function should not be optimized! 695static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride) 696{ 697 FUNCC(pred8x8_top_dc)(src, stride); 698 FUNCC(pred4x4_dc)(src, NULL, stride); 699} 700 701static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride) 702{ 703 FUNCC(pred8x16_top_dc)(src, stride); 704 FUNCC(pred4x4_dc)(src, NULL, stride); 705} 706 707static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride) 708{ 709 FUNCC(pred8x8_dc)(src, stride); 710 FUNCC(pred4x4_top_dc)(src, NULL, stride); 711} 712 713static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride) 714{ 715 FUNCC(pred8x16_dc)(src, stride); 716 FUNCC(pred4x4_top_dc)(src, NULL, stride); 717} 718 719static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride) 720{ 721 FUNCC(pred8x8_left_dc)(src, stride); 722 FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); 723 FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); 724} 725 726static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride) 727{ 728 FUNCC(pred8x16_left_dc)(src, stride); 729 FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); 730 FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); 731} 732 733static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride) 734{ 735 FUNCC(pred8x8_left_dc)(src, stride); 736 FUNCC(pred4x4_128_dc)(src , NULL, stride); 737 FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); 738} 739 740static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride) 741{ 742 FUNCC(pred8x16_left_dc)(src, stride); 743 FUNCC(pred4x4_128_dc)(src , NULL, stride); 744 FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); 745} 746 747static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride) 748{ 749 int j, k; 750 
int a; 751 INIT_CLIP 752 pixel *src = (pixel*)_src; 753 int stride = _stride>>(sizeof(pixel)-1); 754 const pixel * const src0 = src +3-stride; 755 const pixel * src1 = src +4*stride-1; 756 const pixel * src2 = src1-2*stride; // == src+2*stride-1; 757 int H = src0[1] - src0[-1]; 758 int V = src1[0] - src2[ 0]; 759 for(k=2; k<=4; ++k) { 760 src1 += stride; src2 -= stride; 761 H += k*(src0[k] - src0[-k]); 762 V += k*(src1[0] - src2[ 0]); 763 } 764 H = ( 17*H+16 ) >> 5; 765 V = ( 17*V+16 ) >> 5; 766 767 a = 16*(src1[0] + src2[8]+1) - 3*(V+H); 768 for(j=8; j>0; --j) { 769 int b = a; 770 a += V; 771 src[0] = CLIP((b ) >> 5); 772 src[1] = CLIP((b+ H) >> 5); 773 src[2] = CLIP((b+2*H) >> 5); 774 src[3] = CLIP((b+3*H) >> 5); 775 src[4] = CLIP((b+4*H) >> 5); 776 src[5] = CLIP((b+5*H) >> 5); 777 src[6] = CLIP((b+6*H) >> 5); 778 src[7] = CLIP((b+7*H) >> 5); 779 src += stride; 780 } 781} 782 783static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride) 784{ 785 int j, k; 786 int a; 787 INIT_CLIP 788 pixel *src = (pixel*)_src; 789 int stride = _stride>>(sizeof(pixel)-1); 790 const pixel * const src0 = src +3-stride; 791 const pixel * src1 = src +8*stride-1; 792 const pixel * src2 = src1-2*stride; // == src+6*stride-1; 793 int H = src0[1] - src0[-1]; 794 int V = src1[0] - src2[ 0]; 795 796 for (k = 2; k <= 4; ++k) { 797 src1 += stride; src2 -= stride; 798 H += k*(src0[k] - src0[-k]); 799 V += k*(src1[0] - src2[ 0]); 800 } 801 for (; k <= 8; ++k) { 802 src1 += stride; src2 -= stride; 803 V += k*(src1[0] - src2[0]); 804 } 805 806 H = (17*H+16) >> 5; 807 V = (5*V+32) >> 6; 808 809 a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H; 810 for(j=16; j>0; --j) { 811 int b = a; 812 a += V; 813 src[0] = CLIP((b ) >> 5); 814 src[1] = CLIP((b+ H) >> 5); 815 src[2] = CLIP((b+2*H) >> 5); 816 src[3] = CLIP((b+3*H) >> 5); 817 src[4] = CLIP((b+4*H) >> 5); 818 src[5] = CLIP((b+5*H) >> 5); 819 src[6] = CLIP((b+6*H) >> 5); 820 src[7] = CLIP((b+7*H) >> 5); 821 src += stride; 822 } 823} 824 825#define 
SRC(x,y) src[(x)+(y)*stride] 826#define PL(y) \ 827 const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; 828#define PREDICT_8x8_LOAD_LEFT \ 829 const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ 830 + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ 831 PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ 832 const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 833 834#define PT(x) \ 835 const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; 836#define PREDICT_8x8_LOAD_TOP \ 837 const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ 838 + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ 839 PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ 840 const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \ 841 + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 842 843#define PTR(x) \ 844 t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; 845#define PREDICT_8x8_LOAD_TOPRIGHT \ 846 int t8, t9, t10, t11, t12, t13, t14, t15; \ 847 if(has_topright) { \ 848 PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ 849 t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ 850 } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); 851 852#define PREDICT_8x8_LOAD_TOPLEFT \ 853 const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 854 855#define PREDICT_8x8_DC(v) \ 856 int y; \ 857 for( y = 0; y < 8; y++ ) { \ 858 AV_WN4PA(((pixel4*)src)+0, v); \ 859 AV_WN4PA(((pixel4*)src)+1, v); \ 860 src += stride; \ 861 } 862 863static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, 864 int has_topright, ptrdiff_t _stride) 865{ 866 pixel *src = (pixel*)_src; 867 int stride = _stride>>(sizeof(pixel)-1); 868 869 PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1))); 870} 871static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, 872 int has_topright, ptrdiff_t _stride) 873{ 874 pixel *src = (pixel*)_src; 875 int stride = _stride>>(sizeof(pixel)-1); 876 877 PREDICT_8x8_LOAD_LEFT; 878 const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3); 879 PREDICT_8x8_DC(dc); 880} 881static 
void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, 882 int has_topright, ptrdiff_t _stride) 883{ 884 pixel *src = (pixel*)_src; 885 int stride = _stride>>(sizeof(pixel)-1); 886 887 PREDICT_8x8_LOAD_TOP; 888 const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3); 889 PREDICT_8x8_DC(dc); 890} 891static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, 892 int has_topright, ptrdiff_t _stride) 893{ 894 pixel *src = (pixel*)_src; 895 int stride = _stride>>(sizeof(pixel)-1); 896 897 PREDICT_8x8_LOAD_LEFT; 898 PREDICT_8x8_LOAD_TOP; 899 const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7 900 +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4); 901 PREDICT_8x8_DC(dc); 902} 903static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, 904 int has_topright, ptrdiff_t _stride) 905{ 906 pixel *src = (pixel*)_src; 907 int stride = _stride>>(sizeof(pixel)-1); 908 pixel4 a; 909 910 PREDICT_8x8_LOAD_LEFT; 911#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \ 912 AV_WN4PA(src+y*stride, a); \ 913 AV_WN4PA(src+y*stride+4, a); 914 ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); 915#undef ROW 916} 917static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, 918 int has_topright, ptrdiff_t _stride) 919{ 920 int y; 921 pixel *src = (pixel*)_src; 922 int stride = _stride>>(sizeof(pixel)-1); 923 pixel4 a, b; 924 925 PREDICT_8x8_LOAD_TOP; 926 src[0] = t0; 927 src[1] = t1; 928 src[2] = t2; 929 src[3] = t3; 930 src[4] = t4; 931 src[5] = t5; 932 src[6] = t6; 933 src[7] = t7; 934 a = AV_RN4PA(((pixel4*)src)+0); 935 b = AV_RN4PA(((pixel4*)src)+1); 936 for( y = 1; y < 8; y++ ) { 937 AV_WN4PA(((pixel4*)(src+y*stride))+0, a); 938 AV_WN4PA(((pixel4*)(src+y*stride))+1, b); 939 } 940} 941static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, 942 int has_topright, ptrdiff_t _stride) 943{ 944 pixel *src = (pixel*)_src; 945 int stride = _stride>>(sizeof(pixel)-1); 946 PREDICT_8x8_LOAD_TOP; 947 PREDICT_8x8_LOAD_TOPRIGHT; 948 SRC(0,0)= (t0 + 2*t1 
+ t2 + 2) >> 2; 949 SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; 950 SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; 951 SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; 952 SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; 953 SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; 954 SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; 955 SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; 956 SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; 957 SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; 958 SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; 959 SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; 960 SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; 961 SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; 962 SRC(7,7)= (t14 + 3*t15 + 2) >> 2; 963} 964static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, 965 int has_topright, ptrdiff_t _stride) 966{ 967 pixel *src = (pixel*)_src; 968 int stride = _stride>>(sizeof(pixel)-1); 969 PREDICT_8x8_LOAD_TOP; 970 PREDICT_8x8_LOAD_LEFT; 971 PREDICT_8x8_LOAD_TOPLEFT; 972 SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; 973 SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; 974 SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; 975 SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; 976 SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; 977 SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; 978 SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; 979 SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; 980 SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 
2; 981 SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; 982 SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; 983 SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; 984 SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; 985 SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; 986 SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; 987} 988static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, 989 int has_topright, ptrdiff_t _stride) 990{ 991 pixel *src = (pixel*)_src; 992 int stride = _stride>>(sizeof(pixel)-1); 993 PREDICT_8x8_LOAD_TOP; 994 PREDICT_8x8_LOAD_LEFT; 995 PREDICT_8x8_LOAD_TOPLEFT; 996 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; 997 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; 998 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; 999 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; 1000 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; 1001 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; 1002 SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; 1003 SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; 1004 SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; 1005 SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; 1006 SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; 1007 SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; 1008 SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; 1009 SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; 1010 SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; 1011 SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; 1012 SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; 1013 SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; 1014 SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; 1015 SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; 1016 SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; 1017 SRC(7,0)= (t6 + t7 + 1) >> 1; 1018} 1019static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, 
1020 int has_topright, ptrdiff_t _stride) 1021{ 1022 pixel *src = (pixel*)_src; 1023 int stride = _stride>>(sizeof(pixel)-1); 1024 PREDICT_8x8_LOAD_TOP; 1025 PREDICT_8x8_LOAD_LEFT; 1026 PREDICT_8x8_LOAD_TOPLEFT; 1027 SRC(0,7)= (l6 + l7 + 1) >> 1; 1028 SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; 1029 SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; 1030 SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; 1031 SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; 1032 SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; 1033 SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; 1034 SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; 1035 SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; 1036 SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; 1037 SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; 1038 SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; 1039 SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; 1040 SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; 1041 SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; 1042 SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; 1043 SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; 1044 SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; 1045 SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; 1046 SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; 1047 SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; 1048 SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; 1049} 1050static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, 1051 int has_topright, ptrdiff_t _stride) 1052{ 1053 pixel *src = (pixel*)_src; 1054 int stride = _stride>>(sizeof(pixel)-1); 1055 PREDICT_8x8_LOAD_TOP; 1056 PREDICT_8x8_LOAD_TOPRIGHT; 1057 SRC(0,0)= (t0 + t1 + 1) >> 1; 1058 SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; 1059 SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; 1060 SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; 1061 SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; 1062 SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + 
t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}

/**
 * 8x8 luma intra prediction, horizontal-up mode: interpolates down the left
 * edge only (l0..l7) with rounded 2-tap and 3-tap filters; positions beyond
 * the last left sample are filled with l7 (tail on the next source line).
 */
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2)
>> 2;
    /* everything below/right of the last interpolated diagonal is clamped
     * to the bottom-left sample l7 */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}

/**
 * Vertical prediction plus residual add for an 8x8 luma block.
 * Column i starts from the top-edge sample t_i (as produced by
 * PREDICT_8x8_LOAD_TOP) and accumulates block[row*8 + i] down the column
 * (block is stepped by ++ per column, rows are 8 coefficients apart).
 * The coefficient block is zeroed afterwards.
 */
static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
                                                int has_topright, ptrdiff_t _stride)
{
    int i;
    pixel *src = (pixel*)_src;
    const dctcoef *block = (const dctcoef*)_block;
    pixel pix[8];
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    PREDICT_8x8_LOAD_TOP;

    pix[0] = t0;
    pix[1] = t1;
    pix[2] = t2;
    pix[3] = t3;
    pix[4] = t4;
    pix[5] = t5;
    pix[6] = t6;
    pix[7] = t7;

    for(i=0; i<8; i++){
        pixel v = pix[i];
        src[0*stride]= v += block[0];
        src[1*stride]= v += block[8];
        src[2*stride]= v += block[16];
        src[3*stride]= v += block[24];
        src[4*stride]= v += block[32];
        src[5*stride]= v += block[40];
        src[6*stride]= v += block[48];
        src[7*stride]= v + block[56]; /* last row: no further accumulation needed */
        src++;
        block++;
    }

    memset(_block, 0, sizeof(dctcoef) * 64); /* coefficients consumed */
}

/**
 * Horizontal prediction plus residual add for an 8x8 luma block.
 * Row i starts from the left-edge sample l_i and accumulates
 * block[i*8 + c] across the row; the coefficient block is zeroed
 * afterwards (memset completes on the next source line).
 */
static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
                                                  int has_topright, ptrdiff_t _stride)
{
    int i;
    pixel *src = (pixel*)_src;
    const dctcoef *block = (const dctcoef*)_block;
    pixel pix[8];
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    PREDICT_8x8_LOAD_LEFT;

    pix[0] = l0;
    pix[1] = l1;
    pix[2] = l2;
    pix[3] = l3;
    pix[4] = l4;
    pix[5] = l5;
    pix[6] = l6;
    pix[7] = l7;

    for(i=0; i<8; i++){
        pixel v = pix[i];
        src[0]= v += block[0];
        src[1]= v += block[1];
        src[2]= v += block[2];
        src[3]= v += block[3];
        src[4]= v += block[4];
        src[5]= v += block[5];
        src[6]= v += block[6];
        src[7]= v + block[7]; /* last column: no further accumulation needed */
        src+= stride;
        block+= 8;
    }

    memset(_block, 0, sizeof(dctcoef) *
64);
}

/* the 8x8 helper macros are template-local; drop them before the 4x4 code */
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_DC
#undef PTR
#undef PT
#undef PL
#undef SRC

/**
 * Vertical prediction plus residual add for a 4x4 block: column i starts
 * from the pixel one row above the block and accumulates block[row*4 + i]
 * down the column; the coefficient block is zeroed afterwards.
 */
static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
                                        ptrdiff_t stride)
{
    int i;
    pixel *pix = (pixel*)_pix;
    const dctcoef *block = (const dctcoef*)_block;
    stride >>= sizeof(pixel)-1; /* byte stride -> pixel stride */
    pix -= stride; /* step back onto the row of top-edge predictors */
    for(i=0; i<4; i++){
        pixel v = pix[0];
        pix[1*stride]= v += block[0];
        pix[2*stride]= v += block[4];
        pix[3*stride]= v += block[8];
        pix[4*stride]= v + block[12]; /* last row: no further accumulation */
        pix++;
        block++;
    }

    memset(_block, 0, sizeof(dctcoef) * 16); /* coefficients consumed */
}

/**
 * Horizontal prediction plus residual add for a 4x4 block: row i starts
 * from the pixel left of the block (pix[-1]) and accumulates block[i*4 + c]
 * across the row; the coefficient block is zeroed afterwards.
 */
static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
                                          ptrdiff_t stride)
{
    int i;
    pixel *pix = (pixel*)_pix;
    const dctcoef *block = (const dctcoef*)_block;
    stride >>= sizeof(pixel)-1; /* byte stride -> pixel stride */
    for(i=0; i<4; i++){
        pixel v = pix[-1];
        pix[0]= v += block[0];
        pix[1]= v += block[1];
        pix[2]= v += block[2];
        pix[3]= v + block[3]; /* last column: no further accumulation */
        pix+= stride;
        block+= 4;
    }

    memset(_block, 0, sizeof(dctcoef) * 16); /* coefficients consumed */
}

/**
 * Vertical prediction plus residual add for an 8x8 block; same scheme as
 * the 4x4 variant but with 8 rows spaced 8 coefficients apart.
 */
static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
                                         ptrdiff_t stride)
{
    int i;
    pixel *pix = (pixel*)_pix;
    const dctcoef *block = (const dctcoef*)_block;
    stride >>= sizeof(pixel)-1; /* byte stride -> pixel stride */
    pix -= stride; /* step back onto the row of top-edge predictors */
    for(i=0; i<8; i++){
        pixel v = pix[0];
        pix[1*stride]= v += block[0];
        pix[2*stride]= v += block[8];
        pix[3*stride]= v += block[16];
        pix[4*stride]= v += block[24];
        pix[5*stride]= v += block[32];
        pix[6*stride]= v += block[40];
        pix[7*stride]= v += block[48];
        pix[8*stride]= v + block[56]; /* last row: no further accumulation */
        pix++;
        block++;
    }

    memset(_block, 0, sizeof(dctcoef) * 64); /* coefficients consumed */
}

/**
 * Horizontal prediction plus residual add for an 8x8 block
 * (declaration continues on the next source line).
 */
static void
FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride)
{
    int i;
    pixel *pix = (pixel*)_pix;
    const dctcoef *block = (const dctcoef*)_block;
    stride >>= sizeof(pixel)-1; /* byte stride -> pixel stride */
    for(i=0; i<8; i++){
        pixel v = pix[-1]; /* row prediction seeded from the left neighbour */
        pix[0]= v += block[0];
        pix[1]= v += block[1];
        pix[2]= v += block[2];
        pix[3]= v += block[3];
        pix[4]= v += block[4];
        pix[5]= v += block[5];
        pix[6]= v += block[6];
        pix[7]= v + block[7]; /* last column: no further accumulation */
        pix+= stride;
        block+= 8;
    }

    memset(_block, 0, sizeof(dctcoef) * 64); /* coefficients consumed */
}

/**
 * 16x16 vertical prediction + residual add: applies the 4x4 variant to each
 * of the 16 sub-blocks at the caller-supplied block_offset positions; the
 * i*16*sizeof(pixel) step matches the int16_t/int32_t coefficient layout.
 */
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
                                          int16_t *block,
                                          ptrdiff_t stride)
{
    int i;
    for(i=0; i<16; i++)
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
}

/** 16x16 horizontal prediction + residual add over 16 4x4 sub-blocks. */
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
                                            const int *block_offset,
                                            int16_t *block,
                                            ptrdiff_t stride)
{
    int i;
    for(i=0; i<16; i++)
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
}

/** 8x8 chroma vertical prediction + residual add over 4 4x4 sub-blocks. */
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
                                        int16_t *block, ptrdiff_t stride)
{
    int i;
    for(i=0; i<4; i++)
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
}

/**
 * 8x16 vertical prediction + residual add over 8 4x4 sub-blocks.
 * NOTE(review): the second loop indexes block_offset[i+4] (entries 8..11)
 * while the coefficient pointer keeps running with i — presumably the 8x16
 * offset table stores the lower half at indices 8..11; confirm against the
 * caller's block_offset layout.
 */
static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
                                         int16_t *block, ptrdiff_t stride)
{
    int i;
    for(i=0; i<4; i++)
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
    for(i=4; i<8; i++)
        FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
}

/**
 * 8x8 chroma horizontal prediction + residual add over 4 4x4 sub-blocks
 * (loop body continues on the next source line).
 */
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
                                          int16_t *block,
                                          ptrdiff_t stride)
{
    int i;
    for(i=0; i<4; i++)
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
}

/**
 * 8x16 horizontal prediction + residual add over 8 4x4 sub-blocks.
 * NOTE(review): the second loop indexes block_offset[i+4] (entries 8..11)
 * while the coefficient pointer keeps running with i — mirrors the
 * pred8x16_vertical_add layout; confirm against the caller's offset table.
 */
static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
                                           const int *block_offset,
                                           int16_t *block, ptrdiff_t stride)
{
    int i;
    for(i=0; i<4; i++)
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
    for(i=4; i<8; i++)
        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
}