/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/hevcdec.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "hevcpred_mips.h"

static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};

#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,         \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
                              res0, res1, mul_val_b0, mul_val_b1, round)      \
{                                                                             \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                     \
                                                                              \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                   \
                                                                              \
    res0_m += mul_val_h1 * tmp0;                                              \
    res1_m += mul_val_h3 * tmp0;                                              \
    res2_m += mul_val_h1 * tmp0;                                              \
    res3_m += mul_val_h3 * tmp0;                                              \
                                                                              \
    res0_m += mul_val_b0 * src0_r;                                            \
    res1_m += mul_val_b0 * src0_l;                                            \
    res2_m += (mul_val_b0 - 1) * src0_r;                                      \
    res3_m += (mul_val_b0 - 1) * src0_l;                                      \
                                                                              \
    res0_m += mul_val_b1 * tmp1;                                              \
    res1_m += mul_val_b1 * tmp1;                                              \
    res2_m += (mul_val_b1 + 1) * tmp1;                                        \
    res3_m += (mul_val_b1 + 1) * tmp1;                                        \
                                                                              \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                       \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                  \
}

static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint32_t col;
    uint32_t src_data;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data = LW(src_top);
    SW4(src_data, src_data, src_data, src_data, dst, stride);

    if (0 == flag) {
        src_data = LW(src_left);

        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        CLIP_SH_0_255(vec2);

        for (col = 0; col < 4; col++) {
            dst[stride * col] = (uint8_t) vec2[col];
        }
    }
}
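
/* Each hevc_intra_pred_vert_* variant copies the top reference row down the
 * whole block.  For luma (flag == 0) HEVC additionally smooths the first
 * column towards the left neighbours; a scalar sketch of the filter that the
 * vector code vectorises (shown for the 8x8 case below):
 *
 *     for (y = 0; y < 8; y++)
 *         dst[y * stride] =
 *             av_clip_uint8(src_top[0] + ((src_left[y] - src_left[-1]) >> 1));
 */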

static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint16_t val0, val1, val2, val3;
    uint64_t src_data1;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data1 = LD(src_top);

    for (row = 8; row--;) {
        SD(src_data1, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src_data1 = LD(src_left);

        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        CLIP_SH_0_255(vec2);

        val0 = vec2[0];
        val1 = vec2[1];
        val2 = vec2[2];
        val3 = vec2[3];

        dst[0] = val0;
        dst[stride] = val1;
        dst[2 * stride] = val2;
        dst[3 * stride] = val3;

        val0 = vec2[4];
        val1 = vec2[5];
        val2 = vec2[6];
        val3 = vec2[7];

        dst[4 * stride] = val0;
        dst[5 * stride] = val1;
        dst[6 * stride] = val2;
        dst[7 * stride] = val3;
    }
}

static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
                                           const uint8_t *src_left,
                                           uint8_t *dst, int32_t stride,
                                           int32_t flag)
{
    int32_t col;
    uint8_t *tmp_dst = dst;
    uint32_t row;
    v16u8 src;
    v8i16 vec0, vec1, vec2, vec3;

    src = LD_UB(src_top);

    for (row = 16; row--;) {
        ST_UB(src, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        UNPCK_UB_SH(src, vec2, vec3);
        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        vec2 >>= 1;
        vec3 >>= 1;

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
        CLIP_SH2_0_255(vec2, vec3);

        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
        }
    }
}

static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint32_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x01010101;
    val1 = src_left[1] * 0x01010101;
    val2 = src_left[2] * 0x01010101;
    val3 = src_left[3] * 0x01010101;
    SW4(val0, val1, val2, val3, dst, stride);

    if (0 == flag) {
        val0 = LW(src_top);
        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_w((v4i32) src0, 0);
        SW(val0, dst);
    }
}
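
/* The hevc_intra_pred_horiz_* variants replicate each left neighbour across
 * its row.  As in the 4x4 variant above, the luma path (flag == 0) smooths
 * the first row towards the top neighbours; the scalar equivalent is:
 *
 *     dst[x] = av_clip_uint8(src_left[0] + ((src_top[x] - src_top[-1]) >> 1));
 */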

static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint64_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x0101010101010101;
    val1 = src_left[1] * 0x0101010101010101;
    val2 = src_left[2] * 0x0101010101010101;
    val3 = src_left[3] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst, stride);

    val0 = src_left[4] * 0x0101010101010101;
    val1 = src_left[5] * 0x0101010101010101;
    val2 = src_left[6] * 0x0101010101010101;
    val3 = src_left[7] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);

    if (0 == flag) {
        val0 = LD(src_top);
        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_d((v2i64) src0, 0);
        SD(val0, dst);
    }
}

static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride,
                                            int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;
    v8i16 src0_r, src0_l, src_left_val, src_top_val;

    src_left_val = __msa_fill_h(src_left[0]);

    for (row = 4; row--;) {
        inp0 = src_left[0];
        inp1 = src_left[1];
        inp2 = src_left[2];
        inp3 = src_left[3];
        src_left += 4;

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
        tmp_dst += (4 * stride);
    }

    if (0 == flag) {
        src0 = LD_SB(src_top);
        src_top_val = __msa_fill_h(src_top[-1]);

        UNPCK_UB_SH(src0, src0_r, src0_l);
        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);

        src0_r >>= 1;
        src0_l >>= 1;

        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
        CLIP_SH2_0_255(src0_r, src0_l);
        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
        ST_SB(src0, dst);
    }
}

static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;

    for (row = 0; row < 8; row++) {
        inp0 = src_left[row * 4];
        inp1 = src_left[row * 4 + 1];
        inp2 = src_left[row * 4 + 2];
        inp3 = src_left[row * 4 + 3];

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB2(src0, src0, dst, 16);
        dst += stride;
        ST_SB2(src1, src1, dst, 16);
        dst += stride;
        ST_SB2(src2, src2, dst, 16);
        dst += stride;
        ST_SB2(src3, src3, dst, 16);
        dst += stride;
    }
}
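
/* DC prediction fills the block with the rounded average of the top and left
 * neighbours,
 *
 *     dc = (sum(top[0..N-1]) + sum(left[0..N-1]) + N) >> (log2(N) + 1),
 *
 * evaluated below with hadd_u reductions and a rounding shift (3 for N = 4).
 * For luma (flag == 0) the first row and column are then blended with their
 * neighbours: corner (left[0] + 2 * dc + top[0] + 2) >> 2, other edge
 * samples (neighbour + 3 * dc + 2) >> 2.
 */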

static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t addition = 0;
    uint32_t val0, val1, val2;
    v16i8 src = { 0 };
    v16u8 store;
    v16i8 zero = { 0 };
    v8u16 sum, vec0, vec1;

    val0 = LW(src_top);
    val1 = LW(src_left);
    INSERT_W2_SB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_w((v4i32) store, 0);
    SW4(val0, val0, val0, val0, dst, stride);

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
        val0 = __msa_copy_u_w((v4i32) store, 0);
        SW(val0, tmp_dst);

        val0 = src_left[1];
        val1 = src_left[2];
        val2 = src_left[3];

        addition *= 3;

        ADD2(val0, addition, val1, addition, val0, val1);
        val2 += addition;

        val0 += 2;
        val1 += 2;
        val2 += 2;
        val0 >>= 2;
        val1 >>= 2;
        val2 >>= 2;

        tmp_dst[stride * 1] = val0;
        tmp_dst[stride * 2] = val1;
        tmp_dst[stride * 3] = val2;
    }
}

static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    uint64_t val0, val1;
    v16u8 src = { 0 };
    v16u8 store;
    v8u16 sum, vec0, vec1;
    v16i8 zero = { 0 };

    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    for (row = 8; row--;) {
        SD(val0, dst);
        dst += stride;
    }

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);
        SD(val0, tmp_dst);

        val0 = LD(src_left);
        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
        }
    }
}
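
/* The 16x16 and 32x32 DC variants follow the same scheme with 16 and 32
 * samples per edge (rounding shifts 5 and 6).  The 32x32 version takes no
 * flag: HEVC applies the DC boundary filter only to luma blocks smaller
 * than 32x32, so no filtering path is needed there.
 */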

static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    for (row = 16; row--;) {
        ST_UB(store, dst);
        dst += stride;
    }

    if (0 == flag) {
        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        vec0 += vec0;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
        }
    }
}

static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride)
{
    uint32_t row;
    v16u8 src_above1, src_above2, store, src_left1, src_left2;
    v8u16 sum_above1, sum_above2;
    v8u16 sum_left1, sum_left2;
    v8u16 sum, sum_above, sum_left;

    LD_UB2(src_top, 16, src_above1, src_above2);
    LD_UB2(src_left, 16, src_left1, src_left2);
    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
    sum_above = sum_above1 + sum_above2;
    sum_left = sum_left1 + sum_left2;
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
    store = (v16u8) __msa_splati_b((v16i8) sum, 0);

    for (row = 16; row--;) {
        ST_UB2(store, store, dst, 16);
        dst += stride;
        ST_UB2(store, store, dst, 16);
        dst += stride;
    }
}
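
/* Planar prediction is a bilinear blend of the four neighbour arrays; for an
 * NxN block (N = 4 here, rounding shift log2(N) + 1 = 3):
 *
 *     dst[x][y] = ((N - 1 - x) * left[y] + (x + 1) * top[N] +
 *                  (N - 1 - y) * top[x] + (y + 1) * left[N] + N)
 *                 >> (log2(N) + 1)
 *
 * mul_val0/mul_val1 below hold the per-column weights (N - 1 - x) and
 * (x + 1), so one vector multiply evaluates a whole row.
 */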

static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint32_t src0, src1;
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
    v16i8 zero = { 0 };

    src0 = LW(src_top);
    src1 = LW(src_left);

    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    tmp0 = __msa_fill_h(src_top[4]);
    tmp1 = __msa_fill_h(src_left[4]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;
    res2 += src_vec0_r;
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;

    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
    SRARI_H2_SH(res0, res1, 3);
    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint64_t src0, src1;
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
    v16i8 zero = { 0 };

    src0 = LD(src_top);
    src1 = LD(src_left);

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    tmp0 = __msa_fill_h(src_top[8]);
    tmp1 = __msa_fill_h(src_left[8]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    tmp2 = mul_val1 * tmp0;
    res0 += tmp2;
    res1 += tmp2;
    res2 += tmp2;
    res3 += tmp2;
    res4 += tmp2;
    res5 += tmp2;
    res6 += tmp2;
    res7 += tmp2;

    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;
    res6 += src_vec0_r;

    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;
    res4 += 5 * tmp1;
    res5 += 6 * tmp1;
    res6 += 7 * tmp1;
    res7 += 8 * tmp1;

    SRARI_H4_SH(res0, res1, res2, res3, 4);
    SRARI_H4_SH(res4, res5, res6, res7, 4);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
          0, 1, 0, 1, dst, stride);
}
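
/* For 16x16 the same planar formula is evaluated two rows at a time by the
 * HEVC_PRED_PLANAR_16x2 macro: mul_val_b0/mul_val_b1 are the row weights
 * (N - 1 - y) and (y + 1) of the even row, and the macro derives the odd
 * row's weights from them (mul_val_b0 - 1, mul_val_b1 + 1).
 */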

static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    v16u8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1;
    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    tmp0 = __msa_fill_h(src_top[16]);
    tmp1 = __msa_fill_h(src_left[16]);

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
}

static void process_intra_upper_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;
    v8i16 tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[32]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
}
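
/* 32x32 planar is decomposed into four 16x16 quadrants:
 * process_intra_upper/process_intra_lower handle the top and bottom row of
 * quadrants, and `offset` (0 or 16) shifts the column weights so the same
 * code serves the left and right halves.
 */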

static void process_intra_lower_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
    dst += (16 * stride);
    src_left += 16;

    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_lower_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
}
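
/* Angular prediction (modes 2..34).  For the "upper" family (modes 18..34)
 * each row y projects onto the top reference at a fractional position:
 *
 *     pos  = (y + 1) * angle;
 *     idx  = pos >> 5;
 *     fact = pos & 31;
 *     dst[x] = ((32 - fact) * ref[x + idx + 1] +
 *               fact * ref[x + idx + 2] + 16) >> 5;
 *
 * For negative angles the reference is first extended to the left with
 * samples projected from the left neighbours via the inverse angle (the
 * (h_cnt * inv_angle_val + 128) >> 8 loops below).
 */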

static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, offset;
    uint64_t tmp0;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0;
    v16i8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 3;
    angle_loop = angle;

    ref = src_top - 1;
    if (angle < 0 && last < -1) {
        inv_angle_val = inv_angle[mode - 18];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_left[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_SB(ref + idx0 + 1);
    top1 = LD_SB(ref + idx1 + 1);
    top2 = LD_SB(ref + idx2 + 1);
    top3 = LD_SB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);

    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);

    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);

    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}
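
/* The wider variants amortise the idx/fact computation over four rows per
 * loop iteration.  The ref[x + idx + 2] tap is not reloaded: after the bytes
 * are unpacked to halfwords, SLDI_B4_SH with a 2-byte shift slides each
 * vector by one element, which is exactly the "+1 sample" of the two-tap
 * interpolation.
 */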

static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last, offset;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t tmp0, tmp1, tmp2;
    v16i8 top0, top1, top2, top3;
    v16u8 dst_val0, dst_val1;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);

        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t tmp0;
    int32_t angle, angle_loop, offset;
    int32_t inv_angle_val, inv_angle_val_loop;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle >> 1;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        top0 = LD_UB(ref);
        tmp0 = LW(ref + 16);
        ST_UB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 4; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        LD_UB2(ref + idx0 + 1, 16, top0, top1);
        LD_UB2(ref + idx1 + 1, 16, top2, top3);
        LD_UB2(ref + idx2 + 1, 16, top4, top5);
        LD_UB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t tmp0, tmp1, tmp2, tmp3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t last, offset;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    ref_tmp = ref_array + 32;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;
        LD_UB2(ref, 16, top0, top1);
        tmp0 = ref[32];
        tmp1 = ref[33];
        tmp2 = ref[34];
        tmp3 = ref[35];

        ST_UB2(top0, top1, ref_tmp, 16);
        ref_tmp[32] = tmp0;
        ref_tmp[33] = tmp1;
        ref_tmp[34] = tmp2;
        ref_tmp[35] = tmp3;

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }

        ref = ref_tmp;
    }

    for (v_cnt = 16; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_UB(ref + idx0 + 1);
        top4 = LD_UB(ref + idx1 + 1);
        top1 = LD_UB(ref + idx0 + 17);
        top5 = LD_UB(ref + idx1 + 17);
        top3 = LD_UB(ref + idx0 + 33);
        top7 = LD_UB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);

        ST_SB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_SB2(dst2, dst3, dst, 16);
        dst += stride;
    }
}
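
/* The "lower" family (modes 2..17) predicts from the left reference instead:
 * the same idx/fact interpolation runs along columns, so each function
 * computes the block transposed and swaps rows and columns with
 * pack/interleave operations before the strided stores.
 */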

static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last, offset;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    uint64_t tmp0;
    v16i8 dst_val0, dst_val1;
    v16u8 top0, top1, top2, top3;
    v16u8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = angle >> 3;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_top[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_UB(ref + idx0 + 1);
    top1 = LD_UB(ref + idx1 + 1);
    top2 = LD_UB(ref + idx2 + 1);
    top3 = LD_UB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);
    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);
    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);
    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);

    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);

    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);

    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);

    ST_W2(dst_val0, 0, 1, dst, stride);
    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}

static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last, offset, tmp0, tmp1, tmp2;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 1;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        top0 = LD_SB(ref);
        tmp0 = LW(ref + 16);
        ST_SB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        LD_SB2(ref + idx0 + 1, 16, top0, top1);
        LD_SB2(ref + idx1 + 1, 16, top2, top3);
        LD_SB2(ref + idx2 + 1, 16, top4, top5);
        LD_SB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
        ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst_org += (8 * stride);
        ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 32;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = angle;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        LD_SB2(ref, 16, top0, top1);
        tmp0 = LW(ref + 32);
        ST_SB2(top0, top1, ref_tmp, 16);
        SW(tmp0, ref_tmp + 32);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
        dst_org = dst;
        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top4 = LD_SB(ref + idx1 + 1);
        top1 = LD_SB(ref + idx0 + 17);
        top5 = LD_SB(ref + idx1 + 17);
        top3 = LD_SB(ref + idx0 + 33);
        top7 = LD_SB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);

        ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);

        dst += 2;
    }
}

static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    uint32_t row;
    v16u8 src1, src2;

    src1 = LD_UB(src);
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
        dst += dst_stride;
    }
}

void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
                               const uint8_t *src_left,
                               ptrdiff_t stride, int log2, int c_idx)
{
    switch (log2) {
    case 2:
        hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
        break;

    case 3:
        hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
        break;

    case 4:
        hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
        break;

    case 5:
        hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
        break;
    }
}
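
/* Mode dispatch for the ff_pred_intra_pred_angular_{0,1,2,3} entry points
 * (4x4 .. 32x32): mode 10 is pure horizontal, mode 26 pure vertical, and the
 * remaining angular modes split into the "upper" (>= 18, predicted from the
 * top reference) and "lower" (< 18, predicted from the left reference)
 * code paths above.
 */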

void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode == 26) {
        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
                                                 dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
                                                 dst, stride, mode);
    }
}

void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode == 26) {
        hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
                                                 dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
                                                 dst, stride, mode);
    }
}

void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode == 26) {
        hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
                                                  dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
                                                  dst, stride, mode);
    }
}

void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    if (mode == 10) {
        hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
    } else if (mode == 26) {
        intra_predict_vert_32x32_msa(src_top, dst, stride);
    } else if (mode >= 18) {
        hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
                                                  dst, stride, mode);
    } else {
        hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
                                                  dst, stride, mode);
    }
}
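
/* MSA specialisation of the generic HEVC intra_pred() for 8-bit 16x16
 * blocks: it gathers the top/left neighbour arrays with the usual
 * availability checks, substitutes unavailable samples (and, under
 * constrained intra prediction, samples from non-intra PUs), then
 * dispatches to one of the predictors above.
 */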
void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
{
    v16u8 vec0;
    HEVCLocalContext *lc = s->HEVClc;
    int i;
    int hshift = s->ps.sps->hshift[c_idx];
    int vshift = s->ps.sps->vshift[c_idx];
    int size_in_luma_h = 16 << hshift;
    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
    int size_in_luma_v = 16 << vshift;
    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
    int x = x0 >> hshift;
    int y = y0 >> vshift;
    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;

    int cur_tb_addr =
        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];

    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;

    int min_pu_width = s->ps.sps->min_pu_width;

    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
                              lc->tu.intra_pred_mode;
    uint32_t a;
    uint8_t left_array[2 * 32 + 1];
    uint8_t filtered_left_array[2 * 32 + 1];
    uint8_t top_array[2 * 32 + 1];
    uint8_t filtered_top_array[2 * 32 + 1];

    uint8_t *left = left_array + 1;
    uint8_t *top = top_array + 1;
    uint8_t *filtered_left = filtered_left_array + 1;
    uint8_t *filtered_top = filtered_top_array + 1;

    /* Extended neighbours are usable only if the corresponding transform
     * block precedes the current one in z-scan decoding order. */
    int cand_bottom_left = lc->na.cand_bottom_left
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
                                  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
    int cand_left = lc->na.cand_left;
    int cand_up_left = lc->na.cand_up_left;
    int cand_up = lc->na.cand_up;
    int cand_up_right = lc->na.cand_up_right
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
                                  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];

    /* Clip the bottom-left/top-right reference runs to the picture. */
    int bottom_left_size =
        (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
         (y0 + size_in_luma_v)) >> vshift;
    int top_right_size =
        (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
         (x0 + size_in_luma_h)) >> hshift;
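
    /*
     * Constrained intra prediction: neighbours coded in inter mode must not
     * be used as references.  Each available edge is therefore re-checked
     * against the PU prediction flags and stays available only if at least
     * one of its minimum-size PUs along the shared edge is PF_INTRA.
     */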
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        int size_in_luma_pu_v = size_in_luma_v >> s->ps.sps->log2_min_pu_size;
        int size_in_luma_pu_h = size_in_luma_h >> s->ps.sps->log2_min_pu_size;
        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        if (!size_in_luma_pu_h)
            size_in_luma_pu_h++;
        if (cand_bottom_left == 1 && on_pu_edge_x) {
            int x_left_pu = (x0 - 1) >> s->ps.sps->log2_min_pu_size;
            int y_bottom_pu =
                (y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_v,
                            s->ps.sps->min_pu_height - y_bottom_pu);
            cand_bottom_left = 0;
            for (i = 0; i < max; i += 2)
                cand_bottom_left |=
                    (s->ref->tab_mvf[x_left_pu +
                                     (y_bottom_pu + i) * min_pu_width].pred_flag ==
                     PF_INTRA);
        }
        if (cand_left == 1 && on_pu_edge_x) {
            int x_left_pu = (x0 - 1) >> s->ps.sps->log2_min_pu_size;
            int y_left_pu = y0 >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_v,
                            s->ps.sps->min_pu_height - y_left_pu);
            cand_left = 0;
            for (i = 0; i < max; i += 2)
                cand_left |=
                    (s->ref->tab_mvf[x_left_pu +
                                     (y_left_pu + i) * min_pu_width].pred_flag ==
                     PF_INTRA);
        }
        if (cand_up_left == 1) {
            int x_left_pu = (x0 - 1) >> s->ps.sps->log2_min_pu_size;
            int y_top_pu = (y0 - 1) >> s->ps.sps->log2_min_pu_size;
            cand_up_left =
                s->ref->tab_mvf[x_left_pu +
                                y_top_pu * min_pu_width].pred_flag == PF_INTRA;
        }
        if (cand_up == 1 && on_pu_edge_y) {
            int x_top_pu = x0 >> s->ps.sps->log2_min_pu_size;
            int y_top_pu = (y0 - 1) >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_h,
                            s->ps.sps->min_pu_width - x_top_pu);
            cand_up = 0;
            for (i = 0; i < max; i += 2)
                cand_up |=
                    (s->ref->tab_mvf[(x_top_pu + i) +
                                     y_top_pu * min_pu_width].pred_flag ==
                     PF_INTRA);
        }
        if (cand_up_right == 1 && on_pu_edge_y) {
            int y_top_pu = (y0 - 1) >> s->ps.sps->log2_min_pu_size;
            int x_right_pu =
                (x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_h,
                            s->ps.sps->min_pu_width - x_right_pu);
            cand_up_right = 0;
            for (i = 0; i < max; i += 2)
                cand_up_right |=
                    (s->ref->tab_mvf[(x_right_pu + i) +
                                     y_top_pu * min_pu_width].pred_flag ==
                     PF_INTRA);
        }

        /* Default the whole reference buffer to mid-grey (128). */
        vec0 = (v16u8) __msa_ldi_b(128);

        ST_UB4(vec0, vec0, vec0, vec0, left, 16);

        ST_UB4(vec0, vec0, vec0, vec0, top, 16);

        top[-1] = 128;
    }
    if (cand_up_left) {
        left[-1] = src[(-1) + stride * (-1)];
        top[-1] = left[-1];
    }
    if (cand_up) {
        vec0 = LD_UB(src - stride);
        ST_UB(vec0, top);
    }
    if (cand_up_right) {
        vec0 = LD_UB(src - stride + 16);
        ST_UB(vec0, (top + 16));

        /* Replicate the last valid top-right sample past the picture edge. */
        do {
            uint32_t pix =
                ((src[(16 + top_right_size - 1) + stride * (-1)]) *
                 0x01010101U);
            for (i = 0; i < (16 - top_right_size); i += 4)
                ((union unaligned_32 *) (top + 16 + top_right_size + i))->l = pix;
        } while (0);
    }
    if (cand_left)
        for (i = 0; i < 16; i++)
            left[i] = src[(-1) + stride * i];
    if (cand_bottom_left) {
        for (i = 16; i < 16 + bottom_left_size; i++)
            left[i] = src[(-1) + stride * i];
        /* Replicate the last valid bottom-left sample past the picture edge. */
        do {
            uint32_t pix =
                ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
                 0x01010101U);
            for (i = 0; i < (16 - bottom_left_size); i += 4)
                ((union unaligned_32 *) (left + 16 + bottom_left_size + i))->l = pix;
        } while (0);
    }
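
    /*
     * Second constrained-intra-prediction pass: reference samples copied
     * from non-intra PUs are substituted.  The top row and left column are
     * scanned for a usable intra sample, which is then propagated across
     * the invalid runs (four bytes at a time through unaligned_32 stores),
     * matching the behaviour of the generic template.
     */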
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        if (cand_bottom_left || cand_left || cand_up_left || cand_up
            || cand_up_right) {
            int size_max_x =
                x0 + ((2 * 16) << hshift) < s->ps.sps->width ?
                2 * 16 : (s->ps.sps->width - x0) >> hshift;
            int size_max_y =
                y0 + ((2 * 16) << vshift) < s->ps.sps->height ?
                2 * 16 : (s->ps.sps->height - y0) >> vshift;
            int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
            if (!cand_up_right) {
                size_max_x = x0 + (16 << hshift) < s->ps.sps->width ?
                             16 : (s->ps.sps->width - x0) >> hshift;
            }
            if (!cand_bottom_left) {
                size_max_y = y0 + (16 << vshift) < s->ps.sps->height ?
                             16 : (s->ps.sps->height - y0) >> vshift;
            }
            if (cand_bottom_left || cand_left || cand_up_left) {
                while (j > -1 &&
                       !(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                         ((y0 + (j << vshift)) >>
                                          s->ps.sps->log2_min_pu_size) *
                                         min_pu_width].pred_flag == PF_INTRA))
                    j--;
                if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                      s->ps.sps->log2_min_pu_size) +
                                      ((y0 + (j << vshift)) >>
                                       s->ps.sps->log2_min_pu_size) *
                                      min_pu_width].pred_flag == PF_INTRA)) {
                    j = 0;
                    while (j < size_max_x &&
                           !(s->ref->tab_mvf[((x0 + (j << hshift)) >>
                                              s->ps.sps->log2_min_pu_size) +
                                             ((y0 + ((-1) << vshift)) >>
                                              s->ps.sps->log2_min_pu_size) *
                                             min_pu_width].pred_flag == PF_INTRA))
                        j++;
                    for (i = j; i > -1; i--)
                        if (!(s->ref->tab_mvf[((x0 + ((i - 1) << hshift)) >>
                                              s->ps.sps->log2_min_pu_size) +
                                              ((y0 + ((-1) << vshift)) >>
                                               s->ps.sps->log2_min_pu_size) *
                                              min_pu_width].pred_flag ==
                              PF_INTRA))
                            top[i - 1] = top[i];
                    left[-1] = top[-1];
                }
            } else {
                j = 0;
                while (j < size_max_x &&
                       !(s->ref->tab_mvf[((x0 + (j << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                         ((y0 + ((-1) << vshift)) >>
                                          s->ps.sps->log2_min_pu_size) *
                                         min_pu_width].pred_flag == PF_INTRA))
                    j++;
                if (j > 0)
                    if (x0 > 0) {
                        for (i = j; i > -1; i--)
                            if (!(s->ref->tab_mvf[((x0 + ((i - 1) << hshift)) >>
                                                  s->ps.sps->log2_min_pu_size) +
                                                  ((y0 + ((-1) << vshift)) >>
                                                   s->ps.sps->log2_min_pu_size) *
                                                  min_pu_width].pred_flag ==
                                  PF_INTRA))
                                top[i - 1] = top[i];
                    } else {
                        for (i = j; i > 0; i--)
                            if (!(s->ref->tab_mvf[((x0 + ((i - 1) << hshift)) >>
                                                  s->ps.sps->log2_min_pu_size) +
                                                  ((y0 + ((-1) << vshift)) >>
                                                   s->ps.sps->log2_min_pu_size) *
                                                  min_pu_width].pred_flag ==
                                  PF_INTRA))
                                top[i - 1] = top[i];
                        top[-1] = top[0];
                    }
                left[-1] = top[-1];
            }
            left[-1] = top[-1];
            if (cand_bottom_left || cand_left) {
                a = left[-1] * 0x01010101U;
                for (i = 0; i < size_max_y; i += 4)
                    if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + (i << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &left[i])->l = a;
                    else
                        a = left[i + 3] * 0x01010101U;
            }
            if (!cand_left) {
                vec0 = (v16u8) __msa_fill_b(left[-1]);

                ST_UB(vec0, left);
            }
            if (!cand_bottom_left) {
                vec0 = (v16u8) __msa_fill_b(left[15]);

                ST_UB(vec0, (left + 16));
            }
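
            /*
             * The remaining left-column substitution depends on the block
             * position: away from the picture borders the column is
             * back-filled upwards from the last valid sample, on the left
             * border (x0 == 0) it is zero-filled, and the corner samples
             * top[-1]/left[-1] are re-synchronised afterwards.
             */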
            if (x0 != 0 && y0 != 0) {
                a = left[size_max_y - 1] * 0x01010101U;
                for (i = size_max_y - 1; i > -1; i -= 4)
                    if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + ((i - 3) << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &left[i - 3])->l = a;
                    else
                        a = left[i - 3] * 0x01010101U;
                if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                      s->ps.sps->log2_min_pu_size) +
                                      ((y0 + ((-1) << vshift)) >>
                                       s->ps.sps->log2_min_pu_size) *
                                      min_pu_width].pred_flag == PF_INTRA))
                    left[-1] = left[0];
            } else if (x0 == 0) {
                do {
                    uint32_t pix = 0U;
                    for (i = 0; i < size_max_y; i += 4)
                        ((union unaligned_32 *) (left + i))->l = pix;
                } while (0);
            } else {
                a = left[size_max_y - 1] * 0x01010101U;
                for (i = size_max_y - 1; i > -1; i -= 4)
                    if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + ((i - 3) << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &left[i - 3])->l = a;
                    else
                        a = left[i - 3] * 0x01010101U;
            }
            top[-1] = left[-1];
            if (y0 != 0) {
                a = left[-1] * 0x01010101U;
                for (i = 0; i < size_max_x; i += 4)
                    if (!(s->ref->tab_mvf[((x0 + (i << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + ((-1) << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &top[i])->l = a;
                    else
                        a = top[i + 3] * 0x01010101U;
            }
        }
    }

    /* Extend missing edges from the nearest available neighbour; if nothing
     * at all is available, use a flat mid-grey (128) reference. */
    if (!cand_bottom_left) {
        if (cand_left) {
            vec0 = (v16u8) __msa_fill_b(left[15]);

            ST_UB(vec0, (left + 16));
        } else if (cand_up_left) {
            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB2(vec0, vec0, left, 16);

            cand_left = 1;
        } else if (cand_up) {
            left[-1] = top[0];

            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB2(vec0, vec0, left, 16);

            cand_up_left = 1;
            cand_left = 1;
        } else if (cand_up_right) {
            vec0 = (v16u8) __msa_fill_b(top[16]);

            ST_UB(vec0, top);

            left[-1] = top[16];

            ST_UB2(vec0, vec0, left, 16);

            cand_up = 1;
            cand_up_left = 1;
            cand_left = 1;
        } else {
            left[-1] = 128;
            vec0 = (v16u8) __msa_ldi_b(128);

            ST_UB2(vec0, vec0, top, 16);
            ST_UB2(vec0, vec0, left, 16);
        }
    }

    if (!cand_left) {
        vec0 = (v16u8) __msa_fill_b(left[16]);
        ST_UB(vec0, left);
    }
    if (!cand_up_left) {
        left[-1] = left[0];
    }
    if (!cand_up) {
        vec0 = (v16u8) __msa_fill_b(left[-1]);
        ST_UB(vec0, top);
    }
    if (!cand_up_right) {
        vec0 = (v16u8) __msa_fill_b(top[15]);
        ST_UB(vec0, (top + 16));
    }

    top[-1] = left[-1];
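
    /*
     * Reference-sample smoothing: for luma (and 4:4:4 chroma) blocks whose
     * mode is far enough from both the horizontal and the vertical
     * direction, the neighbours are run through a [1 2 1]/4 low-pass
     * filter before prediction, e.g.
     *   filtered_left[i] = (left[i + 1] + 2 * left[i] + left[i - 1] + 2) >> 2.
     */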
    if (!s->ps.sps->intra_smoothing_disabled_flag
        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
        if (mode != INTRA_DC && 16 != 4) {
            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
            int min_dist_vert_hor = FFMIN(FFABS((int) (mode - 26U)),
                                          FFABS((int) (mode - 10U)));
            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
                filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
                filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
                for (i = 2 * 16 - 2; i >= 0; i--)
                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
                                        left[i - 1] + 2) >> 2;
                filtered_top[-1] =
                    filtered_left[-1] =
                    (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
                for (i = 2 * 16 - 2; i >= 0; i--)
                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
                                       top[i - 1] + 2) >> 2;
                left = filtered_left;
                top = filtered_top;
            }
        }
    }

    switch (mode) {
    case INTRA_PLANAR:
        s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
                                   (uint8_t *) left, stride);
        break;
    case INTRA_DC:
        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
                       (uint8_t *) left, stride, 4, c_idx);
        break;
    default:
        s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
                                    (uint8_t *) left, stride, c_idx, mode);
        break;
    }
}
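
/*
 * 32x32 variant of the driver above.  The flow matches the 16x16 case; the
 * main addition is that the strong intra smoothing path (bilinear
 * interpolation between the corner reference samples) is reachable at this
 * size and is vectorised below with MSA multiply/shift sequences.
 */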
void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
{
    v16u8 vec0, vec1;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3;
    v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    HEVCLocalContext *lc = s->HEVClc;
    int i;
    int hshift = s->ps.sps->hshift[c_idx];
    int vshift = s->ps.sps->vshift[c_idx];
    int size_in_luma_h = 32 << hshift;
    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
    int size_in_luma_v = 32 << vshift;
    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
    int x = x0 >> hshift;
    int y = y0 >> vshift;
    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;

    int cur_tb_addr =
        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];

    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;

    int min_pu_width = s->ps.sps->min_pu_width;

    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
                              lc->tu.intra_pred_mode;
    uint32_t a;
    uint8_t left_array[2 * 32 + 1];
    uint8_t filtered_left_array[2 * 32 + 1];
    uint8_t top_array[2 * 32 + 1];
    uint8_t filtered_top_array[2 * 32 + 1];

    uint8_t *left = left_array + 1;
    uint8_t *top = top_array + 1;
    uint8_t *filtered_left = filtered_left_array + 1;
    uint8_t *filtered_top = filtered_top_array + 1;
    int cand_bottom_left = lc->na.cand_bottom_left
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
                                  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
    int cand_left = lc->na.cand_left;
    int cand_up_left = lc->na.cand_up_left;
    int cand_up = lc->na.cand_up;
    int cand_up_right = lc->na.cand_up_right
        && cur_tb_addr >
        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
                                  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];

    int bottom_left_size =
        (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
         (y0 + size_in_luma_v)) >> vshift;
    int top_right_size =
        (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
         (x0 + size_in_luma_h)) >> hshift;
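
    /*
     * Availability re-checks and reference gathering, as in the 16x16
     * driver but over 32-sample edges (two 16-byte vectors per edge half).
     */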
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        int size_in_luma_pu_v = size_in_luma_v >> s->ps.sps->log2_min_pu_size;
        int size_in_luma_pu_h = size_in_luma_h >> s->ps.sps->log2_min_pu_size;
        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
        if (!size_in_luma_pu_h)
            size_in_luma_pu_h++;
        if (cand_bottom_left == 1 && on_pu_edge_x) {
            int x_left_pu = (x0 - 1) >> s->ps.sps->log2_min_pu_size;
            int y_bottom_pu =
                (y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_v,
                            s->ps.sps->min_pu_height - y_bottom_pu);
            cand_bottom_left = 0;
            for (i = 0; i < max; i += 2)
                cand_bottom_left |=
                    (s->ref->tab_mvf[x_left_pu +
                                     (y_bottom_pu + i) * min_pu_width].pred_flag ==
                     PF_INTRA);
        }
        if (cand_left == 1 && on_pu_edge_x) {
            int x_left_pu = (x0 - 1) >> s->ps.sps->log2_min_pu_size;
            int y_left_pu = y0 >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_v,
                            s->ps.sps->min_pu_height - y_left_pu);
            cand_left = 0;
            for (i = 0; i < max; i += 2)
                cand_left |=
                    (s->ref->tab_mvf[x_left_pu +
                                     (y_left_pu + i) * min_pu_width].pred_flag ==
                     PF_INTRA);
        }
        if (cand_up_left == 1) {
            int x_left_pu = (x0 - 1) >> s->ps.sps->log2_min_pu_size;
            int y_top_pu = (y0 - 1) >> s->ps.sps->log2_min_pu_size;
            cand_up_left =
                s->ref->tab_mvf[x_left_pu +
                                y_top_pu * min_pu_width].pred_flag == PF_INTRA;
        }
        if (cand_up == 1 && on_pu_edge_y) {
            int x_top_pu = x0 >> s->ps.sps->log2_min_pu_size;
            int y_top_pu = (y0 - 1) >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_h,
                            s->ps.sps->min_pu_width - x_top_pu);
            cand_up = 0;
            for (i = 0; i < max; i += 2)
                cand_up |=
                    (s->ref->tab_mvf[(x_top_pu + i) +
                                     y_top_pu * min_pu_width].pred_flag ==
                     PF_INTRA);
        }
        if (cand_up_right == 1 && on_pu_edge_y) {
            int y_top_pu = (y0 - 1) >> s->ps.sps->log2_min_pu_size;
            int x_right_pu =
                (x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size;
            int max = FFMIN(size_in_luma_pu_h,
                            s->ps.sps->min_pu_width - x_right_pu);
            cand_up_right = 0;
            for (i = 0; i < max; i += 2)
                cand_up_right |=
                    (s->ref->tab_mvf[(x_right_pu + i) +
                                     y_top_pu * min_pu_width].pred_flag ==
                     PF_INTRA);
        }
        vec0 = (v16u8) __msa_ldi_b(128);

        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
        ST_UB4(vec0, vec0, vec0, vec0, top, 16);

        top[-1] = 128;
    }
    if (cand_up_left) {
        left[-1] = src[(-1) + stride * (-1)];
        top[-1] = left[-1];
    }
    if (cand_up) {
        LD_UB2(src - stride, 16, vec0, vec1);
        ST_UB2(vec0, vec1, top, 16);
    }

    if (cand_up_right) {
        LD_UB2(src - stride + 32, 16, vec0, vec1);
        ST_UB2(vec0, vec1, (top + 32), 16);
        do {
            uint32_t pix =
                ((src[(32 + top_right_size - 1) + stride * (-1)]) *
                 0x01010101U);
            for (i = 0; i < (32 - top_right_size); i += 4)
                ((union unaligned_32 *) (top + 32 + top_right_size + i))->l = pix;
        } while (0);
    }
    if (cand_left)
        for (i = 0; i < 32; i++)
            left[i] = src[(-1) + stride * i];
    if (cand_bottom_left) {
        for (i = 32; i < 32 + bottom_left_size; i++)
            left[i] = src[(-1) + stride * i];
        do {
            uint32_t pix =
                ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
                 0x01010101U);
            for (i = 0; i < (32 - bottom_left_size); i += 4)
                ((union unaligned_32 *) (left + 32 + bottom_left_size + i))->l = pix;
        } while (0);
    }
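
    /*
     * Substitution of non-intra reference samples under constrained intra
     * prediction, identical in structure to the 16x16 version above.
     */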
    if (s->ps.pps->constrained_intra_pred_flag == 1) {
        if (cand_bottom_left || cand_left || cand_up_left || cand_up
            || cand_up_right) {
            int size_max_x =
                x0 + ((2 * 32) << hshift) < s->ps.sps->width ?
                2 * 32 : (s->ps.sps->width - x0) >> hshift;
            int size_max_y =
                y0 + ((2 * 32) << vshift) < s->ps.sps->height ?
                2 * 32 : (s->ps.sps->height - y0) >> vshift;
            int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
            if (!cand_up_right) {
                size_max_x = x0 + (32 << hshift) < s->ps.sps->width ?
                             32 : (s->ps.sps->width - x0) >> hshift;
            }
            if (!cand_bottom_left) {
                size_max_y = y0 + (32 << vshift) < s->ps.sps->height ?
                             32 : (s->ps.sps->height - y0) >> vshift;
            }
            if (cand_bottom_left || cand_left || cand_up_left) {
                while (j > -1 &&
                       !(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                         ((y0 + (j << vshift)) >>
                                          s->ps.sps->log2_min_pu_size) *
                                         min_pu_width].pred_flag == PF_INTRA))
                    j--;
                if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                      s->ps.sps->log2_min_pu_size) +
                                      ((y0 + (j << vshift)) >>
                                       s->ps.sps->log2_min_pu_size) *
                                      min_pu_width].pred_flag == PF_INTRA)) {
                    j = 0;
                    while (j < size_max_x &&
                           !(s->ref->tab_mvf[((x0 + (j << hshift)) >>
                                              s->ps.sps->log2_min_pu_size) +
                                             ((y0 + ((-1) << vshift)) >>
                                              s->ps.sps->log2_min_pu_size) *
                                             min_pu_width].pred_flag == PF_INTRA))
                        j++;
                    for (i = j; i > -1; i--)
                        if (!(s->ref->tab_mvf[((x0 + ((i - 1) << hshift)) >>
                                              s->ps.sps->log2_min_pu_size) +
                                              ((y0 + ((-1) << vshift)) >>
                                               s->ps.sps->log2_min_pu_size) *
                                              min_pu_width].pred_flag ==
                              PF_INTRA))
                            top[i - 1] = top[i];
                    left[-1] = top[-1];
                }
            } else {
                j = 0;
                while (j < size_max_x &&
                       !(s->ref->tab_mvf[((x0 + (j << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                         ((y0 + ((-1) << vshift)) >>
                                          s->ps.sps->log2_min_pu_size) *
                                         min_pu_width].pred_flag == PF_INTRA))
                    j++;
                if (j > 0)
                    if (x0 > 0) {
                        for (i = j; i > -1; i--)
                            if (!(s->ref->tab_mvf[((x0 + ((i - 1) << hshift)) >>
                                                  s->ps.sps->log2_min_pu_size) +
                                                  ((y0 + ((-1) << vshift)) >>
                                                   s->ps.sps->log2_min_pu_size) *
                                                  min_pu_width].pred_flag ==
                                  PF_INTRA))
                                top[i - 1] = top[i];
                    } else {
                        for (i = j; i > 0; i--)
                            if (!(s->ref->tab_mvf[((x0 + ((i - 1) << hshift)) >>
                                                  s->ps.sps->log2_min_pu_size) +
                                                  ((y0 + ((-1) << vshift)) >>
                                                   s->ps.sps->log2_min_pu_size) *
                                                  min_pu_width].pred_flag ==
                                  PF_INTRA))
                                top[i - 1] = top[i];
                        top[-1] = top[0];
                    }
                left[-1] = top[-1];
            }
            left[-1] = top[-1];
            if (cand_bottom_left || cand_left) {
                a = left[-1] * 0x01010101U;
                for (i = 0; i < size_max_y; i += 4)
                    if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + (i << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &left[i])->l = a;
                    else
                        a = left[i + 3] * 0x01010101U;
            }
            if (!cand_left) {
                vec0 = (v16u8) __msa_fill_b(left[-1]);

                ST_UB2(vec0, vec0, left, 16);
            }
            if (!cand_bottom_left) {
                vec0 = (v16u8) __msa_fill_b(left[31]);

                ST_UB2(vec0, vec0, (left + 32), 16);
            }
            if (x0 != 0 && y0 != 0) {
                a = left[size_max_y - 1] * 0x01010101U;
                for (i = size_max_y - 1; i > -1; i -= 4)
                    if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + ((i - 3) << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &left[i - 3])->l = a;
                    else
                        a = left[i - 3] * 0x01010101U;
                if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                      s->ps.sps->log2_min_pu_size) +
                                      ((y0 + ((-1) << vshift)) >>
                                       s->ps.sps->log2_min_pu_size) *
                                      min_pu_width].pred_flag == PF_INTRA))
                    left[-1] = left[0];
            } else if (x0 == 0) {
                do {
                    uint32_t pix = 0U;
                    for (i = 0; i < size_max_y; i += 4)
                        ((union unaligned_32 *) (left + i))->l = pix;
                } while (0);
            } else {
                a = left[size_max_y - 1] * 0x01010101U;
                for (i = size_max_y - 1; i > -1; i -= 4)
                    if (!(s->ref->tab_mvf[((x0 + ((-1) << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + ((i - 3) << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &left[i - 3])->l = a;
                    else
                        a = left[i - 3] * 0x01010101U;
            }
            top[-1] = left[-1];
            if (y0 != 0) {
                a = left[-1] * 0x01010101U;
                for (i = 0; i < size_max_x; i += 4)
                    if (!(s->ref->tab_mvf[((x0 + (i << hshift)) >>
                                          s->ps.sps->log2_min_pu_size) +
                                          ((y0 + ((-1) << vshift)) >>
                                           s->ps.sps->log2_min_pu_size) *
                                          min_pu_width].pred_flag == PF_INTRA))
                        ((union unaligned_32 *) &top[i])->l = a;
                    else
                        a = top[i + 3] * 0x01010101U;
            }
        }
    }

    if (!cand_bottom_left) {
        if (cand_left) {
            vec0 = (v16u8) __msa_fill_b(left[31]);

            ST_UB2(vec0, vec0, (left + 32), 16);
        } else if (cand_up_left) {
            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB4(vec0, vec0, vec0, vec0, left, 16);

            cand_left = 1;
        } else if (cand_up) {
            left[-1] = top[0];

            vec0 = (v16u8) __msa_fill_b(left[-1]);

            ST_UB4(vec0, vec0, vec0, vec0, left, 16);

            cand_up_left = 1;
            cand_left = 1;
        } else if (cand_up_right) {
            vec0 = (v16u8) __msa_fill_b(top[32]);

            ST_UB2(vec0, vec0, top, 16);

            left[-1] = top[32];

            ST_UB4(vec0, vec0, vec0, vec0, left, 16);

            cand_up = 1;
            cand_up_left = 1;
            cand_left = 1;
        } else {
            left[-1] = 128;

            vec0 = (v16u8) __msa_ldi_b(128);

            ST_UB4(vec0, vec0, vec0, vec0, top, 16);
            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
        }
    }

    if (!cand_left) {
        vec0 = (v16u8) __msa_fill_b(left[32]);

        ST_UB2(vec0, vec0, left, 16);
    }
    if (!cand_up_left) {
        left[-1] = left[0];
    }
    if (!cand_up) {
        vec0 = (v16u8) __msa_fill_b(left[-1]);

        ST_UB2(vec0, vec0, top, 16);
    }
    if (!cand_up_right) {
        vec0 = (v16u8) __msa_fill_b(top[31]);

        ST_UB2(vec0, vec0, (top + 32), 16);
    }

    top[-1] = left[-1];
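
    /*
     * Strong intra smoothing (luma 32x32 only): when both edges are close
     * to linear ramps (|corner + far - 2 * middle| below the threshold
     * 1 << (bit_depth - 5), i.e. 8 for 8-bit), the filtered references are
     * built by bilinear interpolation between the corner samples,
     *   filtered_top[i] = ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6,
     * which the MSA path below evaluates eight samples at a time.
     * Otherwise the regular [1 2 1]/4 filter is applied.
     */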
    if (!s->ps.sps->intra_smoothing_disabled_flag
        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
        if (mode != INTRA_DC && 32 != 4) {
            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
            int min_dist_vert_hor = FFMIN(FFABS((int) (mode - 26U)),
                                          FFABS((int) (mode - 10U)));
            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
                int threshold = 1 << (8 - 5);
                if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
                    && c_idx == 0
                    && FFABS(top[-1] + top[63] - 2 * top[31]) < threshold
                    && FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
                    filtered_top[-1] = top[-1];
                    filtered_top[63] = top[63];

                    for (i = 0; i < 63; i++) {
                        filtered_top[i] =
                            ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
                    }

                    tmp0 = __msa_fill_h(top[-1]);
                    tmp1 = __msa_fill_h(top[63]);

                    tmp2 = mul_val0 - 8;
                    tmp3 = mul_val0 - 16;
                    tmp4 = mul_val0 - 24;
                    tmp5 = mul_val1 + 8;
                    tmp6 = mul_val1 + 16;
                    tmp7 = mul_val1 + 24;

                    res0 = mul_val0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res3 = tmp4 * tmp0;
                    res0 += mul_val1 * tmp1;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, filtered_top, 16);

                    res0 = mul_val0 - 32;
                    tmp2 = mul_val0 - 40;
                    tmp3 = mul_val0 - 48;
                    tmp4 = mul_val0 - 56;
                    res3 = mul_val1 + 32;
                    tmp5 = mul_val1 + 40;
                    tmp6 = mul_val1 + 48;
                    tmp7 = mul_val1 + 56;

                    res0 = res0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res0 += res3 * tmp1;
                    res3 = tmp4 * tmp0;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, (filtered_top + 32), 16);

                    filtered_top[63] = top[63];

                    tmp0 = __msa_fill_h(left[-1]);
                    tmp1 = __msa_fill_h(left[63]);

                    tmp2 = mul_val0 - 8;
                    tmp3 = mul_val0 - 16;
                    tmp4 = mul_val0 - 24;
                    tmp5 = mul_val1 + 8;
                    tmp6 = mul_val1 + 16;
                    tmp7 = mul_val1 + 24;

                    res0 = mul_val0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res3 = tmp4 * tmp0;
                    res0 += mul_val1 * tmp1;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, left, 16);

                    res0 = mul_val0 - 32;
                    tmp2 = mul_val0 - 40;
                    tmp3 = mul_val0 - 48;
                    tmp4 = mul_val0 - 56;
                    res3 = mul_val1 + 32;
                    tmp5 = mul_val1 + 40;
                    tmp6 = mul_val1 + 48;
                    tmp7 = mul_val1 + 56;

                    res0 = res0 * tmp0;
                    res1 = tmp2 * tmp0;
                    res2 = tmp3 * tmp0;
                    res0 += res3 * tmp1;
                    res3 = tmp4 * tmp0;
                    res1 += tmp5 * tmp1;
                    res2 += tmp6 * tmp1;
                    res3 += tmp7 * tmp1;

                    res0 = __msa_srari_h(res0, 6);
                    res1 = __msa_srari_h(res1, 6);
                    res2 = __msa_srari_h(res2, 6);
                    res3 = __msa_srari_h(res3, 6);

                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);

                    ST_UB2(vec0, vec1, (left + 32), 16);

                    left[63] = tmp1[0];

                    top = filtered_top;
                } else {
                    filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
                    filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
                    for (i = 2 * 32 - 2; i >= 0; i--)
                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
                                            left[i - 1] + 2) >> 2;
                    filtered_top[-1] =
                        filtered_left[-1] =
                        (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
                    for (i = 2 * 32 - 2; i >= 0; i--)
                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
                                           top[i - 1] + 2) >> 2;
                    left = filtered_left;
                    top = filtered_top;
                }
            }
        }
    }

    switch (mode) {
    case INTRA_PLANAR:
        s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
                               (uint8_t *) left, stride);
        break;
    case INTRA_DC:
        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
                       (uint8_t *) left, stride, 5, c_idx);
        break;
    default:
        s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
                                (uint8_t *) left, stride, c_idx, mode);
        break;
    }
}