/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"

static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride)
{
    uint64_t out = LD(src);

    SD4(out, out, out, out, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(out, out, out, out, dst, dst_stride);
}

static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    v16u8 out = LD_UB(src);

    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

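/*
 * Horizontal prediction broadcasts one left-neighbour pixel across a whole
 * row.  Multiplying the (promoted) byte by 0x0101010101010101 replicates it
 * into every byte of a 64-bit word, so each 8-pixel row can be written with a
 * single SD store; the DC helpers further down use the same trick with
 * 0x01010101 for 4-pixel runs.
 */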
static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
                                        uint8_t *dst, int32_t dst_stride)
{
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    out0 = src[0 * src_stride] * 0x0101010101010101;
    out1 = src[1 * src_stride] * 0x0101010101010101;
    out2 = src[2 * src_stride] * 0x0101010101010101;
    out3 = src[3 * src_stride] * 0x0101010101010101;
    out4 = src[4 * src_stride] * 0x0101010101010101;
    out5 = src[5 * src_stride] * 0x0101010101010101;
    out6 = src[6 * src_stride] * 0x0101010101010101;
    out7 = src[7 * src_stride] * 0x0101010101010101;

    SD4(out0, out1, out2, out3, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(out4, out5, out6, out7, dst, dst_stride);
}

static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    uint8_t inp0, inp1, inp2, inp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    inp0 = src[0 * src_stride];
    inp1 = src[1 * src_stride];
    inp2 = src[2 * src_stride];
    inp3 = src[3 * src_stride];
    src0 = (v16u8) __msa_fill_b(inp0);
    src1 = (v16u8) __msa_fill_b(inp1);
    src2 = (v16u8) __msa_fill_b(inp2);
    src3 = (v16u8) __msa_fill_b(inp3);
    inp0 = src[4 * src_stride];
    inp1 = src[5 * src_stride];
    inp2 = src[6 * src_stride];
    inp3 = src[7 * src_stride];
    src4 = (v16u8) __msa_fill_b(inp0);
    src5 = (v16u8) __msa_fill_b(inp1);
    src6 = (v16u8) __msa_fill_b(inp2);
    src7 = (v16u8) __msa_fill_b(inp3);
    inp0 = src[ 8 * src_stride];
    inp1 = src[ 9 * src_stride];
    inp2 = src[10 * src_stride];
    inp3 = src[11 * src_stride];
    src8 = (v16u8) __msa_fill_b(inp0);
    src9 = (v16u8) __msa_fill_b(inp1);
    src10 = (v16u8) __msa_fill_b(inp2);
    src11 = (v16u8) __msa_fill_b(inp3);
    inp0 = src[12 * src_stride];
    inp1 = src[13 * src_stride];
    inp2 = src[14 * src_stride];
    inp3 = src[15 * src_stride];
    src12 = (v16u8) __msa_fill_b(inp0);
    src13 = (v16u8) __msa_fill_b(inp1);
    src14 = (v16u8) __msa_fill_b(inp2);
    src15 = (v16u8) __msa_fill_b(inp3);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
           dst, dst_stride);
}

#define INTRA_PREDICT_VALDC_8X8_MSA(val)                                       \
static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, int32_t dst_stride)  \
{                                                                              \
    v16i8 store = __msa_fill_b(val);                                           \
    uint64_t out = __msa_copy_u_d((v2i64) store, 0);                           \
                                                                               \
    SD4(out, out, out, out, dst, dst_stride);                                  \
    dst += (4 * dst_stride);                                                   \
    SD4(out, out, out, out, dst, dst_stride);                                  \
}

INTRA_PREDICT_VALDC_8X8_MSA(127);
INTRA_PREDICT_VALDC_8X8_MSA(129);

#define INTRA_PREDICT_VALDC_16X16_MSA(val)                              \
static void intra_predict_##val##dc_16x16_msa(uint8_t *dst,            \
                                              int32_t dst_stride)      \
{                                                                       \
    v16u8 out = (v16u8) __msa_fill_b(val);                              \
                                                                        \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);    \
    dst += (8 * dst_stride);                                            \
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);    \
}

INTRA_PREDICT_VALDC_16X16_MSA(127);
INTRA_PREDICT_VALDC_16X16_MSA(129);

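/*
 * Plane prediction, following the usual H.264 formulation: a horizontal
 * gradient H is taken from the row above (computed vectorially via the
 * shuffle/hsub/multiplier sequence below) and a vertical gradient V from the
 * column to the left (computed scalarly as res1).  They are scaled to
 * b = (17 * H + 16) >> 5 and c = (17 * V + 16) >> 5 for 8x8, respectively
 * b = (5 * H + 32) >> 6 and c = (5 * V + 32) >> 6 for 16x16, and every output
 * pixel is clip((a + b * (x - xc) + c * (y - yc) + 16) >> 5), where
 * a = 16 * (left[N - 1] + top[N - 1]) and xc = yc = N / 2 - 1.  The loops
 * keep the per-column and per-row increments of that polynomial in vector
 * registers instead of re-evaluating it per pixel.
 */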
static void intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride)
{
    uint8_t lpcnt;
    int32_t res, res0, res1, res2, res3;
    uint64_t out0, out1;
    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 src_top;
    v8i16 vec9, vec10, vec11;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
    v2i64 sum;

    src_top = LD_UB(src - (stride + 1));
    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);

    vec9 = __msa_hsub_u_h(src_top, src_top);
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w(vec9, vec9);
    sum = __msa_hadd_s_d(vec8, vec8);

    res0 = __msa_copy_s_w((v4i32) sum, 0);

    res1 = (src[4 * stride - 1] - src[2 * stride - 1]) +
           2 * (src[5 * stride - 1] - src[stride - 1]) +
           3 * (src[6 * stride - 1] - src[-1]) +
           4 * (src[7 * stride - 1] - src[-stride - 1]);

    res0 *= 17;
    res1 *= 17;
    res0 = (res0 + 16) >> 5;
    res1 = (res1 + 16) >> 5;

    res3 = 3 * (res0 + res1);
    res2 = 16 * (src[7 * stride - 1] + src[-stride + 7] + 1);
    res = res2 - res3;

    vec8 = __msa_fill_w(res0);
    vec4 = __msa_fill_w(res);
    vec2 = __msa_fill_w(res1);
    vec5 = vec8 * int_multiplier;
    vec3 = vec8 * 4;

    for (lpcnt = 4; lpcnt--;) {
        vec0 = vec5;
        vec0 += vec4;
        vec1 = vec0 + vec3;
        vec6 = vec5;
        vec4 += vec2;
        vec6 += vec4;
        vec7 = vec6 + vec3;

        SRA_4V(vec0, vec1, vec6, vec7, 5);
        PCKEV_H2_SH(vec1, vec0, vec7, vec6, vec10, vec11);
        CLIP_SH2_0_255(vec10, vec11);
        PCKEV_B2_SH(vec10, vec10, vec11, vec11, vec10, vec11);

        out0 = __msa_copy_s_d((v2i64) vec10, 0);
        out1 = __msa_copy_s_d((v2i64) vec11, 0);
        SD(out0, src);
        src += stride;
        SD(out1, src);
        src += stride;

        vec4 += vec2;
    }
}

static void intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride)
{
    uint8_t lpcnt;
    int32_t res0, res1, res2, res3;
    uint64_t load0, load1;
    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 src_top = { 0 };
    v16u8 store0, store1;
    v8i16 vec9, vec10, vec11, vec12;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
    v4i32 reg0, reg1, reg2, reg3;

    load0 = LD(src - (stride + 1));
    load1 = LD(src - (stride + 1) + 9);

    INSERT_D2_UB(load0, load1, src_top);

    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);

    vec9 = __msa_hsub_u_h(src_top, src_top);
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w(vec9, vec9);
    res_add = (v4i32) __msa_hadd_s_d(vec8, vec8);

    res0 = __msa_copy_s_w(res_add, 0) + __msa_copy_s_w(res_add, 2);

    res1 = (src[8 * stride - 1] - src[6 * stride - 1]) +
           2 * (src[9 * stride - 1] - src[5 * stride - 1]) +
           3 * (src[10 * stride - 1] - src[4 * stride - 1]) +
           4 * (src[11 * stride - 1] - src[3 * stride - 1]) +
           5 * (src[12 * stride - 1] - src[2 * stride - 1]) +
           6 * (src[13 * stride - 1] - src[stride - 1]) +
           7 * (src[14 * stride - 1] - src[-1]) +
           8 * (src[15 * stride - 1] - src[-1 * stride - 1]);

    res0 *= 5;
    res1 *= 5;
    res0 = (res0 + 32) >> 6;
    res1 = (res1 + 32) >> 6;

    res3 = 7 * (res0 + res1);
    res2 = 16 * (src[15 * stride - 1] + src[-stride + 15] + 1);
    res2 -= res3;

    vec8 = __msa_fill_w(res0);
    vec4 = __msa_fill_w(res2);
    vec5 = __msa_fill_w(res1);
    vec6 = vec8 * 4;
    vec7 = vec8 * int_multiplier;

    for (lpcnt = 8; lpcnt--;) {
        vec0 = vec7;
        reg0 = vec7;
        vec0 += vec4;
        vec4 += vec5;
        reg0 += vec4;
        vec1 = vec0 + vec6;
        reg1 = reg0 + vec6;
        vec2 = vec1 + vec6;
        reg2 = reg1 + vec6;
        vec3 = vec2 + vec6;
        reg3 = reg2 + vec6;

        SRA_4V(vec0, vec1, vec2, vec3, 5);
        SRA_4V(reg0, reg1, reg2, reg3, 5);
        PCKEV_H2_SH(vec1, vec0, vec3, vec2, vec9, vec10);
        PCKEV_H2_SH(reg1, reg0, reg3, reg2, vec11, vec12);
        CLIP_SH2_0_255(vec9, vec10);
        CLIP_SH2_0_255(vec11, vec12);
        PCKEV_B2_UB(vec10, vec9, vec12, vec11, store0, store1);
        ST_UB2(store0, store1, src, stride);
        src += 2 * stride;

        vec4 += vec5;
    }
}

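/*
 * DC prediction of an 8x8 chroma block as four 4x4 DC sub-blocks: the
 * top-left sub-block averages its four top and four left neighbours
 * ((sum + 4) >> 3), the top-right sub-block uses only the four pixels above
 * it, the bottom-left sub-block only the four pixels to its left
 * ((sum + 2) >> 2 each), and the bottom-right sub-block averages the pixels
 * above the top-right and to the left of the bottom-left sub-block.
 */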
static void intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1, src3, src2;
    uint32_t out0, out1, out2, out3;
    uint64_t store0, store1;
    v16u8 src_top;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h((v16u8) src_top, (v16u8) src_top);
    sum = __msa_hadd_u_w(add, add);
    src0 = __msa_copy_u_w((v4i32) sum, 0);
    src1 = __msa_copy_u_w((v4i32) sum, 1);
    src0 += src[0 * stride - 1];
    src0 += src[1 * stride - 1];
    src0 += src[2 * stride - 1];
    src0 += src[3 * stride - 1];
    src2 = src[4 * stride - 1];
    src2 += src[5 * stride - 1];
    src2 += src[6 * stride - 1];
    src2 += src[7 * stride - 1];
    src0 = (src0 + 4) >> 3;
    src3 = (src1 + src2 + 4) >> 3;
    src1 = (src1 + 2) >> 2;
    src2 = (src2 + 2) >> 2;
    out0 = src0 * 0x01010101;
    out1 = src1 * 0x01010101;
    out2 = src2 * 0x01010101;
    out3 = src3 * 0x01010101;
    store0 = ((uint64_t) out1 << 32) | out0;
    store1 = ((uint64_t) out3 << 32) | out2;

    SD4(store0, store0, store0, store0, src, stride);
    src += (4 * stride);
    SD4(store1, store1, store1, store1, src, stride);
}

static void intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1;
    uint64_t out0, out1;

    src0 = src[0 * stride - 1];
    src0 += src[1 * stride - 1];
    src0 += src[2 * stride - 1];
    src0 += src[3 * stride - 1];
    src1 = src[4 * stride - 1];
    src1 += src[5 * stride - 1];
    src1 += src[6 * stride - 1];
    src1 += src[7 * stride - 1];
    src0 = (src0 + 2) >> 2;
    src1 = (src1 + 2) >> 2;
    out0 = src0 * 0x0101010101010101;
    out1 = src1 * 0x0101010101010101;

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out1, out1, out1, out1, src, stride);
}

static void intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride)
{
    uint64_t out0;
    v16i8 mask = { 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 src_top, res0;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h(src_top, src_top);
    sum = __msa_hadd_u_w(add, add);
    sum = (v4u32) __msa_srari_w((v4i32) sum, 2);
    res0 = (v16u8) __msa_vshf_b(mask, (v16i8) sum, (v16i8) sum);
    out0 = __msa_copy_u_d((v2i64) res0, 0);

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out0, out0, out0, out0, src, stride);
}

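/*
 * The "mad cow" DC predictors below cover the partially-available-neighbour
 * cases of 8x8 chroma DC prediction.  The suffix appears to encode, from left
 * to right, availability of the top half of the left column, the bottom half
 * of the left column and the top row (so l0t = left top half plus top row,
 * 0l0 = only the left bottom half).  Sub-blocks without any available
 * neighbour fall back to either the top-row average or the constant 0x80.
 */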
static void intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1, src2;
    uint32_t out0, out1, out2;
    uint64_t store0, store1;
    v16u8 src_top;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h(src_top, src_top);
    sum = __msa_hadd_u_w(add, add);
    src0 = __msa_copy_u_w((v4i32) sum, 0);
    src1 = __msa_copy_u_w((v4i32) sum, 1);

    src2 = src[0 * stride - 1];
    src2 += src[1 * stride - 1];
    src2 += src[2 * stride - 1];
    src2 += src[3 * stride - 1];
    src2 = (src0 + src2 + 4) >> 3;
    src0 = (src0 + 2) >> 2;
    src1 = (src1 + 2) >> 2;
    out0 = src0 * 0x01010101;
    out1 = src1 * 0x01010101;
    out2 = src2 * 0x01010101;
    store1 = ((uint64_t) out1 << 32);
    store0 = store1 | ((uint64_t) out2);
    store1 = store1 | ((uint64_t) out0);

    SD4(store0, store0, store0, store0, src, stride);
    src += (4 * stride);
    SD4(store1, store1, store1, store1, src, stride);
}

static void intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0, src1, src2, src3;
    uint32_t out0, out1, out2, out3;
    uint64_t store0, store1;
    v16u8 src_top;
    v8u16 add;
    v4u32 sum;

    src_top = LD_UB(src - stride);
    add = __msa_hadd_u_h(src_top, src_top);
    sum = __msa_hadd_u_w(add, add);
    src0 = __msa_copy_u_w((v4i32) sum, 0);
    src1 = __msa_copy_u_w((v4i32) sum, 1);

    src2 = src[4 * stride - 1];
    src2 += src[5 * stride - 1];
    src2 += src[6 * stride - 1];
    src2 += src[7 * stride - 1];
    src0 = (src0 + 2) >> 2;
    src3 = (src1 + src2 + 4) >> 3;
    src1 = (src1 + 2) >> 2;
    src2 = (src2 + 2) >> 2;

    out0 = src0 * 0x01010101;
    out1 = src1 * 0x01010101;
    out2 = src2 * 0x01010101;
    out3 = src3 * 0x01010101;
    store0 = ((uint64_t) out1 << 32) | out0;
    store1 = ((uint64_t) out3 << 32) | out2;

    SD4(store0, store0, store0, store0, src, stride);
    src += (4 * stride);
    SD4(store1, store1, store1, store1, src, stride);
}

static void intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0;
    uint64_t out0, out1;

    src0 = src[0 * stride - 1];
    src0 += src[1 * stride - 1];
    src0 += src[2 * stride - 1];
    src0 += src[3 * stride - 1];
    src0 = (src0 + 2) >> 2;
    out0 = src0 * 0x0101010101010101;
    out1 = 0x8080808080808080;

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out1, out1, out1, out1, src, stride);
}

static void intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride)
{
    uint32_t src0;
    uint64_t out0, out1;

    src0 = src[4 * stride - 1];
    src0 += src[5 * stride - 1];
    src0 += src[6 * stride - 1];
    src0 += src[7 * stride - 1];
    src0 = (src0 + 2) >> 2;

    out0 = 0x8080808080808080;
    out1 = src0 * 0x0101010101010101;

    SD4(out0, out0, out0, out0, src, stride);
    src += (4 * stride);
    SD4(out1, out1, out1, out1, src, stride);
}

void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_plane_8x8_msa(src, stride);
}

void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_dc_4blk_8x8_msa(src, stride);
}

void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_hor_dc_8x8_msa(src, stride);
}

void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_vert_dc_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_l0t_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_0lt_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_l00_8x8_msa(src, stride);
}

void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
                                                  ptrdiff_t stride)
{
    intra_predict_mad_cow_dc_0l0_8x8_msa(src, stride);
}

void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_plane_16x16_msa(src, stride);
}

void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_vert_8x8_msa(src - stride, dst, stride);
}

void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_horiz_8x8_msa(src - 1, stride, dst, stride);
}

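/*
 * 16x16 DC prediction: the 16 pixels above the block are reduced with a chain
 * of horizontal adds, the 16 pixels to the left are summed scalarly, and
 * every output pixel becomes (sum + 16) >> 5.  The dc_left/dc_top variants
 * further down use only one of the two sums with (sum + 8) >> 4 rounding.
 */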
void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *src_top = src - stride;
    uint8_t *src_left = src - 1;
    uint8_t *dst = src;
    uint32_t addition = 0;
    v16u8 src_above, out;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    src_above = LD_UB(src_top);

    sum_above = __msa_hadd_u_h(src_above, src_above);
    sum_top = __msa_hadd_u_w(sum_above, sum_above);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    addition += src_left[ 0 * stride];
    addition += src_left[ 1 * stride];
    addition += src_left[ 2 * stride];
    addition += src_left[ 3 * stride];
    addition += src_left[ 4 * stride];
    addition += src_left[ 5 * stride];
    addition += src_left[ 6 * stride];
    addition += src_left[ 7 * stride];
    addition += src_left[ 8 * stride];
    addition += src_left[ 9 * stride];
    addition += src_left[10 * stride];
    addition += src_left[11 * stride];
    addition += src_left[12 * stride];
    addition += src_left[13 * stride];
    addition += src_left[14 * stride];
    addition += src_left[15 * stride];
    addition = (addition + 16) >> 5;
    out = (v16u8) __msa_fill_b(addition);

    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
    dst += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
}

void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_vert_16x16_msa(src - stride, dst, stride);
}

void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *dst = src;

    intra_predict_horiz_16x16_msa(src - 1, stride, dst, stride);
}

void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *src_left = src - 1;
    uint8_t *dst = src;
    uint32_t addition;
    v16u8 out;

    addition = src_left[ 0 * stride];
    addition += src_left[ 1 * stride];
    addition += src_left[ 2 * stride];
    addition += src_left[ 3 * stride];
    addition += src_left[ 4 * stride];
    addition += src_left[ 5 * stride];
    addition += src_left[ 6 * stride];
    addition += src_left[ 7 * stride];
    addition += src_left[ 8 * stride];
    addition += src_left[ 9 * stride];
    addition += src_left[10 * stride];
    addition += src_left[11 * stride];
    addition += src_left[12 * stride];
    addition += src_left[13 * stride];
    addition += src_left[14 * stride];
    addition += src_left[15 * stride];

    addition = (addition + 8) >> 4;
    out = (v16u8) __msa_fill_b(addition);

    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
    dst += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
}

void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    uint8_t *src_top = src - stride;
    uint8_t *dst = src;
    v16u8 src_above, out;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    src_above = LD_UB(src_top);

    sum_above = __msa_hadd_u_h(src_above, src_above);
    sum_top = __msa_hadd_u_w(sum_above, sum_above);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    sum = (v2u64) __msa_srari_d((v2i64) sum, 4);
    out = (v16u8) __msa_splati_b((v16i8) sum, 0);

    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
    dst += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
}

void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride)
{
    uint64_t out;
    v16u8 store;

    store = (v16u8) __msa_fill_b(128);
    out = __msa_copy_u_d((v2i64) store, 0);

    SD4(out, out, out, out, src, stride);
    src += (4 * stride);
    SD4(out, out, out, out, src, stride);
}

void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride)
{
    v16u8 out;

    out = (v16u8) __msa_fill_b(128);

    ST_UB8(out, out, out, out, out, out, out, out, src, stride);
    src += (8 * stride);
    ST_UB8(out, out, out, out, out, out, out, out, src, stride);
}

void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_127dc_8x8_msa(src, stride);
}

void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_129dc_8x8_msa(src, stride);
}

void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_127dc_16x16_msa(src, stride);
}

void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
{
    intra_predict_129dc_16x16_msa(src, stride);
}