/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hpeldsp_mips.h"

/* Pack the even bytes of in1/in0 into one vector, rounding-average the
 * result with 'dst' (__msa_aver_u_b), and store 16 bytes at pdst. */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                      \
{                                                                 \
    v16u8 tmp_m;                                                  \
                                                                  \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);      \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);                   \
    ST_UB(tmp_m, (pdst));                                         \
}

/* Pack four pairs of vectors (even bytes) and store the four results to
 * pdst with the given stride between rows. */
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
                                                                            \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
}

/* Pack four 8-byte results (in1..in4), rounding-average them with the
 * corresponding dst rows (packed pairwise via PCKEV_D2), and store four
 * 8-byte rows at pdst with 'stride' between rows. */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
                                                                        \
    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);                  \
}

/* 4-byte-wide horizontal 1/2-pel interpolation: each output byte is the
 * rounding average of a source byte and its right neighbour (source row
 * shifted left by one byte via SLDI). Processes two rows per iteration,
 * so height is assumed to be a multiple of 2. */
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        /* src*_sld1 = row shifted by one byte -> right-neighbour bytes */
        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);

        /* only the low 4 bytes of each result are stored */
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

/* 8-byte-wide horizontal 1/2-pel interpolation, four rows per iteration
 * (height assumed to be a multiple of 4). */
static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
        /* rounding average + 8x4 store (macro from generic_macros_msa.h) */
        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

/* 16-byte-wide horizontal 1/2-pel interpolation. The right-neighbour rows
 * are fetched with unaligned loads at (src + 1) instead of shifting.
 * Eight rows per iteration (height assumed to be a multiple of 8). */
static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                       dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

/* 8x8 horizontal 1/2-pel, no-rounding variant: uses the AVE_* macros
 * instead of AVER_* (see generic_macros_msa.h for the exact averaging). */
static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
    v16i8 zeros = { 0 };

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
               src4_sld1, src5_sld1, src6_sld1, src7_sld1);

    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
}

/* 8-wide x 4-row horizontal 1/2-pel, no-rounding variant. */
static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
}

/* 16x16 horizontal 1/2-pel, no-rounding variant. The loads for later row
 * groups are interleaved with the stores of earlier groups (software
 * pipelining); src4..src7/src12..src15 from the first LD_UB8 pair are
 * still live across the reloads of src0..src3/src8..src11. */
static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src4, src5, src6, src7);
    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

/* 16-wide x 8-row horizontal 1/2-pel, no-rounding variant. */
static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

/* 4-wide horizontal 1/2-pel with destination averaging ("avg" mode): the
 * interpolated value is rounding-averaged a second time with the bytes
 * already present in dst. Two rows per iteration. */
static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t dst0, dst1, out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);

        /* load existing 4-byte dst rows into vector lanes */
        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);

        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

/* 8-wide horizontal 1/2-pel with destination averaging, four rows per
 * iteration (dst read/average/store handled by AVER_DST_ST8x4_UB). */
static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);

        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
                          src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

/* 16-wide horizontal 1/2-pel with destination averaging, eight rows per
 * iteration (height assumed to be a multiple of 8). */
static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                           dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                           dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

294static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride, 295 uint8_t *dst, int32_t dst_stride, 296 uint8_t height) 297{ 298 uint8_t loop_cnt; 299 uint32_t out0, out1; 300 v16u8 src0, src1, src2, res0, res1; 301 302 src0 = LD_UB(src); 303 src += src_stride; 304 305 for (loop_cnt = (height >> 1); loop_cnt--;) { 306 LD_UB2(src, src_stride, src1, src2); 307 src += (2 * src_stride); 308 309 AVER_UB2_UB(src0, src1, src1, src2, res0, res1); 310 311 out0 = __msa_copy_u_w((v4i32) res0, 0); 312 out1 = __msa_copy_u_w((v4i32) res1, 0); 313 SW(out0, dst); 314 dst += dst_stride; 315 SW(out1, dst); 316 dst += dst_stride; 317 318 src0 = src2; 319 } 320} 321 322static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride, 323 uint8_t *dst, int32_t dst_stride, 324 uint8_t height) 325{ 326 uint8_t loop_cnt; 327 v16u8 src0, src1, src2, src3, src4; 328 329 src0 = LD_UB(src); 330 src += src_stride; 331 332 for (loop_cnt = (height >> 2); loop_cnt--;) { 333 LD_UB4(src, src_stride, src1, src2, src3, src4); 334 src += (4 * src_stride); 335 336 AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 337 dst, dst_stride); 338 dst += (4 * dst_stride); 339 340 src0 = src4; 341 } 342} 343 344static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride, 345 uint8_t *dst, int32_t dst_stride, 346 uint8_t height) 347{ 348 uint8_t loop_cnt; 349 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 350 351 src0 = LD_UB(src); 352 src += src_stride; 353 354 for (loop_cnt = (height >> 3); loop_cnt--;) { 355 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); 356 src += (8 * src_stride); 357 358 AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 359 dst, dst_stride); 360 dst += (4 * dst_stride); 361 AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, 362 dst, dst_stride); 363 dst += (4 * dst_stride); 364 365 src0 = src8; 366 } 367} 368 369static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t 
src_stride, 370 uint8_t *dst, int32_t dst_stride) 371{ 372 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 373 374 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 375 src += (8 * src_stride); 376 src8 = LD_UB(src); 377 378 AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 379 dst, dst_stride); 380 dst += (4 * dst_stride); 381 382 AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, 383 dst, dst_stride); 384} 385 386static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, 387 uint8_t *dst, int32_t dst_stride) 388{ 389 v16u8 src0, src1, src2, src3, src4; 390 391 LD_UB5(src, src_stride, src0, src1, src2, src3, src4); 392 AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 393 dst, dst_stride); 394} 395 396static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src, 397 int32_t src_stride, 398 uint8_t *dst, int32_t dst_stride) 399{ 400 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 401 v16u8 src9, src10, src11, src12, src13, src14, src15, src16; 402 403 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 404 src += (8 * src_stride); 405 LD_UB8(src, src_stride, 406 src8, src9, src10, src11, src12, src13, src14, src15); 407 src += (8 * src_stride); 408 src16 = LD_UB(src); 409 410 AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 411 dst, dst_stride); 412 dst += (4 * dst_stride); 413 AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, 414 dst, dst_stride); 415 dst += (4 * dst_stride); 416 AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12, 417 dst, dst_stride); 418 dst += (4 * dst_stride); 419 AVE_ST16x4_UB(src12, src13, src13, src14, 420 src14, src15, src15, src16, dst, dst_stride); 421} 422 423static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src, 424 int32_t src_stride, 425 uint8_t *dst, int32_t dst_stride) 426{ 427 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 428 429 LD_UB8(src, 
src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 430 src += (8 * src_stride); 431 src8 = LD_UB(src); 432 433 AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 434 dst, dst_stride); 435 dst += (4 * dst_stride); 436 AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8, 437 dst, dst_stride); 438} 439 440static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src, 441 int32_t src_stride, 442 uint8_t *dst, int32_t dst_stride, 443 uint8_t height) 444{ 445 uint8_t loop_cnt; 446 uint32_t out0, out1, dst0, dst1; 447 v16u8 src0, src1, src2; 448 v16u8 tmp0 = { 0 }; 449 v16u8 tmp1 = { 0 }; 450 v16u8 res0, res1; 451 452 src0 = LD_UB(src); 453 src += src_stride; 454 455 for (loop_cnt = (height >> 1); loop_cnt--;) { 456 LD_UB2(src, src_stride, src1, src2); 457 src += (2 * src_stride); 458 dst0 = LW(dst); 459 dst1 = LW(dst + dst_stride); 460 tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0); 461 tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1); 462 AVER_UB2_UB(src0, src1, src1, src2, res0, res1); 463 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 464 out0 = __msa_copy_u_w((v4i32) res0, 0); 465 out1 = __msa_copy_u_w((v4i32) res1, 0); 466 SW(out0, dst); 467 dst += dst_stride; 468 SW(out1, dst); 469 dst += dst_stride; 470 src0 = src2; 471 } 472} 473 474static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src, 475 int32_t src_stride, 476 uint8_t *dst, int32_t dst_stride, 477 uint8_t height) 478{ 479 uint8_t loop_cnt; 480 v16u8 src0, src1, src2, src3, src4; 481 482 src0 = LD_UB(src); 483 src += src_stride; 484 485 for (loop_cnt = (height >> 2); loop_cnt--;) { 486 LD_UB4(src, src_stride, src1, src2, src3, src4); 487 src += (4 * src_stride); 488 489 AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 490 dst, dst_stride); 491 dst += (4 * dst_stride); 492 src0 = src4; 493 } 494} 495 496static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src, 497 int32_t src_stride, 498 uint8_t *dst, int32_t dst_stride, 499 
uint8_t height) 500{ 501 uint8_t loop_cnt; 502 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 503 v16u8 res0, res1, res2, res3, res4, res5, res6, res7; 504 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 505 506 src0 = LD_UB(src); 507 src += src_stride; 508 509 for (loop_cnt = (height >> 3); loop_cnt--;) { 510 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); 511 src += (8 * src_stride); 512 AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4, 513 res0, res1, res2, res3); 514 AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8, 515 res4, res5, res6, res7); 516 517 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 518 AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3, 519 res0, res1, res2, res3); 520 AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7, 521 res4, res5, res6, res7); 522 ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride); 523 dst += (8 * dst_stride); 524 525 src0 = src8; 526 } 527} 528 529static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride, 530 uint8_t *dst, int32_t dst_stride, 531 uint8_t height) 532{ 533 uint8_t loop_cnt; 534 uint32_t res0, res1; 535 v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1; 536 v16u8 src0_r, src1_r, src2_r, res; 537 v8u16 add0, add1, add2, sum0, sum1; 538 v16i8 zeros = { 0 }; 539 540 src0 = LD_SB(src); 541 src += src_stride; 542 543 for (loop_cnt = (height >> 1); loop_cnt--;) { 544 LD_SB2(src, src_stride, src1, src2); 545 src += (2 * src_stride); 546 547 SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1, 548 src1_sld1, src2_sld1); 549 ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, 550 src0_r, src1_r, src2_r); 551 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); 552 ADD2(add0, add1, add1, add2, sum0, sum1); 553 SRARI_H2_UH(sum0, sum1, 2); 554 res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0); 555 res0 = __msa_copy_u_w((v4i32) res, 0); 556 res1 = 
__msa_copy_u_w((v4i32) res, 2); 557 SW(res0, dst); 558 dst += dst_stride; 559 SW(res1, dst); 560 dst += dst_stride; 561 562 src0 = src2; 563 } 564} 565 566static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride, 567 uint8_t *dst, int32_t dst_stride, 568 uint8_t height) 569{ 570 uint8_t loop_cnt; 571 v16i8 src0, src1, src2, src3, src4; 572 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1; 573 v16u8 src0_r, src1_r, src2_r, src3_r, src4_r; 574 v8u16 add0, add1, add2, add3, add4; 575 v8u16 sum0, sum1, sum2, sum3; 576 v16i8 zeros = { 0 }; 577 578 src0 = LD_SB(src); 579 src += src_stride; 580 581 for (loop_cnt = (height >> 2); loop_cnt--;) { 582 LD_SB4(src, src_stride, src1, src2, src3, src4); 583 src += (4 * src_stride); 584 585 SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1, 586 src1_sld1, src2_sld1); 587 SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1); 588 ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, 589 src1_r, src2_r); 590 ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); 591 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); 592 HADD_UB2_UH(src3_r, src4_r, add3, add4); 593 ADD4(add0, add1, add1, add2, add2, add3, add3, add4, 594 sum0, sum1, sum2, sum3); 595 SRARI_H4_UH(sum0, sum1, sum2, sum3, 2); 596 PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1); 597 ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride); 598 dst += (4 * dst_stride); 599 src0 = src4; 600 } 601} 602 603static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride, 604 uint8_t *dst, int32_t dst_stride, 605 uint8_t height) 606{ 607 uint8_t loop_cnt; 608 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 609 v16u8 src10, src11, src12, src13, src14, src15, src16, src17; 610 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 611 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l; 612 v8u16 src7_l, src8_l; 613 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, 
sum4_r, sum5_r, sum6_r, sum7_r; 614 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l; 615 616 for (loop_cnt = (height >> 3); loop_cnt--;) { 617 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 618 LD_UB8((src + 1), src_stride, 619 src9, src10, src11, src12, src13, src14, src15, src16); 620 src += (8 * src_stride); 621 622 src8 = LD_UB(src); 623 src17 = LD_UB(src + 1); 624 625 ILVRL_B2_UH(src9, src0, src0_r, src0_l); 626 ILVRL_B2_UH(src10, src1, src1_r, src1_l); 627 ILVRL_B2_UH(src11, src2, src2_r, src2_l); 628 ILVRL_B2_UH(src12, src3, src3_r, src3_l); 629 ILVRL_B2_UH(src13, src4, src4_r, src4_l); 630 ILVRL_B2_UH(src14, src5, src5_r, src5_l); 631 ILVRL_B2_UH(src15, src6, src6_r, src6_l); 632 ILVRL_B2_UH(src16, src7, src7_r, src7_l); 633 ILVRL_B2_UH(src17, src8, src8_r, src8_l); 634 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); 635 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); 636 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); 637 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); 638 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); 639 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); 640 ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r, 641 sum0_r, sum1_r, sum2_r, sum3_r); 642 ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r, 643 sum4_r, sum5_r, sum6_r, sum7_r); 644 ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l, 645 sum0_l, sum1_l, sum2_l, sum3_l); 646 ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l, 647 sum4_l, sum5_l, sum6_l, sum7_l); 648 SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2); 649 SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2); 650 SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2); 651 SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2); 652 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r, 653 sum3_l, sum3_r, dst, dst_stride); 654 dst += (4 * 
dst_stride); 655 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r, 656 sum7_l, sum7_r, dst, dst_stride); 657 dst += (4 * dst_stride); 658 } 659} 660 661static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, 662 uint8_t *dst, int32_t dst_stride) 663{ 664 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 665 v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1; 666 v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1; 667 v8u16 src0_r, src1_r, src2_r, src3_r; 668 v8u16 src4_r, src5_r, src6_r, src7_r, src8_r; 669 v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8; 670 v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 671 v16i8 out0, out1; 672 v16i8 zeros = { 0 }; 673 674 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 675 src += (8 * src_stride); 676 src8 = LD_UB(src); 677 678 SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1, 679 src0_sld1, src1_sld1, src2_sld1, src3_sld1); 680 SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1, 681 src5_sld1, src6_sld1); 682 SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1); 683 ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1, 684 src3, src0_r, src1_r, src2_r, src3_r); 685 ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r, 686 src5_r, src6_r); 687 ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r); 688 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); 689 HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5); 690 HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8); 691 692 sum0 = add0 + add1 + 1; 693 sum1 = add1 + add2 + 1; 694 sum2 = add2 + add3 + 1; 695 sum3 = add3 + add4 + 1; 696 sum4 = add4 + add5 + 1; 697 sum5 = add5 + add6 + 1; 698 sum6 = add6 + add7 + 1; 699 sum7 = add7 + add8 + 1; 700 701 SRA_4V(sum0, sum1, sum2, sum3, 2); 702 SRA_4V(sum4, sum5, sum6, sum7, 2); 703 PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1); 704 ST_D4(out0, out1, 0, 1, 
0, 1, dst, dst_stride); 705 PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1); 706 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 707} 708 709static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, 710 uint8_t *dst, int32_t dst_stride) 711{ 712 v16i8 src0, src1, src2, src3, src4; 713 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1; 714 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r; 715 v8u16 add0, add1, add2, add3, add4; 716 v8u16 sum0, sum1, sum2, sum3; 717 v16i8 out0, out1; 718 v16i8 zeros = { 0 }; 719 720 LD_SB4(src, src_stride, src0, src1, src2, src3); 721 src += (4 * src_stride); 722 src4 = LD_SB(src); 723 724 SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1, 725 src1_sld1, src2_sld1); 726 SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1); 727 ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, 728 src1_r, src2_r); 729 ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); 730 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); 731 HADD_UB2_UH(src3_r, src4_r, add3, add4); 732 733 sum0 = add0 + add1 + 1; 734 sum1 = add1 + add2 + 1; 735 sum2 = add2 + add3 + 1; 736 sum3 = add3 + add4 + 1; 737 738 SRA_4V(sum0, sum1, sum2, sum3, 2); 739 PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1); 740 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 741} 742 743static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src, 744 int32_t src_stride, 745 uint8_t *dst, int32_t dst_stride) 746{ 747 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 748 v16u8 src10, src11, src12, src13, src14, src15, src16, src17; 749 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 750 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l; 751 v8u16 src7_l, src8_l; 752 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r; 753 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l; 754 755 LD_UB8(src, src_stride, src0, 
src1, src2, src3, src4, src5, src6, src7); 756 LD_UB8((src + 1), src_stride, 757 src9, src10, src11, src12, src13, src14, src15, src16); 758 src += (8 * src_stride); 759 src8 = LD_UB(src); 760 src17 = LD_UB(src + 1); 761 762 ILVRL_B2_UH(src9, src0, src0_r, src0_l); 763 ILVRL_B2_UH(src10, src1, src1_r, src1_l); 764 ILVRL_B2_UH(src11, src2, src2_r, src2_l); 765 ILVRL_B2_UH(src12, src3, src3_r, src3_l); 766 ILVRL_B2_UH(src13, src4, src4_r, src4_l); 767 ILVRL_B2_UH(src14, src5, src5_r, src5_l); 768 ILVRL_B2_UH(src15, src6, src6_r, src6_l); 769 ILVRL_B2_UH(src16, src7, src7_r, src7_l); 770 ILVRL_B2_UH(src17, src8, src8_r, src8_l); 771 772 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); 773 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); 774 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); 775 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); 776 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); 777 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); 778 779 sum0_r = src0_r + src1_r + 1; 780 sum1_r = src1_r + src2_r + 1; 781 sum2_r = src2_r + src3_r + 1; 782 sum3_r = src3_r + src4_r + 1; 783 sum4_r = src4_r + src5_r + 1; 784 sum5_r = src5_r + src6_r + 1; 785 sum6_r = src6_r + src7_r + 1; 786 sum7_r = src7_r + src8_r + 1; 787 sum0_l = src0_l + src1_l + 1; 788 sum1_l = src1_l + src2_l + 1; 789 sum2_l = src2_l + src3_l + 1; 790 sum3_l = src3_l + src4_l + 1; 791 sum4_l = src4_l + src5_l + 1; 792 sum5_l = src5_l + src6_l + 1; 793 sum6_l = src6_l + src7_l + 1; 794 sum7_l = src7_l + src8_l + 1; 795 796 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2); 797 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2); 798 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2); 799 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2); 800 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, 801 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride); 802 dst += (4 * dst_stride); 803 804 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 805 LD_UB8((src + 
1), src_stride, 806 src9, src10, src11, src12, src13, src14, src15, src16); 807 src += (8 * src_stride); 808 src8 = LD_UB(src); 809 src17 = LD_UB(src + 1); 810 811 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, 812 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride); 813 dst += (4 * dst_stride); 814 815 ILVRL_B2_UH(src9, src0, src0_r, src0_l); 816 ILVRL_B2_UH(src10, src1, src1_r, src1_l); 817 ILVRL_B2_UH(src11, src2, src2_r, src2_l); 818 ILVRL_B2_UH(src12, src3, src3_r, src3_l); 819 ILVRL_B2_UH(src13, src4, src4_r, src4_l); 820 ILVRL_B2_UH(src14, src5, src5_r, src5_l); 821 ILVRL_B2_UH(src15, src6, src6_r, src6_l); 822 ILVRL_B2_UH(src16, src7, src7_r, src7_l); 823 ILVRL_B2_UH(src17, src8, src8_r, src8_l); 824 825 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); 826 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); 827 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); 828 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); 829 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); 830 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); 831 832 sum0_r = src0_r + src1_r + 1; 833 sum1_r = src1_r + src2_r + 1; 834 sum2_r = src2_r + src3_r + 1; 835 sum3_r = src3_r + src4_r + 1; 836 sum4_r = src4_r + src5_r + 1; 837 sum5_r = src5_r + src6_r + 1; 838 sum6_r = src6_r + src7_r + 1; 839 sum7_r = src7_r + src8_r + 1; 840 sum0_l = src0_l + src1_l + 1; 841 sum1_l = src1_l + src2_l + 1; 842 sum2_l = src2_l + src3_l + 1; 843 sum3_l = src3_l + src4_l + 1; 844 sum4_l = src4_l + src5_l + 1; 845 sum5_l = src5_l + src6_l + 1; 846 sum6_l = src6_l + src7_l + 1; 847 sum7_l = src7_l + src8_l + 1; 848 849 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2); 850 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2); 851 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2); 852 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2); 853 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, 854 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride); 855 dst += (4 * dst_stride); 856 PCKEV_ST_SB4(sum4_l, 
sum4_r, sum5_l, sum5_r, 857 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride); 858} 859 860static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src, 861 int32_t src_stride, 862 uint8_t *dst, int32_t dst_stride) 863{ 864 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 865 v16u8 src10, src11, src12, src13, src14, src15, src16, src17; 866 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 867 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l; 868 v8u16 src7_l, src8_l; 869 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r; 870 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l; 871 872 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 873 LD_UB8((src + 1), src_stride, 874 src9, src10, src11, src12, src13, src14, src15, src16); 875 src += (8 * src_stride); 876 src8 = LD_UB(src); 877 src17 = LD_UB(src + 1); 878 879 ILVRL_B2_UH(src9, src0, src0_r, src0_l); 880 ILVRL_B2_UH(src10, src1, src1_r, src1_l); 881 ILVRL_B2_UH(src11, src2, src2_r, src2_l); 882 ILVRL_B2_UH(src12, src3, src3_r, src3_l); 883 ILVRL_B2_UH(src13, src4, src4_r, src4_l); 884 ILVRL_B2_UH(src14, src5, src5_r, src5_l); 885 ILVRL_B2_UH(src15, src6, src6_r, src6_l); 886 ILVRL_B2_UH(src16, src7, src7_r, src7_l); 887 ILVRL_B2_UH(src17, src8, src8_r, src8_l); 888 889 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); 890 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r); 891 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r); 892 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l); 893 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l); 894 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l); 895 896 sum0_r = src0_r + src1_r + 1; 897 sum1_r = src1_r + src2_r + 1; 898 sum2_r = src2_r + src3_r + 1; 899 sum3_r = src3_r + src4_r + 1; 900 sum4_r = src4_r + src5_r + 1; 901 sum5_r = src5_r + src6_r + 1; 902 sum6_r = src6_r + src7_r + 1; 903 sum7_r = 
src7_r + src8_r + 1; 904 sum0_l = src0_l + src1_l + 1; 905 sum1_l = src1_l + src2_l + 1; 906 sum2_l = src2_l + src3_l + 1; 907 sum3_l = src3_l + src4_l + 1; 908 sum4_l = src4_l + src5_l + 1; 909 sum5_l = src5_l + src6_l + 1; 910 sum6_l = src6_l + src7_l + 1; 911 sum7_l = src7_l + src8_l + 1; 912 913 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2); 914 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2); 915 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2); 916 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2); 917 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, 918 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride); 919 dst += (4 * dst_stride); 920 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, 921 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride); 922} 923 924static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src, 925 int32_t src_stride, 926 uint8_t *dst, int32_t dst_stride, 927 uint8_t height) 928{ 929 uint8_t loop_cnt; 930 uint32_t out0, out1; 931 v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1; 932 v16u8 src0_r, src1_r, src2_r; 933 v8u16 add0, add1, add2, sum0, sum1; 934 v16u8 dst0, dst1, res0, res1; 935 v16i8 zeros = { 0 }; 936 937 src0 = LD_SB(src); 938 src += src_stride; 939 940 for (loop_cnt = (height >> 1); loop_cnt--;) { 941 LD_SB2(src, src_stride, src1, src2); 942 src += (2 * src_stride); 943 944 LD_UB2(dst, dst_stride, dst0, dst1); 945 SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1, 946 src1_sld1, src2_sld1); 947 ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, 948 src1_r, src2_r); 949 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); 950 ADD2(add0, add1, add1, add2, sum0, sum1); 951 SRARI_H2_UH(sum0, sum1, 2); 952 PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1); 953 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); 954 955 out0 = __msa_copy_u_w((v4i32) res0, 0); 956 out1 = __msa_copy_u_w((v4i32) res1, 0); 957 SW(out0, dst); 958 dst += dst_stride; 959 SW(out1, dst); 960 dst += dst_stride; 961 962 src0 = src2; 963 } 964} 965 966static 
void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       uint8_t height)
{
    /* 8-column "xy2" half-pel with destination averaging:
       dst = avg(dst, (4-tap neighbourhood sum + 2) >> 2).
       Four output rows per iteration; height must be a multiple of 4. */
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 zeros = { 0 };

    /* Prime with the first row; src0 is carried over between iterations. */
    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* Byte-shifted copies give each row's right-hand neighbours. */
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        /* Interleave + horizontal add: addN = p(x) + p(x+1) for row N. */
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        /* Vertical pair sums, rounding shift, pack, then average with dst. */
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
                           sum2, dst2, sum3, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
        src0 = src4;
    }
}

/* 16-column "xy2" half-pel with destination averaging; eight output rows
 * per iteration (height must be a multiple of 8).  The right neighbours
 * are fetched by re-loading the rows at (src + 1) rather than shifting,
 * since a 16-wide result needs the 17th source byte.
 */
static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, src12, src13, src14, src15, src16, src17;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v16u8 src7_l, src8_l;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 sum0_r,
sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* Rows 0-7 and their right neighbours (same rows at src + 1). */
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        /* Row 8 (the first row of the next iteration's block) is needed
           for the vertical filter of output row 7. */
        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        /* Interleave each row with its right-neighbour row; _r holds the
           low 8 columns, _l the high 8 columns. */
        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
        /* Horizontal pair sums per row, then vertical sums of adjacent
           rows: sumN = addN + addN+1 (a 4-tap neighbourhood sum). */
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
             sum2_r, sum3_r);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
             sum6_r, sum7_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
             sum2_l, sum3_l);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
             sum6_l, sum7_l);
        /* Rounding shift by 2, then pack to bytes and average with dst. */
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
        dst += dst_stride;
    }
}

/* Straight 8-byte-wide copy.  Branches on the largest unrolling factor
 * the height allows (12, 8, 4, then 2 rows per iteration); heights that
 * are not a multiple of 2 copy nothing. */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            /* 8 rows ... */
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            /* ... plus 4 more rows = 12 per iteration. */
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        /* 8 rows per iteration. */
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        /* 4 rows per iteration. */
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        /* 2 rows per iteration. */
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}

/* Copy a (width x height) region one 16-column stripe at a time.
 * width must be a multiple of 16 and height a multiple of 8. */
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width)
{
    int32_t cnt, loop_cnt;
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        /* 8 rows of the current stripe per iteration. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_UB8(src_tmp, src_stride,
                   src0, src1, src2, src3, src4, src5, src6,
src7);
            src_tmp += (8 * src_stride);

            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst_tmp, dst_stride);
            dst_tmp += (8 * dst_stride);
        }

        /* Advance to the next 16-column stripe. */
        src += 16;
        dst += 16;
    }
}

/* Straight 16-byte-wide copy.  Branches on the largest unrolling factor
 * the height allows (12, 8, then 4 rows per iteration); heights that are
 * not a multiple of 4 copy nothing. */
static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            /* 8 + 4 rows = 12 per iteration. */
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst, dst_stride);
            dst += (8 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

/* dst = avg(dst, src), rounded, for a 4-column block.  Processes 4 rows
 * per iteration when height allows, otherwise 2; heights that are not a
 * multiple of 2 store nothing. */
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);

            /* Only the low 4 bytes of each vector are meaningful. */
            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 =
__msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            LD_UB2(dst, dst_stride, dst0, dst1);

            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            SW(out0, dst);
            dst += dst_stride;
            SW(out1, dst);
            dst += dst_stride;
        }
    }
}

/* dst = avg(dst, src), rounded, for an 8-column block; four rows per
 * iteration (height must be a multiple of 4). */
static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        /* Only the low 8 bytes of each vector are meaningful. */
        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

/* dst = avg(dst, src), rounded, for a 16-column block; eight rows per
 * iteration (height must be a multiple of 8). */
static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

        AVER_UB4_UB(src0, dst0,
src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

/* Public hpeldsp entry points: thin wrappers mapping FFmpeg's
 * (block, pixels, line_size, h) convention onto the static MSA helpers
 * above (src = pixels, dst = block, one stride for both). */
void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    copy_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    copy_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void
ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
}

/* The no-rnd variants only exist for fixed block sizes, so dispatch on h;
 * other heights are not expected here and fall through doing nothing. */
void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
                                    const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}
void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    /* Like the other no-rnd wrappers: fixed-size dispatch on h. */
    if (h == 8) {
        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

/* avg variants: interpolate (or copy), then average with the bytes
 * already present in block. */
void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    avg_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width4_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}