/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264chroma_mips.h"

/* Shuffle control masks for the horizontal chroma filters: each pair of
 * entries selects two horizontally adjacent source bytes to be weighted;
 * index values of 16 and above pick bytes from the other source vector of
 * the shuffle. The different 16-byte rows serve the 2-, 4- and 8-pixel
 * wide cases. */
static const uint8_t chroma_mask_arr[16 * 5] = {
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};

static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    /* the two filter taps sum to 8, so "<< 3" followed by the rounding
     * shift by 6 below equals the H.264 chroma rounding (val + 4) >> 3 */
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 src0, src1;
    v8u16 res_r;
    v4i32 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out;
    v8u16 res0_r, res1_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
                                      int32_t stride, uint32_t coeff0,
                                      uint32_t coeff1, int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    for (row = height >> 2; row--;) {
        LD_UB4(src, stride, src0, src1, src2, src3);
        src += (4 * stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }

    if (0 != (height % 4)) {
        for (row = (height % 4); row--;) {
            src0 = LD_UB(src);
            src += stride;

            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);

            res0 = __msa_dotp_u_h(src0, coeff_vec);
            res0 <<= 3;
            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
            res0 = __msa_sat_u_h(res0, 7);
            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);

            ST_D1(res0, 0, dst);
            dst += stride;
        }
    }
}

static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
    } else {
        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
    }
}
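
/* Vertical (vt) chroma filters: the same 2-tap weighting is applied between
 * vertically adjacent rows, which are interleaved with ILVR_B before the
 * dot product with the coefficient vector. */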
static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);

    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v4i32 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 out;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
               src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

/* Combined horizontal and vertical (hv) chroma filters: a horizontal 2-tap
 * pass followed by a vertical 2-tap pass on the horizontal results. */
static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res_vert;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    out0 = __msa_copy_u_h(res_vert, 0);
    out1 = __msa_copy_u_h(res_vert, 1);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 mask;
    v4i32 res;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);
    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
    v4i32 res0, res1;

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
    ST_W2(res0, 0, 1, dst, stride);
    ST_W2(res1, 0, 1, dst + 2 * stride, stride);
}

static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += stride;

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);

    LD_UB4(src, stride, src1, src2, src3, src4);
    src += (4 * stride);

    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);

    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);

    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}
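
/* The *_and_aver_dst_* functions below apply the same chroma interpolation
 * as the routines above and then average the result with the pixels already
 * stored at dst (rounded halving add), i.e. they implement the "avg"
 * flavour of the chroma motion compensation. */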
static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b(res, dst_data);

    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
    out1 = __msa_copy_u_h((v8i16) dst_data, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst_data = { 0 };
    v8u16 res_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst0 = __msa_aver_u_b(dst0, dst_data);

    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16i8 res, mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);
    LW2(dst, stride, load0, load1);

    INSERT_W2_UB(load0, load1, dst_data);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b((v16u8) res, dst_data);

    ST_W2(dst_data, 0, 1, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3;
    v16u8 out, dst_data = { 0 };
    v16i8 mask;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst_data);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2, tmp0, tmp1, res;
    v16u8 dst_data = { 0 };
    v8i16 out;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);
    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
    out0 = __msa_copy_u_h(out, 0);
    out1 = __msa_copy_u_h(out, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_r;
    v8i16 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data = { 0 };

    LD_SB5(src, stride, src0, src1, src2, src3, src4);

    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16u8 src0, src1, src2, tmp0, tmp1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);

    LW2(dst, stride, load0, load1);

    INSERT_W2_UB(load0, load1, dst_data);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = __msa_aver_u_b(res, dst_data);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v8u16 res0_r, res1_r;
    v16u8 out;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst0);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out0, out1;
    v8u16 res0, res1, res2, res3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
               src4, src5, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
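
/* 2-D (horizontal plus vertical) averaging variants: each row is first
 * filtered horizontally with the 2-tap coef_hor filter, then two consecutive
 * rows of horizontal results are blended with coef_ver0/coef_ver1. The
 * combined weights sum to 64, hence the rounding shift by 6 before the
 * average with dst. */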
static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 dst0 = { 0 };
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, stride, src0, src1, src2);
    out0 = LH(dst);
    out1 = LH(dst + stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    out0 = __msa_copy_u_h((v8i16) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 1);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1;
    v16u8 src0, src1, src2;
    v16u8 dst0, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB3(src, stride, src0, src1, src2);
    LW2(dst, stride, tp0, tp1);
    INSERT_W2_UB(tp0, tp1, dst_data);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b(dst0, dst_data);

    ST_W2(dst0, 0, 1, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
    out = __msa_aver_u_b(out, dst_data);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2;
    v8u16 res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += stride;
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    LD_UB4(src, stride, src1, src2, src3, src4);
    src += (4 * stride);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

/* Plain copy of a 4-byte-wide block, used when both fractional offsets are
 * zero. */
static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;

    if (8 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        LW4(src, stride, tp4, tp5, tp6, tp7);
        SW4(tp0, tp1, tp2, tp3, dst, stride);
        dst += 4 * stride;
        SW4(tp4, tp5, tp6, tp7, dst, stride);
    } else if (4 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        SW4(tp0, tp1, tp2, tp3, dst, stride);
    } else if (2 == height) {
        LW2(src, stride, tp0, tp1);
        SW(tp0, dst);
        dst += stride;
        SW(tp1, dst);
    }
}

/* Plain copy of an 8-byte-wide block, used when both fractional offsets are
 * zero. */
static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    if (8 == height) {
        LD4(src, stride, src0, src1, src2, src3);
        src += 4 * stride;
        LD4(src, stride, src4, src5, src6, src7);
        SD4(src0, src1, src2, src3, dst, stride);
        dst += 4 * stride;
        SD4(src4, src5, src6, src7, dst, stride);
    } else if (4 == height) {
        LD4(src, stride, src0, src1, src2, src3);
        SD4(src0, src1, src2, src3, dst, stride);
    }
}

/* Rounded average of a 4-byte-wide src block with the data already in dst. */
static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };

    if (8 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
        LW4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
        LW4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
    } else if (4 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
        LW4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        dst0 = __msa_aver_u_b(src0, dst0);
        ST_W4(dst0, 0, 1, 2, 3, dst, stride);
    } else if (2 == height) {
        LW2(src, stride, tp0, tp1);
        INSERT_W2_UB(tp0, tp1, src0);
        LW2(dst, stride, tp0, tp1);
        INSERT_W2_UB(tp0, tp1, dst0);
        dst0 = __msa_aver_u_b(src0, dst0);
        ST_W2(dst0, 0, 1, dst, stride);
    }
}

/* Rounded average of an 8-byte-wide src block with the data already in dst. */
static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };

    if (8 == height) {
        LD4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        LD4(src, stride, tp4, tp5, tp6, tp7);
        INSERT_D2_UB(tp0, tp1, src0);
        INSERT_D2_UB(tp2, tp3, src1);
        INSERT_D2_UB(tp4, tp5, src2);
        INSERT_D2_UB(tp6, tp7, src3);
        LD4(dst, stride, tp0, tp1, tp2, tp3);
        LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        INSERT_D2_UB(tp4, tp5, dst2);
        INSERT_D2_UB(tp6, tp7, dst3);
        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                    dst2, dst3);
        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
    } else if (4 == height) {
        LD4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, src0);
        INSERT_D2_UB(tp2, tp3, src1);
        LD4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    }
}

/* Public entry points: x and y are the fractional chroma motion vector
 * components (0..7), so (8 - x) and (8 - y) are the complementary bilinear
 * weights. Zero offsets fall back to the cheaper one-dimensional or copy
 * paths. */
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width8_msa(src, dst, stride, height);
    }
}

void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width4_msa(src, dst, stride, height);
    }
}

void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            *((uint16_t *) dst) = *((uint16_t *) src);

            src += stride;
            dst += stride;
        }
    }
}

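/* The ff_avg_* entry points compute the same bilinear chroma prediction as the
 * ff_put_* functions above and then average it with the bytes already in dst,
 * rounding up: dst[i] = (dst[i] + pred[i] + 1) >> 1. */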
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        avg_width8_msa(src, dst, stride, height);
    }
}

void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        avg_width4_msa(src, dst, stride, height);
    }
}

void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            dst[0] = (dst[0] + src[0] + 1) >> 1;
            dst[1] = (dst[1] + src[1] + 1) >> 1;

            src += stride;
            dst += stride;
        }
    }
}