/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

/* Interleave each filtered halfword (in*) with the matching 16-bit
 * intermediate from the other reference (vec*), so that a single
 * dpadd.w per lane computes vec * weight0 + in * weight1 + offset;
 * then round-shift, pack back to halfwords and clip to 0..255. */
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,          \
                           out0, out1)                                      \
{                                                                           \
    v4i32 out0_r, out1_r, out0_l, out1_l;                                   \
                                                                            \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                       \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                       \
                                                                            \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);          \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);          \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);          \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);          \
                                                                            \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                        \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);                \
    CLIP_SH2_0_255(out0, out1);                                             \
}

#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,      \
                           wgt, rnd, offset, out0, out1, out2, out3)        \
{                                                                           \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
}

#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,         \
                                    offset, out0, out1)                     \
{                                                                           \
    v4i32 out0_r, out1_r, out0_l, out1_l;                                   \
                                                                            \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                       \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                       \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);          \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);          \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);          \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);          \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                        \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);                \
    CLIP_SH2_0_255(out0, out1);                                             \
}

#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,   \
                                    vec3, wgt, rnd, offset, out0, out1,     \
                                    out2, out3)                             \
{                                                                           \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,     \
                                out0, out1);                                \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,     \
                                out2, out3);                                \
}

static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
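    /* Bi-weighted "copy" path: the 8-bit src0 pixels are only widened and
     * shifted left by 6 to match the 14-bit intermediates stored at
     * src1_ptr, then both are weighted, rounded and clipped.  Heights 2
     * and 4 are special-cased; taller blocks run in steps of 8 rows. */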
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    int32_t offset, weight;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, weight_vec;
    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        CLIP_SH_0_255(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                                    offset_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
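    /* Pack the two weights into alternating 16-bit lanes of one vector:
     * with the (intermediate, pixel) halfword pairs produced by the
     * ILVR/ILVL interleaves, a single __msa_dpadd_s_w then yields
     * weight0 * intermediate + weight1 * pixel + offset per 32-bit lane. */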
    weight = weight0 | (weight1 << 16);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
                                    in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec,
                                    dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint64_t tp0, tp1, tp2, tp3;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 4) {
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += (4 * src_stride);
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (16 >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);

        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
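        /* Interleaving with a zero vector widens the unsigned bytes to
         * halfwords; the left shift by 6 then puts them on the same
         * 14-bit scale as the precomputed intermediates in in0..in7. */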
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
                                    tmp4, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
                                    tmp6, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
                                    dst6, dst7);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
                                    in11, weight_vec, rnd_vec, offset_vec,
                                    dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
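/* Horizontal 8-tap bi-weighted filters: the src0 pixels are XORed with
 * 128 so the signed-byte dot products can be used; the 128 * weight1 << 6
 * bias that this introduces is folded into the offset vector up front. */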
static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;

    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset = (offset0 + offset1) << rnd_val;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
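    /* Each halfword of filter_vec packs two adjacent 8-bit taps; splatting
     * halfwords 0..3 yields the four tap-pair vectors consumed by the
     * byte-wise dot products inside HEVC_FILT_8TAP_SH. */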
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

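    /* mask0 pairs bytes (0,1), (1,2), ... for taps 0/1; each further mask
     * shifts that window by two bytes to select the next tap pair. */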
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    int32_t offset, weight, constant;
    v16i8 src0, src1;
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;
    XORI_B2_128_SB(src0, src1);

    for (loop_cnt = 31; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

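        /* out0/out1 cover the first 16 columns; the remaining 8 columns
         * (dst2) do not form a pair, so they are weighted explicitly with
         * the same interleave + dpadd + round + clip sequence. */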
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        CLIP_SH_0_255(out2);

        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        ST_SH(out0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
                       out0, out1);
    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
    CLIP_SH_0_255(out2);
    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    ST_SH(out0, dst);
    SD(dst_val0, dst + 16);
    dst += dst_stride;
}

static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
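        /* The last eight output columns need source bytes beyond src1's
         * window, so one extra unaligned vector is loaded at offset 24
         * (src2) to cover them. */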
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);
        LD_SB2(src0_ptr + 32, 8, src3, src4);
        src0_ptr += src_stride;
        XORI_B2_128_SB(src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);

        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
            ST_SH2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
        dst += dst_stride;
    }
}

static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        /* 8-tap vertical filter as one dot product plus three
         * multiply-accumulates over the interleaved row pairs. */
        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, tmp0, tmp1, tmp2, tmp3);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
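/* 12-width vertical filter: the first 8 columns come from the right-half
 * interleaves, while the upper 4 columns of two consecutive rows are
 * packed into single vectors (src2110 etc.) so both rows share one
 * filter pass. */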
static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 out0, out1, out2, filter_vec;
    v4i32 dst2_r, dst2_l;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
        XORI_B2_128_SB(src7, src8);

        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);

        DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2);
        DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
        DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
        DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);

        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        CLIP_SH_0_255(out2);
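        /* Pack and store 8 + 4 columns per row. */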
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        ST_D2(out0, 0, 1, dst, dst_stride);
        ST_W2(out2, 0, 1, dst + 8, dst_stride);
        dst += (2 * dst_stride);

        src10_r = src32_r;
        src32_r = src54_r;
        src54_r = src76_r;
        src21_r = src43_r;
        src43_r = src65_r;
        src65_r = src87_r;
        src2110 = src4332;
        src4332 = src6554;
        src6554 = src8776;
        src6 = src8;
    }
}

static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                              int32_t src_stride,
                                              int16_t *src1_ptr,
                                              int32_t src2_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height,
                                              int32_t weight0,
                                              int32_t weight1,
                                              int32_t offset0,
                                              int32_t offset1,
                                              int32_t rnd_val,
                                              int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);

        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);

            XORI_B2_128_SB(src7, src8);
            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
                        filt0, filt0, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
                         filt1, filt1, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
                         filt2, filt2, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
                         filt3, filt3, tmp0, tmp1, tmp2, tmp3);

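            /* tmp0/tmp1 hold the right halves and tmp2/tmp3 the left
             * halves of two rows; after weighting they are re-packed
             * into full 16-column rows. */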
tmp1, tmp2, tmp3, 1784 in0, in1, in2, in3, 1785 weight_vec, rnd_vec, offset_vec, 1786 out0, out1, out2, out3); 1787 1788 PCKEV_B2_SH(out2, out0, out3, out1, out0, out1); 1789 ST_SH2(out0, out1, dst_tmp, dst_stride); 1790 dst_tmp += (2 * dst_stride); 1791 1792 src10_r = src32_r; 1793 src32_r = src54_r; 1794 src54_r = src76_r; 1795 src21_r = src43_r; 1796 src43_r = src65_r; 1797 src65_r = src87_r; 1798 src10_l = src32_l; 1799 src32_l = src54_l; 1800 src54_l = src76_l; 1801 src21_l = src43_l; 1802 src43_l = src65_l; 1803 src65_l = src87_l; 1804 src6 = src8; 1805 } 1806 1807 src0_ptr += 16; 1808 src1_ptr += 16; 1809 dst += 16; 1810 } 1811} 1812 1813static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr, 1814 int32_t src_stride, 1815 int16_t *src1_ptr, 1816 int32_t src2_stride, 1817 uint8_t *dst, 1818 int32_t dst_stride, 1819 const int8_t *filter, 1820 int32_t height, 1821 int32_t weight0, 1822 int32_t weight1, 1823 int32_t offset0, 1824 int32_t offset1, 1825 int32_t rnd_val) 1826{ 1827 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1828 src1_ptr, src2_stride, 1829 dst, dst_stride, filter, height, 1830 weight0, weight1, offset0, offset1, 1831 rnd_val, 16); 1832} 1833 1834static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr, 1835 int32_t src_stride, 1836 int16_t *src1_ptr, 1837 int32_t src2_stride, 1838 uint8_t *dst, 1839 int32_t dst_stride, 1840 const int8_t *filter, 1841 int32_t height, 1842 int32_t weight0, 1843 int32_t weight1, 1844 int32_t offset0, 1845 int32_t offset1, 1846 int32_t rnd_val) 1847{ 1848 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1849 src1_ptr, src2_stride, 1850 dst, dst_stride, filter, height, 1851 weight0, weight1, offset0, offset1, 1852 rnd_val, 16); 1853 hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride, 1854 src1_ptr + 16, src2_stride, 1855 dst + 16, dst_stride, filter, height, 1856 weight0, weight1, offset0, offset1, rnd_val); 1857} 1858 1859static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr, 1860 int32_t src_stride, 1861 int16_t *src1_ptr, 1862 int32_t src2_stride, 1863 uint8_t *dst, 1864 int32_t dst_stride, 1865 const int8_t *filter, 1866 int32_t height, 1867 int32_t weight0, 1868 int32_t weight1, 1869 int32_t offset0, 1870 int32_t offset1, 1871 int32_t rnd_val) 1872{ 1873 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1874 src1_ptr, src2_stride, 1875 dst, dst_stride, filter, height, 1876 weight0, weight1, offset0, offset1, 1877 rnd_val, 32); 1878} 1879 1880static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr, 1881 int32_t src_stride, 1882 int16_t *src1_ptr, 1883 int32_t src2_stride, 1884 uint8_t *dst, 1885 int32_t dst_stride, 1886 const int8_t *filter, 1887 int32_t height, 1888 int32_t weight0, 1889 int32_t weight1, 1890 int32_t offset0, 1891 int32_t offset1, 1892 int32_t rnd_val) 1893{ 1894 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1895 src1_ptr, src2_stride, 1896 dst, dst_stride, filter, height, 1897 weight0, weight1, offset0, offset1, 1898 rnd_val, 48); 1899} 1900 1901static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr, 1902 int32_t src_stride, 1903 int16_t *src1_ptr, 1904 int32_t src2_stride, 1905 uint8_t *dst, 1906 int32_t dst_stride, 1907 const int8_t *filter, 1908 int32_t height, 1909 int32_t weight0, 1910 int32_t weight1, 1911 int32_t offset0, 1912 int32_t offset1, 1913 int32_t rnd_val) 1914{ 1915 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1916 src1_ptr, src2_stride, 1917 dst, dst_stride, filter, height, 1918 weight0, weight1, offset0, offset1, 1919 rnd_val, 64); 1920} 1921 1922static void 
hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, 1923 int32_t src_stride, 1924 int16_t *src1_ptr, 1925 int32_t src2_stride, 1926 uint8_t *dst, 1927 int32_t dst_stride, 1928 const int8_t *filter_x, 1929 const int8_t *filter_y, 1930 int32_t height, 1931 int32_t weight0, 1932 int32_t weight1, 1933 int32_t offset0, 1934 int32_t offset1, 1935 int32_t rnd_val) 1936{ 1937 uint32_t loop_cnt; 1938 uint64_t tp0, tp1; 1939 int32_t offset, weight; 1940 v16u8 out; 1941 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1942 v8i16 in0 = { 0 }, in1 = { 0 }; 1943 v8i16 filt0, filt1, filt2, filt3; 1944 v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1945 v16i8 mask1, mask2, mask3; 1946 v8i16 filter_vec, weight_vec; 1947 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1948 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1949 v8i16 dst30, dst41, dst52, dst63, dst66, dst87; 1950 v8i16 tmp0, tmp1, tmp2, tmp3; 1951 v8i16 dst10, dst32, dst54, dst76; 1952 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98; 1953 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3; 1954 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1955 1956 src0_ptr -= ((3 * src_stride) + 3); 1957 1958 filter_vec = LD_SH(filter_x); 1959 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1960 1961 filter_vec = LD_SH(filter_y); 1962 UNPCK_R_SB_SH(filter_vec, filter_vec); 1963 1964 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1965 1966 mask1 = mask0 + 2; 1967 mask2 = mask0 + 4; 1968 mask3 = mask0 + 6; 1969 1970 offset = (offset0 + offset1) << rnd_val; 1971 weight0 = weight0 & 0x0000FFFF; 1972 weight = weight0 | (weight1 << 16); 1973 1974 const_vec = __msa_fill_w((128 * weight1)); 1975 const_vec <<= 6; 1976 offset_vec = __msa_fill_w(offset); 1977 rnd_vec = __msa_fill_w(rnd_val + 1); 1978 offset_vec += const_vec; 1979 weight_vec = (v8i16) __msa_fill_w(weight); 1980 1981 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1982 src0_ptr += (7 * src_stride); 1983 1984 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1985 1986 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1987 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1988 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1989 vec8, vec9, vec10, vec11); 1990 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1991 vec12, vec13, vec14, vec15); 1992 1993 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1994 filt3); 1995 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1996 filt3); 1997 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1998 filt3); 1999 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 2000 filt3); 2001 2002 ILVRL_H2_SH(dst41, dst30, dst10, dst43); 2003 ILVRL_H2_SH(dst52, dst41, dst21, dst54); 2004 ILVRL_H2_SH(dst63, dst52, dst32, dst65); 2005 2006 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 2007 2008 for (loop_cnt = height >> 2; loop_cnt--;) { 2009 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 2010 src0_ptr += (4 * src_stride); 2011 XORI_B4_128_SB(src7, src8, src9, src10); 2012 2013 LD2(src1_ptr, src2_stride, tp0, tp1); 2014 INSERT_D2_SH(tp0, tp1, in0); 2015 src1_ptr += (2 * src2_stride); 2016 LD2(src1_ptr, src2_stride, tp0, tp1); 2017 INSERT_D2_SH(tp0, tp1, in1); 2018 src1_ptr += (2 * src2_stride); 2019 2020 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3, 2021 vec0, vec1, vec2, vec3); 2022 VSHF_B4_SB(src8, src10, mask0, 
mask1, mask2, mask3, 2023 vec4, vec5, vec6, vec7); 2024 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2025 filt3); 2026 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2027 filt3); 2028 2029 dst76 = __msa_ilvr_h(dst97, dst66); 2030 ILVRL_H2_SH(dst108, dst97, dst87, dst109); 2031 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 2032 dst98 = __msa_ilvr_h(dst66, dst108); 2033 2034 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1, 2035 filt_h2, filt_h3); 2036 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1, 2037 filt_h2, filt_h3); 2038 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1, 2039 filt_h2, filt_h3); 2040 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1, 2041 filt_h2, filt_h3); 2042 SRA_4V(dst0, dst1, dst2, dst3, 6); 2043 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 2044 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2045 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2046 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2047 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2048 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2049 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 2050 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 2051 CLIP_SW4_0_255(dst0, dst1, dst2, dst3); 2052 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 2053 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2054 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2055 dst += (4 * dst_stride); 2056 2057 dst10 = dst54; 2058 dst32 = dst76; 2059 dst54 = dst98; 2060 dst21 = dst65; 2061 dst43 = dst87; 2062 dst65 = dst109; 2063 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 2064 } 2065} 2066 2067static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, 2068 int32_t src_stride, 2069 int16_t *src1_ptr, 2070 int32_t src2_stride, 2071 uint8_t *dst, 2072 int32_t dst_stride, 2073 const int8_t *filter_x, 2074 const int8_t *filter_y, 2075 int32_t height, 2076 int32_t weight0, 2077 int32_t weight1, 2078 int32_t offset0, 2079 int32_t offset1, 2080 int32_t rnd_val, 2081 int32_t width8mult) 2082{ 2083 uint32_t loop_cnt, cnt; 2084 int32_t offset, weight; 2085 uint8_t *src0_ptr_tmp; 2086 int16_t *src1_ptr_tmp; 2087 uint8_t *dst_tmp; 2088 v16u8 out; 2089 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2090 v8i16 in0, in1; 2091 v8i16 filt0, filt1, filt2, filt3; 2092 v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 2093 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2094 v16i8 mask1, mask2, mask3; 2095 v8i16 filter_vec, weight_vec; 2096 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2097 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 2098 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 2099 v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 2100 v8i16 tmp0, tmp1, tmp2, tmp3; 2101 v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 2102 v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 2103 v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 2104 v8i16 dst21_l, dst43_l, dst65_l, dst87_l; 2105 v4i32 offset_vec, rnd_vec, const_vec; 2106 2107 src0_ptr -= ((3 * src_stride) + 3); 2108 2109 offset = (offset0 + offset1) << rnd_val; 2110 weight0 = weight0 & 0x0000FFFF; 2111 weight = weight0 | (weight1 << 16); 2112 2113 const_vec = __msa_fill_w((128 * weight1)); 2114 const_vec <<= 6; 2115 offset_vec = __msa_fill_w(offset); 2116 rnd_vec = __msa_fill_w(rnd_val + 1); 2117 offset_vec += const_vec; 2118 weight_vec = (v8i16) __msa_fill_w(weight); 2119 2120 filter_vec = LD_SH(filter_x); 2121 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, 
filt1, filt2, filt3); 2122 2123 filter_vec = LD_SH(filter_y); 2124 UNPCK_R_SB_SH(filter_vec, filter_vec); 2125 2126 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 2127 2128 mask1 = mask0 + 2; 2129 mask2 = mask0 + 4; 2130 mask3 = mask0 + 6; 2131 2132 for (cnt = width8mult; cnt--;) { 2133 src0_ptr_tmp = src0_ptr; 2134 src1_ptr_tmp = src1_ptr; 2135 dst_tmp = dst; 2136 2137 LD_SB7(src0_ptr_tmp, src_stride, 2138 src0, src1, src2, src3, src4, src5, src6); 2139 src0_ptr_tmp += (7 * src_stride); 2140 2141 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2142 2143 /* row 0 row 1 row 2 row 3 */ 2144 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 2145 vec0, vec1, vec2, vec3); 2146 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 2147 vec4, vec5, vec6, vec7); 2148 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 2149 vec8, vec9, vec10, vec11); 2150 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 2151 vec12, vec13, vec14, vec15); 2152 2153 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2154 filt3); 2155 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2156 filt3); 2157 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2158 filt3); 2159 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 2160 filt2, filt3); 2161 2162 /* row 4 row 5 row 6 */ 2163 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 2164 vec0, vec1, vec2, vec3); 2165 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 2166 vec4, vec5, vec6, vec7); 2167 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 2168 vec8, vec9, vec10, vec11); 2169 2170 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2171 filt3); 2172 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2173 filt3); 2174 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2175 filt3); 2176 2177 for (loop_cnt = height >> 1; loop_cnt--;) { 2178 LD_SB2(src0_ptr_tmp, src_stride, src7, src8); 2179 XORI_B2_128_SB(src7, src8); 2180 src0_ptr_tmp += 2 * src_stride; 2181 2182 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); 2183 src1_ptr_tmp += (2 * src2_stride); 2184 2185 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r, 2186 dst32_r, dst54_r, dst21_r); 2187 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l, 2188 dst32_l, dst54_l, dst21_l); 2189 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); 2190 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); 2191 2192 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 2193 vec0, vec1, vec2, vec3); 2194 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 2195 filt2, filt3); 2196 2197 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 2198 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 2199 filt_h0, filt_h1, filt_h2, filt_h3); 2200 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 2201 filt_h0, filt_h1, filt_h2, filt_h3); 2202 2203 dst0_r >>= 6; 2204 dst0_l >>= 6; 2205 2206 /* row 8 */ 2207 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, 2208 vec0, vec1, vec2, vec3); 2209 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 2210 filt2, filt3); 2211 2212 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 2213 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 2214 filt_h0, filt_h1, filt_h2, filt_h3); 2215 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, 2216 filt_h0, filt_h1, filt_h2, filt_h3); 2217 2218 dst1_r >>= 6; 2219 dst1_l >>= 6; 2220 2221 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, 
tmp3); 2222 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2223 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2224 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2225 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2226 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2227 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 2228 SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec); 2229 CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r); 2230 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); 2231 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2232 ST_D2(out, 0, 1, dst_tmp, dst_stride); 2233 dst_tmp += (2 * dst_stride); 2234 2235 dst0 = dst2; 2236 dst1 = dst3; 2237 dst2 = dst4; 2238 dst3 = dst5; 2239 dst4 = dst6; 2240 dst5 = dst7; 2241 dst6 = dst8; 2242 } 2243 2244 src0_ptr += 8; 2245 src1_ptr += 8; 2246 dst += 8; 2247 } 2248} 2249 2250static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, 2251 int32_t src_stride, 2252 int16_t *src1_ptr, 2253 int32_t src2_stride, 2254 uint8_t *dst, 2255 int32_t dst_stride, 2256 const int8_t *filter_x, 2257 const int8_t *filter_y, 2258 int32_t height, 2259 int32_t weight0, 2260 int32_t weight1, 2261 int32_t offset0, 2262 int32_t offset1, 2263 int32_t rnd_val) 2264{ 2265 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2266 src1_ptr, src2_stride, 2267 dst, dst_stride, filter_x, filter_y, 2268 height, weight0, weight1, offset0, 2269 offset1, rnd_val, 1); 2270} 2271 2272static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, 2273 int32_t src_stride, 2274 int16_t *src1_ptr, 2275 int32_t src2_stride, 2276 uint8_t *dst, 2277 int32_t dst_stride, 2278 const int8_t *filter_x, 2279 const int8_t *filter_y, 2280 int32_t height, 2281 int32_t weight0, 2282 int32_t weight1, 2283 int32_t offset0, 2284 int32_t offset1, 2285 int32_t rnd_val) 2286{ 2287 uint32_t loop_cnt; 2288 uint8_t *src0_ptr_tmp, *dst_tmp; 2289 int16_t *src1_ptr_tmp; 2290 int32_t offset, weight; 2291 uint64_t tp0, tp1; 2292 v16u8 out; 2293 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2294 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2295 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 2296 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 2297 v8i16 in0 = { 0 }, in1 = { 0 }; 2298 v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3; 2299 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 2300 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8; 2301 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r; 2302 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l; 2303 v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76; 2304 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l; 2305 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3; 2306 2307 src0_ptr -= ((3 * src_stride) + 3); 2308 2309 offset = (offset0 + offset1) << rnd_val; 2310 weight0 = weight0 & 0x0000FFFF; 2311 weight = weight0 | (weight1 << 16); 2312 2313 const_vec = __msa_fill_w((128 * weight1)); 2314 const_vec <<= 6; 2315 offset_vec = __msa_fill_w(offset); 2316 rnd_vec = __msa_fill_w(rnd_val + 1); 2317 offset_vec += const_vec; 2318 weight_vec = (v8i16) __msa_fill_w(weight); 2319 2320 filter_vec = LD_SH(filter_x); 2321 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 2322 2323 filter_vec = LD_SH(filter_y); 2324 UNPCK_R_SB_SH(filter_vec, filter_vec); 2325 2326 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 2327 2328 
mask0 = LD_SB(ff_hevc_mask_arr); 2329 mask1 = mask0 + 2; 2330 mask2 = mask0 + 4; 2331 mask3 = mask0 + 6; 2332 2333 src0_ptr_tmp = src0_ptr; 2334 src1_ptr_tmp = src1_ptr; 2335 dst_tmp = dst; 2336 2337 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 2338 src0_ptr_tmp += (7 * src_stride); 2339 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2340 2341 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2342 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 2343 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2344 vec11); 2345 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 2346 vec15); 2347 dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2348 filt3); 2349 dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2350 filt3); 2351 dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2352 filt3); 2353 dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 2354 filt2, filt3); 2355 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2356 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 2357 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2358 vec11); 2359 dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2360 filt3); 2361 dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2362 filt3); 2363 dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2364 filt3); 2365 2366 for (loop_cnt = 8; loop_cnt--;) { 2367 LD_SB2(src0_ptr_tmp, src_stride, src7, src8); 2368 src0_ptr_tmp += (2 * src_stride); 2369 XORI_B2_128_SB(src7, src8); 2370 2371 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); 2372 src1_ptr_tmp += (2 * src2_stride); 2373 2374 ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1, 2375 dst10_r, dst32_r, dst54_r, dst21_r); 2376 ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1, 2377 dst10_l, dst32_l, dst54_l, dst21_l); 2378 ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r); 2379 ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l); 2380 2381 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 2382 vec3); 2383 dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2384 filt3); 2385 2386 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 2387 dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 2388 filt_h1, filt_h2, filt_h3); 2389 dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0, 2390 filt_h1, filt_h2, filt_h3); 2391 dst0 >>= 6; 2392 dst1 >>= 6; 2393 2394 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 2395 vec3); 2396 dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2397 filt3); 2398 2399 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 2400 dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 2401 filt_h1, filt_h2, filt_h3); 2402 dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0, 2403 filt_h1, filt_h2, filt_h3); 2404 dst2 >>= 6; 2405 dst3 >>= 6; 2406 2407 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 2408 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2409 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2410 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2411 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2412 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2413 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, 
weight_vec); 2414 SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec); 2415 CLIP_SW4_0_255(dst1, dst0, dst3, dst2); 2416 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 2417 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2418 ST_D2(out, 0, 1, dst_tmp, dst_stride); 2419 dst_tmp += (2 * dst_stride); 2420 2421 dsth0 = dsth2; 2422 dsth1 = dsth3; 2423 dsth2 = dsth4; 2424 dsth3 = dsth5; 2425 dsth4 = dsth6; 2426 dsth5 = dsth7; 2427 dsth6 = dsth8; 2428 } 2429 2430 src0_ptr += 8; 2431 src1_ptr += 8; 2432 dst += 8; 2433 2434 mask4 = LD_SB(ff_hevc_mask_arr + 16); 2435 mask5 = mask4 + 2; 2436 mask6 = mask4 + 4; 2437 mask7 = mask4 + 6; 2438 2439 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 2440 src0_ptr += (7 * src_stride); 2441 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2442 2443 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 2444 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 2445 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 2446 vec11); 2447 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14, 2448 vec15); 2449 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2450 filt3); 2451 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2452 filt3); 2453 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2454 filt3); 2455 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 2456 filt3); 2457 ILVRL_H2_SH(dst41, dst30, dst10, dst43); 2458 ILVRL_H2_SH(dst52, dst41, dst21, dst54); 2459 ILVRL_H2_SH(dst63, dst52, dst32, dst65); 2460 2461 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 2462 2463 for (loop_cnt = 4; loop_cnt--;) { 2464 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 2465 src0_ptr += (4 * src_stride); 2466 XORI_B4_128_SB(src7, src8, src9, src10); 2467 2468 LD2(src1_ptr, src2_stride, tp0, tp1); 2469 INSERT_D2_SH(tp0, tp1, in0); 2470 src1_ptr += (2 * src2_stride); 2471 LD2(src1_ptr, src2_stride, tp0, tp1); 2472 INSERT_D2_SH(tp0, tp1, in1); 2473 src1_ptr += (2 * src2_stride); 2474 2475 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 2476 vec3); 2477 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 2478 vec7); 2479 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2480 filt3); 2481 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2482 filt3); 2483 2484 dst76 = __msa_ilvr_h(dst97, dst66); 2485 ILVRL_H2_SH(dst108, dst97, dst87, dst109); 2486 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 2487 dst98 = __msa_ilvr_h(dst66, dst108); 2488 2489 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1, 2490 filt_h2, filt_h3); 2491 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1, 2492 filt_h2, filt_h3); 2493 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1, 2494 filt_h2, filt_h3); 2495 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1, 2496 filt_h2, filt_h3); 2497 SRA_4V(dst0, dst1, dst2, dst3, 6); 2498 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 2499 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2500 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2501 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2502 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2503 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2504 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 2505 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 2506 
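/* The hv routines run a separable 2-D filter: an 8-tap horizontal pass
 * over signed bytes into 16-bit taps, an 8-tap vertical pass over those
 * taps into 32 bits, then >> 6 back to the intermediate range before
 * the bi-weighting above.  SRAR_W4_SW applies a rounding arithmetic
 * right shift, (x + (1 << (shift - 1))) >> shift, to every lane of each
 * vector independently, and CLIP_SW4_0_255 saturates each lane to
 * [0, 255], so the dst1/dst0/dst3/dst2 argument order seen in the
 * 8-wide loop above is immaterial; only the operand pairing in
 * PCKEV_H2_SH determines the final byte order.
 */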
CLIP_SW4_0_255(dst0, dst1, dst2, dst3); 2507 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 2508 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2509 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2510 dst += (4 * dst_stride); 2511 2512 dst10 = dst54; 2513 dst32 = dst76; 2514 dst54 = dst98; 2515 dst21 = dst65; 2516 dst43 = dst87; 2517 dst65 = dst109; 2518 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 2519 } 2520} 2521 2522static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, 2523 int32_t src_stride, 2524 int16_t *src1_ptr, 2525 int32_t src2_stride, 2526 uint8_t *dst, 2527 int32_t dst_stride, 2528 const int8_t *filter_x, 2529 const int8_t *filter_y, 2530 int32_t height, 2531 int32_t weight0, 2532 int32_t weight1, 2533 int32_t offset0, 2534 int32_t offset1, 2535 int32_t rnd_val) 2536{ 2537 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2538 src1_ptr, src2_stride, 2539 dst, dst_stride, filter_x, filter_y, 2540 height, weight0, weight1, offset0, 2541 offset1, rnd_val, 2); 2542} 2543 2544static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, 2545 int32_t src_stride, 2546 int16_t *src1_ptr, 2547 int32_t src2_stride, 2548 uint8_t *dst, 2549 int32_t dst_stride, 2550 const int8_t *filter_x, 2551 const int8_t *filter_y, 2552 int32_t height, 2553 int32_t weight0, 2554 int32_t weight1, 2555 int32_t offset0, 2556 int32_t offset1, 2557 int32_t rnd_val) 2558{ 2559 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2560 src1_ptr, src2_stride, 2561 dst, dst_stride, filter_x, filter_y, 2562 height, weight0, weight1, offset0, 2563 offset1, rnd_val, 3); 2564} 2565 2566static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, 2567 int32_t src_stride, 2568 int16_t *src1_ptr, 2569 int32_t src2_stride, 2570 uint8_t *dst, 2571 int32_t dst_stride, 2572 const int8_t *filter_x, 2573 const int8_t *filter_y, 2574 int32_t height, 2575 int32_t weight0, 2576 int32_t weight1, 2577 int32_t offset0, 2578 int32_t offset1, 2579 int32_t rnd_val) 2580{ 2581 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2582 src1_ptr, src2_stride, 2583 dst, dst_stride, filter_x, filter_y, 2584 height, weight0, weight1, offset0, 2585 offset1, rnd_val, 4); 2586} 2587 2588static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, 2589 int32_t src_stride, 2590 int16_t *src1_ptr, 2591 int32_t src2_stride, 2592 uint8_t *dst, 2593 int32_t dst_stride, 2594 const int8_t *filter_x, 2595 const int8_t *filter_y, 2596 int32_t height, 2597 int32_t weight0, 2598 int32_t weight1, 2599 int32_t offset0, 2600 int32_t offset1, 2601 int32_t rnd_val) 2602{ 2603 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2604 src1_ptr, src2_stride, 2605 dst, dst_stride, filter_x, filter_y, 2606 height, weight0, weight1, offset0, 2607 offset1, rnd_val, 6); 2608} 2609 2610static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, 2611 int32_t src_stride, 2612 int16_t *src1_ptr, 2613 int32_t src2_stride, 2614 uint8_t *dst, 2615 int32_t dst_stride, 2616 const int8_t *filter_x, 2617 const int8_t *filter_y, 2618 int32_t height, 2619 int32_t weight0, 2620 int32_t weight1, 2621 int32_t offset0, 2622 int32_t offset1, 2623 int32_t rnd_val) 2624{ 2625 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2626 src1_ptr, src2_stride, 2627 dst, dst_stride, filter_x, filter_y, 2628 height, weight0, weight1, offset0, 2629 offset1, rnd_val, 8); 2630} 2631 2632static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, 2633 int32_t src_stride, 2634 int16_t *src1_ptr, 2635 int32_t src2_stride, 2636 uint8_t *dst, 2637 int32_t dst_stride, 2638 const int8_t *filter, 2639 
int32_t weight0, 2640 int32_t weight1, 2641 int32_t offset0, 2642 int32_t offset1, 2643 int32_t rnd_val) 2644{ 2645 int32_t offset, weight, constant; 2646 v8i16 filt0, filt1; 2647 v16i8 src0, src1; 2648 v8i16 in0, in1; 2649 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2650 v16i8 mask1, vec0, vec1; 2651 v8i16 dst0; 2652 v4i32 dst0_r, dst0_l; 2653 v8i16 out0, filter_vec; 2654 v4i32 weight_vec, offset_vec, rnd_vec; 2655 2656 src0_ptr -= 1; 2657 2658 filter_vec = LD_SH(filter); 2659 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2660 2661 mask1 = mask0 + 2; 2662 2663 offset = (offset0 + offset1) << rnd_val; 2664 weight0 = weight0 & 0x0000FFFF; 2665 weight = weight0 | (weight1 << 16); 2666 constant = 128 * weight1; 2667 constant <<= 6; 2668 offset += constant; 2669 2670 offset_vec = __msa_fill_w(offset); 2671 weight_vec = __msa_fill_w(weight); 2672 rnd_vec = __msa_fill_w(rnd_val + 1); 2673 2674 LD_SB2(src0_ptr, src_stride, src0, src1); 2675 LD_SH2(src1_ptr, src2_stride, in0, in1); 2676 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); 2677 XORI_B2_128_SB(src0, src1); 2678 2679 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2680 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2681 2682 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); 2683 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); 2684 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); 2685 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 2686 out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 2687 CLIP_SH_0_255(out0); 2688 out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0); 2689 ST_W2(out0, 0, 1, dst, dst_stride); 2690} 2691 2692static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, 2693 int32_t src_stride, 2694 int16_t *src1_ptr, 2695 int32_t src2_stride, 2696 uint8_t *dst, 2697 int32_t dst_stride, 2698 const int8_t *filter, 2699 int32_t weight0, 2700 int32_t weight1, 2701 int32_t offset0, 2702 int32_t offset1, 2703 int32_t rnd_val) 2704{ 2705 int32_t offset, weight, constant; 2706 v8i16 filt0, filt1; 2707 v16i8 src0, src1, src2, src3; 2708 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2709 v16i8 mask1; 2710 v8i16 dst0, dst1; 2711 v16i8 vec0, vec1; 2712 v8i16 in0, in1, in2, in3; 2713 v8i16 filter_vec; 2714 v4i32 weight_vec, offset_vec, rnd_vec; 2715 2716 src0_ptr -= 1; 2717 2718 /* rearranging filter */ 2719 filter_vec = LD_SH(filter); 2720 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2721 2722 mask1 = mask0 + 2; 2723 2724 offset = (offset0 + offset1) << rnd_val; 2725 weight0 = weight0 & 0x0000FFFF; 2726 weight = weight0 | (weight1 << 16); 2727 constant = 128 * weight1; 2728 constant <<= 6; 2729 offset += constant; 2730 2731 offset_vec = __msa_fill_w(offset); 2732 weight_vec = __msa_fill_w(weight); 2733 rnd_vec = __msa_fill_w(rnd_val + 1); 2734 2735 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2736 XORI_B4_128_SB(src0, src1, src2, src3); 2737 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2738 ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 2739 2740 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2741 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2742 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 2743 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2744 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, 2745 weight_vec, rnd_vec, offset_vec, 2746 dst0, dst1); 2747 2748 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 2749 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); 2750} 2751 2752static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t 
*src0_ptr, 2753 int32_t src_stride, 2754 int16_t *src1_ptr, 2755 int32_t src2_stride, 2756 uint8_t *dst, 2757 int32_t dst_stride, 2758 const int8_t *filter, 2759 int32_t height, 2760 int32_t weight0, 2761 int32_t weight1, 2762 int32_t offset0, 2763 int32_t offset1, 2764 int32_t rnd_val) 2765{ 2766 uint32_t loop_cnt; 2767 int32_t weight, offset, constant; 2768 v8i16 filt0, filt1; 2769 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2770 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2771 v16i8 mask1; 2772 v16i8 vec0, vec1; 2773 v8i16 dst0, dst1, dst2, dst3; 2774 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 2775 v8i16 filter_vec; 2776 v4i32 weight_vec, offset_vec, rnd_vec; 2777 2778 src0_ptr -= 1; 2779 2780 filter_vec = LD_SH(filter); 2781 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2782 2783 offset = (offset0 + offset1) << rnd_val; 2784 weight0 = weight0 & 0x0000FFFF; 2785 weight = weight0 | (weight1 << 16); 2786 constant = 128 * weight1; 2787 constant <<= 6; 2788 offset += constant; 2789 2790 offset_vec = __msa_fill_w(offset); 2791 weight_vec = __msa_fill_w(weight); 2792 rnd_vec = __msa_fill_w(rnd_val + 1); 2793 2794 mask1 = mask0 + 2; 2795 2796 for (loop_cnt = (height >> 3); loop_cnt--;) { 2797 LD_SB8(src0_ptr, src_stride, 2798 src0, src1, src2, src3, src4, src5, src6, src7); 2799 src0_ptr += (8 * src_stride); 2800 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2801 src1_ptr += (4 * src2_stride); 2802 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7); 2803 src1_ptr += (4 * src2_stride); 2804 ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 2805 ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 2806 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2807 2808 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2809 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2810 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 2811 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2812 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); 2813 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2814 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); 2815 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2816 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 2817 in0, in1, in2, in3, 2818 weight_vec, rnd_vec, offset_vec, 2819 dst0, dst1, dst2, dst3); 2820 2821 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2822 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 2823 dst += (8 * dst_stride); 2824 } 2825} 2826 2827static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr, 2828 int32_t src_stride, 2829 int16_t *src1_ptr, 2830 int32_t src2_stride, 2831 uint8_t *dst, 2832 int32_t dst_stride, 2833 const int8_t *filter, 2834 int32_t height, 2835 int32_t weight0, 2836 int32_t weight1, 2837 int32_t offset0, 2838 int32_t offset1, 2839 int32_t rnd_val) 2840{ 2841 if (2 == height) { 2842 hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2843 dst, dst_stride, filter, 2844 weight0, weight1, offset0, offset1, rnd_val); 2845 } else if (4 == height) { 2846 hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2847 dst, dst_stride, filter, 2848 weight0, weight1, offset0, offset1, rnd_val); 2849 } else if (0 == (height % 8)) { 2850 hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride, 2851 src1_ptr, src2_stride, 2852 dst, dst_stride, filter, height, 2853 weight0, weight1, offset0, offset1, 2854 rnd_val); 2855 } 2856} 2857 2858static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr, 2859 int32_t src_stride, 2860 int16_t *src1_ptr, 
2861 int32_t src2_stride, 2862 uint8_t *dst, 2863 int32_t dst_stride, 2864 const int8_t *filter, 2865 int32_t height, 2866 int32_t weight0, 2867 int32_t weight1, 2868 int32_t offset0, 2869 int32_t offset1, 2870 int32_t rnd_val) 2871{ 2872 uint32_t loop_cnt; 2873 int32_t offset, weight, constant; 2874 v8i16 filt0, filt1; 2875 v16i8 src0, src1, src2, src3; 2876 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2877 v16i8 mask1; 2878 v16i8 vec0, vec1; 2879 v8i16 in0, in1, in2, in3; 2880 v8i16 dst0, dst1, dst2, dst3; 2881 v8i16 filter_vec; 2882 v4i32 weight_vec, offset_vec, rnd_vec; 2883 2884 src0_ptr -= 1; 2885 2886 filter_vec = LD_SH(filter); 2887 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2888 2889 offset = (offset0 + offset1) << rnd_val; 2890 weight0 = weight0 & 0x0000FFFF; 2891 weight = weight0 | (weight1 << 16); 2892 constant = 128 * weight1; 2893 constant <<= 6; 2894 offset += constant; 2895 2896 offset_vec = __msa_fill_w(offset); 2897 weight_vec = __msa_fill_w(weight); 2898 rnd_vec = __msa_fill_w(rnd_val + 1); 2899 2900 mask1 = mask0 + 2; 2901 2902 for (loop_cnt = 2; loop_cnt--;) { 2903 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2904 src0_ptr += (4 * src_stride); 2905 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2906 src1_ptr += (4 * src2_stride); 2907 XORI_B4_128_SB(src0, src1, src2, src3); 2908 2909 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2910 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2911 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2912 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2913 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2914 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2915 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2916 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2917 2918 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 2919 in0, in1, in2, in3, 2920 weight_vec, rnd_vec, offset_vec, 2921 dst0, dst1, dst2, dst3); 2922 2923 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2924 ST_W2(dst0, 0, 2, dst, dst_stride); 2925 ST_H2(dst0, 2, 6, dst + 4, dst_stride); 2926 ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride); 2927 ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2928 dst += (4 * dst_stride); 2929 } 2930} 2931 2932static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr, 2933 int32_t src_stride, 2934 int16_t *src1_ptr, 2935 int32_t src2_stride, 2936 uint8_t *dst, 2937 int32_t dst_stride, 2938 const int8_t *filter, 2939 int32_t weight0, 2940 int32_t weight1, 2941 int32_t offset0, 2942 int32_t offset1, 2943 int32_t rnd_val) 2944{ 2945 int32_t offset, weight, constant; 2946 v8i16 filt0, filt1; 2947 v16i8 src0, src1; 2948 v8i16 in0, in1; 2949 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2950 v16i8 mask1, vec0, vec1; 2951 v8i16 dst0, dst1; 2952 v8i16 filter_vec; 2953 v4i32 weight_vec, offset_vec, rnd_vec; 2954 2955 src0_ptr -= 1; 2956 2957 filter_vec = LD_SH(filter); 2958 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2959 2960 offset = (offset0 + offset1) << rnd_val; 2961 weight0 = weight0 & 0x0000FFFF; 2962 weight = weight0 | (weight1 << 16); 2963 constant = 128 * weight1; 2964 constant <<= 6; 2965 offset += constant; 2966 2967 offset_vec = __msa_fill_w(offset); 2968 weight_vec = __msa_fill_w(weight); 2969 rnd_vec = __msa_fill_w(rnd_val + 1); 2970 2971 mask1 = mask0 + 2; 2972 2973 LD_SB2(src0_ptr, src_stride, src0, src1); 2974 LD_SH2(src1_ptr, src2_stride, in0, in1); 2975 XORI_B2_128_SB(src0, src1); 2976 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, 
vec1); 2977 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2978 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2979 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2980 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, 2981 weight_vec, rnd_vec, offset_vec, 2982 dst0, dst1); 2983 2984 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 2985 ST_D2(dst0, 0, 1, dst, dst_stride); 2986} 2987 2988static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, 2989 int32_t src_stride, 2990 int16_t *src1_ptr, 2991 int32_t src2_stride, 2992 uint8_t *dst, 2993 int32_t dst_stride, 2994 const int8_t *filter, 2995 int32_t weight0, 2996 int32_t weight1, 2997 int32_t offset0, 2998 int32_t offset1, 2999 int32_t rnd_val) 3000{ 3001 int32_t weight, offset, constant; 3002 v8i16 filt0, filt1; 3003 v16i8 src0, src1, src2, src3, src4, src5; 3004 v8i16 in0, in1, in2, in3, in4, in5; 3005 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3006 v16i8 mask1; 3007 v16i8 vec0, vec1; 3008 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 3009 v8i16 filter_vec; 3010 v4i32 weight_vec, offset_vec, rnd_vec; 3011 3012 src0_ptr -= 1; 3013 3014 filter_vec = LD_SH(filter); 3015 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3016 3017 offset = (offset0 + offset1) << rnd_val; 3018 weight0 = weight0 & 0x0000FFFF; 3019 weight = weight0 | (weight1 << 16); 3020 constant = 128 * weight1; 3021 constant <<= 6; 3022 offset += constant; 3023 3024 offset_vec = __msa_fill_w(offset); 3025 weight_vec = __msa_fill_w(weight); 3026 rnd_vec = __msa_fill_w(rnd_val + 1); 3027 3028 mask1 = mask0 + 2; 3029 3030 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); 3031 3032 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3033 src1_ptr += (4 * src2_stride); 3034 LD_SH2(src1_ptr, src2_stride, in4, in5); 3035 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); 3036 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3037 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3038 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3039 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3040 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3041 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3042 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3043 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3044 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 3045 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3046 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 3047 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3048 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3049 in0, in1, in2, in3, 3050 weight_vec, rnd_vec, offset_vec, 3051 dst0, dst1, dst2, dst3); 3052 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, 3053 weight_vec, rnd_vec, offset_vec, 3054 dst4, dst5); 3055 3056 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3057 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 3058 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 3059 ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride); 3060} 3061 3062static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, 3063 int32_t src_stride, 3064 int16_t *src1_ptr, 3065 int32_t src2_stride, 3066 uint8_t *dst, 3067 int32_t dst_stride, 3068 const int8_t *filter, 3069 int32_t height, 3070 int32_t weight0, 3071 int32_t weight1, 3072 int32_t offset0, 3073 int32_t offset1, 3074 int32_t rnd_val) 3075{ 3076 uint32_t loop_cnt; 3077 int32_t offset, weight, constant; 3078 v8i16 filt0, filt1; 3079 v16i8 src0, src1, src2, src3; 3080 v16i8 
mask0 = LD_SB(ff_hevc_mask_arr); 3081 v16i8 mask1; 3082 v16i8 vec0, vec1; 3083 v8i16 in0, in1, in2, in3; 3084 v8i16 dst0, dst1, dst2, dst3; 3085 v8i16 filter_vec; 3086 v4i32 weight_vec, offset_vec, rnd_vec; 3087 3088 src0_ptr -= 1; 3089 3090 filter_vec = LD_SH(filter); 3091 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3092 3093 offset = (offset0 + offset1) << rnd_val; 3094 weight0 = weight0 & 0x0000FFFF; 3095 weight = weight0 | (weight1 << 16); 3096 constant = 128 * weight1; 3097 constant <<= 6; 3098 offset += constant; 3099 3100 offset_vec = __msa_fill_w(offset); 3101 weight_vec = __msa_fill_w(weight); 3102 rnd_vec = __msa_fill_w(rnd_val + 1); 3103 3104 mask1 = mask0 + 2; 3105 3106 for (loop_cnt = (height >> 2); loop_cnt--;) { 3107 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 3108 src0_ptr += (4 * src_stride); 3109 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3110 src1_ptr += (4 * src2_stride); 3111 XORI_B4_128_SB(src0, src1, src2, src3); 3112 3113 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3114 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3115 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3116 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3117 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3118 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3119 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3120 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3121 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3122 in0, in1, in2, in3, 3123 weight_vec, rnd_vec, offset_vec, 3124 dst0, dst1, dst2, dst3); 3125 3126 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3127 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 3128 dst += (4 * dst_stride); 3129 } 3130} 3131 3132static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr, 3133 int32_t src_stride, 3134 int16_t *src1_ptr, 3135 int32_t src2_stride, 3136 uint8_t *dst, 3137 int32_t dst_stride, 3138 const int8_t *filter, 3139 int32_t height, 3140 int32_t weight0, 3141 int32_t weight1, 3142 int32_t offset0, 3143 int32_t offset1, 3144 int32_t rnd_val) 3145{ 3146 if (2 == height) { 3147 hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3148 dst, dst_stride, filter, 3149 weight0, weight1, offset0, offset1, rnd_val); 3150 } else if (6 == height) { 3151 hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3152 dst, dst_stride, filter, 3153 weight0, weight1, offset0, offset1, rnd_val); 3154 } else if (0 == (height % 4)) { 3155 hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride, 3156 src1_ptr, src2_stride, 3157 dst, dst_stride, filter, height, 3158 weight0, weight1, offset0, offset1, 3159 rnd_val); 3160 } 3161} 3162 3163static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr, 3164 int32_t src_stride, 3165 int16_t *src1_ptr, 3166 int32_t src2_stride, 3167 uint8_t *dst, 3168 int32_t dst_stride, 3169 const int8_t *filter, 3170 int32_t height, 3171 int32_t weight0, 3172 int32_t weight1, 3173 int32_t offset0, 3174 int32_t offset1, 3175 int32_t rnd_val) 3176{ 3177 uint32_t loop_cnt; 3178 int32_t offset, weight, constant; 3179 v8i16 filt0, filt1; 3180 v16i8 src0, src1, src2, src3; 3181 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3182 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3183 v16i8 mask2 = { 3184 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 3185 }; 3186 v16i8 mask1, mask3; 3187 v16i8 vec0, vec1; 3188 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 3189 v8i16 filter_vec; 3190 v4i32 weight_vec, offset_vec, rnd_vec; 3191 3192 
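/* 12-pel rows are processed as an 8-pel half plus a 4-pel half:
 * mask0/mask1 build the 4-tap windows for the left 8 columns from a
 * single source vector, while mask2/mask3 (indices 8..12 and 24..28)
 * gather the right 4 columns of two consecutive rows from a vector
 * pair, VSHF_B shuffle indices >= 16 selecting bytes from the second
 * source vector.  Each HEVC_FILT_4TAP_SH over such a pair therefore
 * filters the 4-pel tails of two rows at once.
 */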
src0_ptr -= 1; 3193 3194 filter_vec = LD_SH(filter); 3195 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3196 3197 offset = (offset0 + offset1) << rnd_val; 3198 weight0 = weight0 & 0x0000FFFF; 3199 weight = weight0 | (weight1 << 16); 3200 constant = 128 * weight1; 3201 constant <<= 6; 3202 offset += constant; 3203 3204 offset_vec = __msa_fill_w(offset); 3205 weight_vec = __msa_fill_w(weight); 3206 rnd_vec = __msa_fill_w(rnd_val + 1); 3207 3208 mask1 = mask0 + 2; 3209 mask3 = mask2 + 2; 3210 3211 for (loop_cnt = 4; loop_cnt--;) { 3212 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 3213 src0_ptr += (4 * src_stride); 3214 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3215 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 3216 src1_ptr += (4 * src2_stride); 3217 ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 3218 XORI_B4_128_SB(src0, src1, src2, src3); 3219 3220 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3221 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3222 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3223 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3224 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3225 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3226 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3227 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3228 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 3229 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3230 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); 3231 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3232 3233 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3234 in0, in1, in2, in3, 3235 weight_vec, rnd_vec, offset_vec, 3236 dst0, dst1, dst2, dst3); 3237 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, 3238 weight_vec, rnd_vec, offset_vec, 3239 dst4, dst5); 3240 3241 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3242 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 3243 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 3244 ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride); 3245 dst += (4 * dst_stride); 3246 } 3247} 3248 3249static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr, 3250 int32_t src_stride, 3251 int16_t *src1_ptr, 3252 int32_t src2_stride, 3253 uint8_t *dst, 3254 int32_t dst_stride, 3255 const int8_t *filter, 3256 int32_t height, 3257 int32_t weight0, 3258 int32_t weight1, 3259 int32_t offset0, 3260 int32_t offset1, 3261 int32_t rnd_val) 3262{ 3263 uint32_t loop_cnt; 3264 int32_t offset, weight, constant; 3265 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 3266 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3267 v8i16 filt0, filt1; 3268 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3269 v16i8 mask1; 3270 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3271 v16i8 vec0, vec1; 3272 v8i16 filter_vec; 3273 v4i32 weight_vec, offset_vec, rnd_vec; 3274 3275 src0_ptr -= 1; 3276 3277 filter_vec = LD_SH(filter); 3278 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3279 3280 offset = (offset0 + offset1) << rnd_val; 3281 weight0 = weight0 & 0x0000FFFF; 3282 weight = weight0 | (weight1 << 16); 3283 constant = 128 * weight1; 3284 constant <<= 6; 3285 offset += constant; 3286 3287 offset_vec = __msa_fill_w(offset); 3288 weight_vec = __msa_fill_w(weight); 3289 rnd_vec = __msa_fill_w(rnd_val + 1); 3290 3291 mask1 = mask0 + 2; 3292 3293 for (loop_cnt = (height >> 2); loop_cnt--;) { 3294 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6); 3295 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7); 
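/* 16-pel rows are filtered as two overlapping 8-pel halves: the 4-tap
 * window for output column x spans pels x-1..x+2 (src0_ptr was
 * pre-decremented), so columns 8..15 need source bytes beyond the
 * first 16-byte vector, hence the second set of loads at +8.
 */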
3296 src0_ptr += (4 * src_stride); 3297 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6); 3298 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7); 3299 src1_ptr += (4 * src2_stride); 3300 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3301 3302 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3303 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3304 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3305 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3306 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3307 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3308 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3309 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3310 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 3311 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3312 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 3313 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3314 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); 3315 dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3316 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 3317 dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3318 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3319 in0, in1, in2, in3, 3320 weight_vec, rnd_vec, offset_vec, 3321 dst0, dst1, dst2, dst3); 3322 3323 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3324 ST_SH2(dst0, dst1, dst, dst_stride); 3325 dst += (2 * dst_stride); 3326 3327 HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7, 3328 in4, in5, in6, in7, 3329 weight_vec, rnd_vec, offset_vec, 3330 dst0, dst1, dst2, dst3); 3331 3332 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3333 ST_SH2(dst0, dst1, dst, dst_stride); 3334 dst += (2 * dst_stride); 3335 } 3336} 3337 3338static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr, 3339 int32_t src_stride, 3340 int16_t *src1_ptr, 3341 int32_t src2_stride, 3342 uint8_t *dst, 3343 int32_t dst_stride, 3344 const int8_t *filter, 3345 int32_t height, 3346 int32_t weight0, 3347 int32_t weight1, 3348 int32_t offset0, 3349 int32_t offset1, 3350 int32_t rnd_val) 3351{ 3352 uint32_t loop_cnt; 3353 int32_t offset, weight, constant; 3354 v16i8 src0, src1, src2, src3; 3355 v8i16 filt0, filt1; 3356 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3357 v16i8 mask1, mask2, mask3; 3358 v16i8 vec0, vec1; 3359 v8i16 dst0, dst1, dst2, dst3; 3360 v8i16 in0, in1, in2, in3, in4, in5; 3361 v8i16 filter_vec; 3362 v4i32 weight_vec, offset_vec, rnd_vec; 3363 3364 src0_ptr -= 1; 3365 3366 filter_vec = LD_SH(filter); 3367 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3368 3369 offset = (offset0 + offset1) << rnd_val; 3370 weight0 = weight0 & 0x0000FFFF; 3371 weight = weight0 | (weight1 << 16); 3372 constant = 128 * weight1; 3373 constant <<= 6; 3374 offset += constant; 3375 3376 offset_vec = __msa_fill_w(offset); 3377 weight_vec = __msa_fill_w(weight); 3378 rnd_vec = __msa_fill_w(rnd_val + 1); 3379 3380 mask1 = mask0 + 2; 3381 mask2 = mask0 + 8; 3382 mask3 = mask0 + 10; 3383 3384 for (loop_cnt = 16; loop_cnt--;) { 3385 LD_SB2(src0_ptr, src_stride, src0, src2); 3386 LD_SB2(src0_ptr + 16, src_stride, src1, src3); 3387 src0_ptr += (2 * src_stride); 3388 LD_SH2(src1_ptr, src2_stride, in0, in2); 3389 LD_SH2(src1_ptr + 8, src2_stride, in1, in3); 3390 LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 3391 src1_ptr += (2 * src2_stride); 3392 XORI_B4_128_SB(src0, src1, src2, src3); 3393 3394 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3395 dst0 = 
HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3396 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 3397 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3398 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3399 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3400 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); 3401 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3402 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3403 in0, in1, in2, in3, 3404 weight_vec, rnd_vec, offset_vec, 3405 dst0, dst1, dst2, dst3); 3406 3407 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3408 ST_SH2(dst0, dst1, dst, dst_stride); 3409 3410 /* 8 width */ 3411 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3412 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3413 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3414 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3415 HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5, 3416 weight_vec, rnd_vec, offset_vec, 3417 dst0, dst1); 3418 3419 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 3420 ST_D2(dst0, 0, 1, (dst + 16), dst_stride); 3421 dst += (2 * dst_stride); 3422 } 3423} 3424 3425static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr, 3426 int32_t src_stride, 3427 int16_t *src1_ptr, 3428 int32_t src2_stride, 3429 uint8_t *dst, 3430 int32_t dst_stride, 3431 const int8_t *filter, 3432 int32_t height, 3433 int32_t weight0, 3434 int32_t weight1, 3435 int32_t offset0, 3436 int32_t offset1, 3437 int32_t rnd_val) 3438{ 3439 uint32_t loop_cnt; 3440 int32_t offset, weight, constant; 3441 v16i8 src0, src1, src2; 3442 v8i16 filt0, filt1; 3443 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3444 v16i8 mask1, mask2, mask3; 3445 v8i16 dst0, dst1, dst2, dst3; 3446 v16i8 vec0, vec1; 3447 v8i16 in0, in1, in2, in3; 3448 v8i16 filter_vec; 3449 v4i32 weight_vec, offset_vec, rnd_vec; 3450 3451 src0_ptr -= 1; 3452 3453 filter_vec = LD_SH(filter); 3454 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3455 3456 offset = (offset0 + offset1) << rnd_val; 3457 weight0 = weight0 & 0x0000FFFF; 3458 weight = weight0 | (weight1 << 16); 3459 constant = 128 * weight1; 3460 constant <<= 6; 3461 offset += constant; 3462 3463 offset_vec = __msa_fill_w(offset); 3464 weight_vec = __msa_fill_w(weight); 3465 rnd_vec = __msa_fill_w(rnd_val + 1); 3466 3467 mask1 = mask0 + 2; 3468 mask2 = mask0 + 8; 3469 mask3 = mask0 + 10; 3470 3471 for (loop_cnt = height; loop_cnt--;) { 3472 LD_SB2(src0_ptr, 16, src0, src1); 3473 src2 = LD_SB(src0_ptr + 24); 3474 src0_ptr += src_stride; 3475 LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 3476 src1_ptr += src2_stride; 3477 XORI_B3_128_SB(src0, src1, src2); 3478 3479 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3480 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3481 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 3482 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3483 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3484 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3485 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3486 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3487 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3488 in0, in1, in2, in3, 3489 weight_vec, rnd_vec, offset_vec, 3490 dst0, dst1, dst2, dst3); 3491 3492 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3493 ST_SH2(dst0, dst1, dst, 16); 3494 dst += dst_stride; 3495 } 3496} 3497 3498static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, 3499 int32_t src_stride, 3500 
int16_t *src1_ptr, 3501 int32_t src2_stride, 3502 uint8_t *dst, 3503 int32_t dst_stride, 3504 const int8_t *filter, 3505 int32_t weight0, 3506 int32_t weight1, 3507 int32_t offset0, 3508 int32_t offset1, 3509 int32_t rnd_val) 3510{ 3511 int32_t weight, offset, constant; 3512 v16i8 src0, src1, src2, src3, src4; 3513 v8i16 in0, in1, dst10; 3514 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; 3515 v4i32 dst10_r, dst10_l; 3516 v8i16 filt0, filt1; 3517 v8i16 filter_vec, out; 3518 v4i32 weight_vec, offset_vec, rnd_vec; 3519 3520 src0_ptr -= src_stride; 3521 3522 offset = (offset0 + offset1) << rnd_val; 3523 weight0 = weight0 & 0x0000FFFF; 3524 weight = weight0 | (weight1 << 16); 3525 constant = 128 * weight1; 3526 constant <<= 6; 3527 offset += constant; 3528 3529 offset_vec = __msa_fill_w(offset); 3530 weight_vec = __msa_fill_w(weight); 3531 rnd_vec = __msa_fill_w(rnd_val + 1); 3532 3533 filter_vec = LD_SH(filter); 3534 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3535 3536 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3537 src0_ptr += (3 * src_stride); 3538 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3539 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 3540 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3541 LD_SB2(src0_ptr, src_stride, src3, src4); 3542 src0_ptr += (2 * src_stride); 3543 LD_SH2(src1_ptr, src2_stride, in0, in1); 3544 src1_ptr += (2 * src2_stride); 3545 3546 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); 3547 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3548 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); 3549 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); 3550 3551 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3552 3553 ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l); 3554 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec); 3555 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec); 3556 SRAR_W2_SW(dst10_r, dst10_l, rnd_vec); 3557 out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r); 3558 CLIP_SH_0_255(out); 3559 out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out); 3560 ST_W2(out, 0, 1, dst, dst_stride); 3561} 3562 3563static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr, 3564 int32_t src_stride, 3565 int16_t *src1_ptr, 3566 int32_t src2_stride, 3567 uint8_t *dst, 3568 int32_t dst_stride, 3569 const int8_t *filter, 3570 int32_t weight0, 3571 int32_t weight1, 3572 int32_t offset0, 3573 int32_t offset1, 3574 int32_t rnd_val) 3575{ 3576 int32_t weight, offset, constant; 3577 v16i8 src0, src1, src2, src3, src4, src5, src6; 3578 v8i16 in0, in1, in2, in3; 3579 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 3580 v16i8 src2110, src4332, src6554; 3581 v8i16 dst10, dst32; 3582 v8i16 filt0, filt1; 3583 v8i16 filter_vec; 3584 v4i32 weight_vec, offset_vec, rnd_vec; 3585 3586 src0_ptr -= src_stride; 3587 3588 offset = (offset0 + offset1) << rnd_val; 3589 weight0 = weight0 & 0x0000FFFF; 3590 weight = weight0 | (weight1 << 16); 3591 constant = 128 * weight1; 3592 constant <<= 6; 3593 offset += constant; 3594 3595 offset_vec = __msa_fill_w(offset); 3596 weight_vec = __msa_fill_w(weight); 3597 rnd_vec = __msa_fill_w(rnd_val + 1); 3598 3599 filter_vec = LD_SH(filter); 3600 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3601 3602 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3603 src0_ptr += (3 * src_stride); 3604 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3605 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 
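/* For 4-pel columns, the right-interleaves of two consecutive row
 * pairs share one 128-bit vector (src10_r in the low doubleword,
 * src21_r in the high one), so each HEVC_FILT_4TAP_SH below produces
 * two output rows per dot-product chain.
 */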
3606 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3607 3608 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); 3609 src0_ptr += (4 * src_stride); 3610 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3611 src1_ptr += (4 * src2_stride); 3612 ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 3613 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3614 src32_r, src43_r, src54_r, src65_r); 3615 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554); 3616 XORI_B2_128_SB(src4332, src6554); 3617 3618 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3619 dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3620 3621 HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1, 3622 weight_vec, rnd_vec, offset_vec, 3623 dst10, dst32); 3624 3625 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10); 3626 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride); 3627 dst += (4 * dst_stride); 3628} 3629 3630static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, 3631 int32_t src_stride, 3632 int16_t *src1_ptr, 3633 int32_t src2_stride, 3634 uint8_t *dst, 3635 int32_t dst_stride, 3636 const int8_t *filter, 3637 int32_t height, 3638 int32_t weight0, 3639 int32_t weight1, 3640 int32_t offset0, 3641 int32_t offset1, 3642 int32_t rnd_val) 3643{ 3644 uint32_t loop_cnt; 3645 int32_t weight, offset, constant; 3646 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 3647 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3648 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 3649 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 3650 v16i8 src2110, src4332, src6554, src8776; 3651 v8i16 dst10, dst32, dst54, dst76; 3652 v8i16 filt0, filt1; 3653 v8i16 filter_vec; 3654 v4i32 weight_vec, offset_vec, rnd_vec; 3655 3656 src0_ptr -= src_stride; 3657 3658 offset = (offset0 + offset1) << rnd_val; 3659 weight0 = weight0 & 0x0000FFFF; 3660 weight = weight0 | (weight1 << 16); 3661 constant = 128 * weight1; 3662 constant <<= 6; 3663 offset += constant; 3664 3665 offset_vec = __msa_fill_w(offset); 3666 weight_vec = __msa_fill_w(weight); 3667 rnd_vec = __msa_fill_w(rnd_val + 1); 3668 3669 filter_vec = LD_SH(filter); 3670 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3671 3672 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3673 src0_ptr += (3 * src_stride); 3674 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3675 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 3676 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3677 3678 for (loop_cnt = (height >> 3); loop_cnt--;) { 3679 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); 3680 src0_ptr += (6 * src_stride); 3681 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); 3682 src1_ptr += (8 * src2_stride); 3683 3684 ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 3685 ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 3686 3687 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3688 src32_r, src43_r, src54_r, src65_r); 3689 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3690 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, 3691 src4332, src6554, src8776); 3692 XORI_B3_128_SB(src4332, src6554, src8776); 3693 3694 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3695 dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3696 dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1); 3697 3698 LD_SB2(src0_ptr, src_stride, src9, src2); 3699 src0_ptr += (2 * src_stride); 3700 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r); 3701 src2110 = (v16i8) 
__msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r); 3702 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3703 3704 dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1); 3705 HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76, 3706 in0, in1, in2, in3, 3707 weight_vec, rnd_vec, offset_vec, 3708 dst10, dst32, dst54, dst76); 3709 3710 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32); 3711 ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 3712 dst += (8 * dst_stride); 3713 } 3714} 3715 3716static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr, 3717 int32_t src_stride, 3718 int16_t *src1_ptr, 3719 int32_t src2_stride, 3720 uint8_t *dst, 3721 int32_t dst_stride, 3722 const int8_t *filter, 3723 int32_t height, 3724 int32_t weight0, 3725 int32_t weight1, 3726 int32_t offset0, 3727 int32_t offset1, 3728 int32_t rnd_val) 3729{ 3730 if (2 == height) { 3731 hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3732 dst, dst_stride, filter, 3733 weight0, weight1, offset0, offset1, rnd_val); 3734 } else if (4 == height) { 3735 hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3736 dst, dst_stride, filter, 3737 weight0, weight1, offset0, offset1, rnd_val); 3738 } else if (0 == (height % 8)) { 3739 hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride, 3740 src1_ptr, src2_stride, 3741 dst, dst_stride, filter, height, 3742 weight0, weight1, offset0, offset1, 3743 rnd_val); 3744 } 3745} 3746 3747static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr, 3748 int32_t src_stride, 3749 int16_t *src1_ptr, 3750 int32_t src2_stride, 3751 uint8_t *dst, 3752 int32_t dst_stride, 3753 const int8_t *filter, 3754 int32_t height, 3755 int32_t weight0, 3756 int32_t weight1, 3757 int32_t offset0, 3758 int32_t offset1, 3759 int32_t rnd_val) 3760{ 3761 uint32_t loop_cnt; 3762 int32_t offset, weight, constant; 3763 v16i8 src0, src1, src2, src3, src4; 3764 v8i16 in0, in1, in2, in3; 3765 v16i8 src10_r, src32_r, src21_r, src43_r; 3766 v8i16 tmp0, tmp1, tmp2, tmp3; 3767 v8i16 filt0, filt1; 3768 v8i16 filter_vec; 3769 v4i32 weight_vec, offset_vec, rnd_vec; 3770 3771 src0_ptr -= src_stride; 3772 3773 offset = (offset0 + offset1) << rnd_val; 3774 weight0 = weight0 & 0x0000FFFF; 3775 weight = weight0 | (weight1 << 16); 3776 constant = 128 * weight1; 3777 constant <<= 6; 3778 offset += constant; 3779 3780 offset_vec = __msa_fill_w(offset); 3781 weight_vec = __msa_fill_w(weight); 3782 rnd_vec = __msa_fill_w(rnd_val + 1); 3783 3784 filter_vec = LD_SH(filter); 3785 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3786 3787 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3788 src0_ptr += (3 * src_stride); 3789 XORI_B3_128_SB(src0, src1, src2); 3790 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3791 3792 for (loop_cnt = (height >> 2); loop_cnt--;) { 3793 LD_SB2(src0_ptr, src_stride, src3, src4); 3794 src0_ptr += (2 * src_stride); 3795 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3796 src1_ptr += (4 * src2_stride); 3797 XORI_B2_128_SB(src3, src4); 3798 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3799 3800 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3801 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3802 3803 LD_SB2(src0_ptr, src_stride, src1, src2); 3804 src0_ptr += (2 * src_stride); 3805 XORI_B2_128_SB(src1, src2); 3806 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 3807 3808 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 3809 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 3810 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, 
tmp3, 3811 in0, in1, in2, in3, 3812 weight_vec, rnd_vec, offset_vec, 3813 tmp0, tmp1, tmp2, tmp3); 3814 3815 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 3816 ST_W2(tmp0, 0, 2, dst, dst_stride); 3817 ST_H2(tmp0, 2, 6, dst + 4, dst_stride); 3818 ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride); 3819 ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 3820 dst += (4 * dst_stride); 3821 } 3822} 3823 3824static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr, 3825 int32_t src_stride, 3826 int16_t *src1_ptr, 3827 int32_t src2_stride, 3828 uint8_t *dst, 3829 int32_t dst_stride, 3830 const int8_t *filter, 3831 int32_t weight0, 3832 int32_t weight1, 3833 int32_t offset0, 3834 int32_t offset1, 3835 int32_t rnd_val) 3836{ 3837 int32_t offset, weight, constant; 3838 v16i8 src0, src1, src2, src3, src4; 3839 v8i16 in0, in1, tmp0, tmp1; 3840 v16i8 src10_r, src32_r, src21_r, src43_r; 3841 v8i16 filt0, filt1; 3842 v8i16 filter_vec; 3843 v4i32 weight_vec, offset_vec, rnd_vec; 3844 3845 src0_ptr -= src_stride; 3846 3847 offset = (offset0 + offset1) << rnd_val; 3848 weight0 = weight0 & 0x0000FFFF; 3849 weight = weight0 | (weight1 << 16); 3850 constant = 128 * weight1; 3851 constant <<= 6; 3852 offset += constant; 3853 3854 offset_vec = __msa_fill_w(offset); 3855 weight_vec = __msa_fill_w(weight); 3856 rnd_vec = __msa_fill_w(rnd_val + 1); 3857 3858 filter_vec = LD_SH(filter); 3859 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3860 3861 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3862 src0_ptr += (3 * src_stride); 3863 XORI_B3_128_SB(src0, src1, src2); 3864 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3865 3866 LD_SB2(src0_ptr, src_stride, src3, src4); 3867 LD_SH2(src1_ptr, src2_stride, in0, in1); 3868 XORI_B2_128_SB(src3, src4); 3869 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3870 3871 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3872 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3873 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1, 3874 weight_vec, rnd_vec, offset_vec, 3875 tmp0, tmp1); 3876 3877 tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 3878 ST_D2(tmp0, 0, 1, dst, dst_stride); 3879} 3880 3881static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, 3882 int32_t src_stride, 3883 int16_t *src1_ptr, 3884 int32_t src2_stride, 3885 uint8_t *dst, 3886 int32_t dst_stride, 3887 const int8_t *filter, 3888 int32_t weight0, 3889 int32_t weight1, 3890 int32_t offset0, 3891 int32_t offset1, 3892 int32_t rnd_val) 3893{ 3894 int32_t offset, weight, constant; 3895 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3896 v8i16 in0, in1, in2, in3, in4, in5; 3897 v16i8 src10_r, src32_r, src54_r, src76_r; 3898 v16i8 src21_r, src43_r, src65_r, src87_r; 3899 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 3900 v8i16 filt0, filt1; 3901 v8i16 filter_vec; 3902 v4i32 weight_vec, offset_vec, rnd_vec; 3903 3904 src0_ptr -= src_stride; 3905 3906 offset = (offset0 + offset1) << rnd_val; 3907 weight0 = weight0 & 0x0000FFFF; 3908 weight = weight0 | (weight1 << 16); 3909 constant = 128 * weight1; 3910 constant <<= 6; 3911 offset += constant; 3912 3913 offset_vec = __msa_fill_w(offset); 3914 weight_vec = __msa_fill_w(weight); 3915 rnd_vec = __msa_fill_w(rnd_val + 1); 3916 3917 filter_vec = LD_SH(filter); 3918 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3919 3920 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3921 src0_ptr += (3 * src_stride); 3922 XORI_B3_128_SB(src0, src1, src2); 3923 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3924 3925 LD_SB6(src0_ptr, 
src_stride, src3, src4, src5, src6, src7, src8); 3926 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 3927 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8); 3928 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3929 src32_r, src43_r, src54_r, src65_r); 3930 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3931 3932 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3933 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3934 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3935 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3936 tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1); 3937 tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 3938 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 3939 in0, in1, in2, in3, 3940 weight_vec, rnd_vec, offset_vec, 3941 tmp0, tmp1, tmp2, tmp3); 3942 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5, 3943 weight_vec, rnd_vec, offset_vec, 3944 tmp4, tmp5); 3945 3946 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 3947 tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 3948 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 3949 ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride); 3950} 3951 3952static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, 3953 int32_t src_stride, 3954 int16_t *src1_ptr, 3955 int32_t src2_stride, 3956 uint8_t *dst, 3957 int32_t dst_stride, 3958 const int8_t *filter, 3959 int32_t height, 3960 int32_t weight0, 3961 int32_t weight1, 3962 int32_t offset0, 3963 int32_t offset1, 3964 int32_t rnd_val) 3965{ 3966 uint32_t loop_cnt; 3967 int32_t offset, weight, constant; 3968 v16i8 src0, src1, src2, src3, src4; 3969 v8i16 in0, in1, in2, in3; 3970 v16i8 src10_r, src32_r, src21_r, src43_r; 3971 v8i16 tmp0, tmp1, tmp2, tmp3; 3972 v8i16 filt0, filt1; 3973 v8i16 filter_vec; 3974 v4i32 weight_vec, offset_vec, rnd_vec; 3975 3976 src0_ptr -= src_stride; 3977 3978 offset = (offset0 + offset1) << rnd_val; 3979 weight0 = weight0 & 0x0000FFFF; 3980 weight = weight0 | (weight1 << 16); 3981 constant = 128 * weight1; 3982 constant <<= 6; 3983 offset += constant; 3984 3985 offset_vec = __msa_fill_w(offset); 3986 weight_vec = __msa_fill_w(weight); 3987 rnd_vec = __msa_fill_w(rnd_val + 1); 3988 3989 filter_vec = LD_SH(filter); 3990 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3991 3992 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3993 src0_ptr += (3 * src_stride); 3994 XORI_B3_128_SB(src0, src1, src2); 3995 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3996 3997 for (loop_cnt = (height >> 2); loop_cnt--;) { 3998 LD_SB2(src0_ptr, src_stride, src3, src4); 3999 src0_ptr += (2 * src_stride); 4000 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 4001 src1_ptr += (4 * src2_stride); 4002 XORI_B2_128_SB(src3, src4); 4003 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4004 4005 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4006 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4007 4008 LD_SB2(src0_ptr, src_stride, src1, src2); 4009 src0_ptr += (2 * src_stride); 4010 XORI_B2_128_SB(src1, src2); 4011 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 4012 4013 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 4014 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4015 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 4016 in0, in1, in2, in3, 4017 weight_vec, rnd_vec, offset_vec, 4018 tmp0, tmp1, tmp2, tmp3); 4019 4020 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 4021 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 4022 dst += (4 * 
dst_stride); 4023 } 4024} 4025 4026static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr, 4027 int32_t src_stride, 4028 int16_t *src1_ptr, 4029 int32_t src2_stride, 4030 uint8_t *dst, 4031 int32_t dst_stride, 4032 const int8_t *filter, 4033 int32_t height, 4034 int32_t weight0, 4035 int32_t weight1, 4036 int32_t offset0, 4037 int32_t offset1, 4038 int32_t rnd_val) 4039{ 4040 if (2 == height) { 4041 hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4042 dst, dst_stride, filter, 4043 weight0, weight1, offset0, offset1, rnd_val); 4044 } else if (6 == height) { 4045 hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4046 dst, dst_stride, filter, 4047 weight0, weight1, offset0, offset1, rnd_val); 4048 } else { 4049 hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride, 4050 src1_ptr, src2_stride, 4051 dst, dst_stride, filter, height, 4052 weight0, weight1, offset0, offset1, 4053 rnd_val); 4054 } 4055} 4056 4057static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr, 4058 int32_t src_stride, 4059 int16_t *src1_ptr, 4060 int32_t src2_stride, 4061 uint8_t *dst, 4062 int32_t dst_stride, 4063 const int8_t *filter, 4064 int32_t height, 4065 int32_t weight0, 4066 int32_t weight1, 4067 int32_t offset0, 4068 int32_t offset1, 4069 int32_t rnd_val) 4070{ 4071 uint32_t loop_cnt; 4072 int32_t offset, weight, constant; 4073 v16i8 src0, src1, src2, src3, src4, src5; 4074 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 4075 v16i8 src10_r, src32_r, src21_r, src43_r; 4076 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4077 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 4078 v16i8 src2110, src4332; 4079 v8i16 filt0, filt1; 4080 v8i16 filter_vec; 4081 v4i32 weight_vec, offset_vec, rnd_vec; 4082 4083 src0_ptr -= (1 * src_stride); 4084 4085 offset = (offset0 + offset1) << rnd_val; 4086 weight0 = weight0 & 0x0000FFFF; 4087 weight = weight0 | (weight1 << 16); 4088 constant = 128 * weight1; 4089 constant <<= 6; 4090 offset += constant; 4091 4092 offset_vec = __msa_fill_w(offset); 4093 weight_vec = __msa_fill_w(weight); 4094 rnd_vec = __msa_fill_w(rnd_val + 1); 4095 4096 filter_vec = LD_SH(filter); 4097 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4098 4099 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4100 src0_ptr += (3 * src_stride); 4101 XORI_B3_128_SB(src0, src1, src2); 4102 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4103 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4104 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); 4105 4106 for (loop_cnt = (height >> 2); loop_cnt--;) { 4107 LD_SB2(src0_ptr, src_stride, src3, src4); 4108 src0_ptr += (2 * src_stride); 4109 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 4110 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 4111 src1_ptr += (4 * src2_stride); 4112 ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 4113 XORI_B2_128_SB(src3, src4); 4114 4115 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4116 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4117 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); 4118 4119 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4120 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4121 tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 4122 4123 LD_SB2(src0_ptr, src_stride, src5, src2); 4124 src0_ptr += (2 * src_stride); 4125 XORI_B2_128_SB(src5, src2); 4126 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 4127 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l); 4128 src2110 = (v16i8) 
__msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); 4129 4130 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 4131 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4132 tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1); 4133 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 4134 in0, in1, in2, in3, 4135 weight_vec, rnd_vec, offset_vec, 4136 tmp0, tmp1, tmp2, tmp3); 4137 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5, 4138 weight_vec, rnd_vec, offset_vec, 4139 tmp4, tmp5); 4140 4141 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 4142 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 4143 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 4144 ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride); 4145 dst += (4 * dst_stride); 4146 } 4147} 4148 4149static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr, 4150 int32_t src_stride, 4151 int16_t *src1_ptr, 4152 int32_t src2_stride, 4153 uint8_t *dst, 4154 int32_t dst_stride, 4155 const int8_t *filter, 4156 int32_t height, 4157 int32_t weight0, 4158 int32_t weight1, 4159 int32_t offset0, 4160 int32_t offset1, 4161 int32_t rnd_val) 4162{ 4163 uint32_t loop_cnt; 4164 int32_t offset, weight, constant; 4165 v16i8 src0, src1, src2, src3, src4, src5; 4166 v8i16 in0, in1, in2, in3; 4167 v16i8 src10_r, src32_r, src21_r, src43_r; 4168 v16i8 src10_l, src32_l, src21_l, src43_l; 4169 v8i16 tmp0, tmp1, tmp2, tmp3; 4170 v8i16 filt0, filt1; 4171 v8i16 filter_vec; 4172 v4i32 weight_vec, offset_vec, rnd_vec; 4173 4174 src0_ptr -= src_stride; 4175 4176 offset = (offset0 + offset1) << rnd_val; 4177 weight0 = weight0 & 0x0000FFFF; 4178 weight = weight0 | (weight1 << 16); 4179 constant = 128 * weight1; 4180 constant <<= 6; 4181 offset += constant; 4182 4183 offset_vec = __msa_fill_w(offset); 4184 weight_vec = __msa_fill_w(weight); 4185 rnd_vec = __msa_fill_w(rnd_val + 1); 4186 4187 filter_vec = LD_SH(filter); 4188 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4189 4190 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4191 src0_ptr += (3 * src_stride); 4192 XORI_B3_128_SB(src0, src1, src2); 4193 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4194 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4195 4196 for (loop_cnt = (height >> 2); loop_cnt--;) { 4197 LD_SB2(src0_ptr, src_stride, src3, src4); 4198 src0_ptr += (2 * src_stride); 4199 LD_SH2(src1_ptr, src2_stride, in0, in1); 4200 LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4201 src1_ptr += (2 * src2_stride); 4202 XORI_B2_128_SB(src3, src4); 4203 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4204 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4205 4206 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4207 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4208 tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4209 tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4210 4211 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 4212 in0, in1, in2, in3, 4213 weight_vec, rnd_vec, offset_vec, 4214 tmp0, tmp1, tmp2, tmp3); 4215 PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 4216 ST_SH2(tmp0, tmp1, dst, dst_stride); 4217 dst += (2 * dst_stride); 4218 LD_SB2(src0_ptr, src_stride, src5, src2); 4219 src0_ptr += (2 * src_stride); 4220 4221 LD_SH2(src1_ptr, src2_stride, in0, in1); 4222 LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4223 src1_ptr += (2 * src2_stride); 4224 XORI_B2_128_SB(src5, src2); 4225 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 4226 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 4227 4228 tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, 
filt1); 4229 tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4230 tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1); 4231 tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1); 4232 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 4233 in0, in1, in2, in3, 4234 weight_vec, rnd_vec, offset_vec, 4235 tmp0, tmp1, tmp2, tmp3); 4236 4237 PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 4238 ST_SH2(tmp0, tmp1, dst, dst_stride); 4239 dst += (2 * dst_stride); 4240 } 4241} 4242 4243static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, 4244 int32_t src_stride, 4245 int16_t *src1_ptr, 4246 int32_t src2_stride, 4247 uint8_t *dst, 4248 int32_t dst_stride, 4249 const int8_t *filter, 4250 int32_t height, 4251 int32_t weight0, 4252 int32_t weight1, 4253 int32_t offset0, 4254 int32_t offset1, 4255 int32_t rnd_val) 4256{ 4257 uint32_t loop_cnt; 4258 int32_t offset, weight, constant; 4259 v16i8 src0, src1, src2, src3, src4, src5; 4260 v16i8 src6, src7, src8, src9, src10, src11; 4261 v8i16 in0, in1, in2, in3, in4, in5; 4262 v16i8 src10_r, src32_r, src76_r, src98_r; 4263 v16i8 src10_l, src32_l, src21_l, src43_l; 4264 v16i8 src21_r, src43_r, src87_r, src109_r; 4265 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4266 v8i16 filt0, filt1; 4267 v8i16 filter_vec; 4268 v4i32 weight_vec, offset_vec, rnd_vec; 4269 4270 src0_ptr -= src_stride; 4271 4272 offset = (offset0 + offset1) << rnd_val; 4273 weight0 = weight0 & 0x0000FFFF; 4274 weight = weight0 | (weight1 << 16); 4275 constant = 128 * weight1; 4276 constant <<= 6; 4277 offset += constant; 4278 4279 offset_vec = __msa_fill_w(offset); 4280 weight_vec = __msa_fill_w(weight); 4281 rnd_vec = __msa_fill_w(rnd_val + 1); 4282 4283 filter_vec = LD_SH(filter); 4284 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4285 4286 /* 16width */ 4287 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4288 XORI_B3_128_SB(src0, src1, src2); 4289 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4290 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4291 /* 8width */ 4292 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); 4293 src0_ptr += (3 * src_stride); 4294 XORI_B3_128_SB(src6, src7, src8); 4295 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 4296 4297 for (loop_cnt = (height >> 2); loop_cnt--;) { 4298 /* 16width */ 4299 LD_SB2(src0_ptr, src_stride, src3, src4); 4300 LD_SH2(src1_ptr, src2_stride, in0, in1); 4301 LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4302 XORI_B2_128_SB(src3, src4); 4303 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4304 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4305 4306 /* 8width */ 4307 LD_SB2(src0_ptr + 16, src_stride, src9, src10); 4308 src0_ptr += (2 * src_stride); 4309 LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 4310 src1_ptr += (2 * src2_stride); 4311 XORI_B2_128_SB(src9, src10); 4312 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 4313 /* 16width */ 4314 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4315 tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4316 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4317 tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4318 /* 8width */ 4319 tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 4320 tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 4321 /* 16width */ 4322 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, 4323 in0, in1, in2, in3, 4324 weight_vec, rnd_vec, offset_vec, 4325 tmp0, tmp1, tmp4, tmp5); 4326 /* 8width */ 4327 HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5, 4328 weight_vec, rnd_vec, offset_vec, 4329 
tmp2, tmp3); 4330 /* 16width */ 4331 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1); 4332 /* 8width */ 4333 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2); 4334 ST_SH2(tmp0, tmp1, dst, dst_stride); 4335 ST_D2(tmp2, 0, 1, dst + 16, dst_stride); 4336 dst += (2 * dst_stride); 4337 4338 /* 16width */ 4339 LD_SB2(src0_ptr, src_stride, src5, src2); 4340 LD_SH2(src1_ptr, src2_stride, in0, in1); 4341 LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4342 XORI_B2_128_SB(src5, src2); 4343 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 4344 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 4345 /* 8width */ 4346 LD_SB2(src0_ptr + 16, src_stride, src11, src8); 4347 src0_ptr += (2 * src_stride); 4348 LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 4349 src1_ptr += (2 * src2_stride); 4350 XORI_B2_128_SB(src11, src8); 4351 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 4352 /* 16width */ 4353 tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 4354 tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1); 4355 tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4356 tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1); 4357 /* 8width */ 4358 tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1); 4359 tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1); 4360 /* 16width */ 4361 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, 4362 in0, in1, in2, in3, 4363 weight_vec, rnd_vec, offset_vec, 4364 tmp0, tmp1, tmp4, tmp5); 4365 /* 8width */ 4366 HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5, 4367 weight_vec, rnd_vec, offset_vec, 4368 tmp2, tmp3); 4369 /* 16width */ 4370 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1); 4371 4372 /* 8width */ 4373 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2); 4374 ST_SH2(tmp0, tmp1, dst, dst_stride); 4375 ST_D2(tmp2, 0, 1, dst + 16, dst_stride); 4376 dst += (2 * dst_stride); 4377 } 4378} 4379 4380static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr, 4381 int32_t src_stride, 4382 int16_t *src1_ptr, 4383 int32_t src2_stride, 4384 uint8_t *dst, 4385 int32_t dst_stride, 4386 const int8_t *filter, 4387 int32_t height, 4388 int32_t weight0, 4389 int32_t weight1, 4390 int32_t offset0, 4391 int32_t offset1, 4392 int32_t rnd_val) 4393{ 4394 uint32_t loop_cnt; 4395 uint8_t *dst_tmp = dst + 16; 4396 int32_t offset, weight, constant; 4397 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; 4398 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 4399 v16i8 src10_r, src32_r, src76_r, src98_r; 4400 v16i8 src21_r, src43_r, src87_r, src109_r; 4401 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4402 v16i8 src10_l, src32_l, src76_l, src98_l; 4403 v16i8 src21_l, src43_l, src87_l, src109_l; 4404 v8i16 filt0, filt1; 4405 v8i16 filter_vec; 4406 v4i32 weight_vec, offset_vec, rnd_vec; 4407 4408 src0_ptr -= src_stride; 4409 4410 offset = (offset0 + offset1) << rnd_val; 4411 weight0 = weight0 & 0x0000FFFF; 4412 weight = weight0 | (weight1 << 16); 4413 constant = 128 * weight1; 4414 constant <<= 6; 4415 offset += constant; 4416 4417 offset_vec = __msa_fill_w(offset); 4418 weight_vec = __msa_fill_w(weight); 4419 rnd_vec = __msa_fill_w(rnd_val + 1); 4420 4421 filter_vec = LD_SH(filter); 4422 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4423 4424 /* 16width */ 4425 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4426 XORI_B3_128_SB(src0, src1, src2); 4427 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4428 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4429 /* next 16width */ 4430 LD_SB3(src0_ptr + 16, src_stride, src6, src7, 
src8); 4431 src0_ptr += (3 * src_stride); 4432 XORI_B3_128_SB(src6, src7, src8); 4433 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 4434 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 4435 4436 for (loop_cnt = (height >> 1); loop_cnt--;) { 4437 /* 16width */ 4438 LD_SB2(src0_ptr, src_stride, src3, src4); 4439 LD_SH2(src1_ptr, src2_stride, in0, in1); 4440 LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4441 XORI_B2_128_SB(src3, src4); 4442 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4443 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4444 4445 /* 16width */ 4446 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4447 tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4448 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4449 tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4450 /* 16width */ 4451 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, 4452 in0, in1, in2, in3, 4453 weight_vec, rnd_vec, offset_vec, 4454 tmp0, tmp1, tmp4, tmp5); 4455 /* 16width */ 4456 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1); 4457 ST_SH2(tmp0, tmp1, dst, dst_stride); 4458 dst += (2 * dst_stride); 4459 4460 src10_r = src32_r; 4461 src21_r = src43_r; 4462 src10_l = src32_l; 4463 src21_l = src43_l; 4464 src2 = src4; 4465 4466 /* next 16width */ 4467 LD_SB2(src0_ptr + 16, src_stride, src9, src10); 4468 src0_ptr += (2 * src_stride); 4469 LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 4470 LD_SH2(src1_ptr + 24, src2_stride, in6, in7); 4471 src1_ptr += (2 * src2_stride); 4472 XORI_B2_128_SB(src9, src10); 4473 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 4474 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); 4475 /* next 16width */ 4476 tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 4477 tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1); 4478 tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 4479 tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1); 4480 /* next 16width */ 4481 HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7, 4482 in4, in5, in6, in7, 4483 weight_vec, rnd_vec, offset_vec, 4484 tmp2, tmp3, tmp6, tmp7); 4485 4486 /* next 16width */ 4487 PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3); 4488 ST_SH2(tmp2, tmp3, dst_tmp, dst_stride); 4489 dst_tmp += (2 * dst_stride); 4490 4491 src76_r = src98_r; 4492 src87_r = src109_r; 4493 src76_l = src98_l; 4494 src87_l = src109_l; 4495 src8 = src10; 4496 } 4497} 4498 4499static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, 4500 int32_t src_stride, 4501 int16_t *src1_ptr, 4502 int32_t src2_stride, 4503 uint8_t *dst, 4504 int32_t dst_stride, 4505 const int8_t *filter_x, 4506 const int8_t *filter_y, 4507 int32_t weight0, 4508 int32_t weight1, 4509 int32_t offset0, 4510 int32_t offset1, 4511 int32_t rnd_val) 4512{ 4513 uint64_t tp0, tp1; 4514 int32_t offset, weight; 4515 v8i16 in0 = { 0 }; 4516 v16u8 out; 4517 v16i8 src0, src1, src2, src3, src4; 4518 v8i16 filt0, filt1; 4519 v8i16 filt_h0, filt_h1; 4520 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4521 v16i8 mask1; 4522 v8i16 filter_vec, tmp, weight_vec; 4523 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 4524 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1; 4525 v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec; 4526 4527 src0_ptr -= (src_stride + 1); 4528 4529 filter_vec = LD_SH(filter_x); 4530 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4531 4532 filter_vec = LD_SH(filter_y); 4533 UNPCK_R_SB_SH(filter_vec, filter_vec); 4534 4535 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4536 4537 mask1 = mask0 + 2; 4538 
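    /* Weighted bi-prediction, effectively
     *   out = clip255((filt(src0) * w1 + src1[x] * w0 +
     *                  ((o0 + o1 + 1) << rnd_val)) >> (rnd_val + 1))
     * w0 and w1 are packed into one 32-bit lane so a single
     * __msa_dpadd_s_w forms both products.  Note that only
     * (o0 + o1) << rnd_val is materialised below: the rounding shift
     * (SRAR) supplies the missing 1 << rnd_val, and const_vec restores
     * the (128 << 6) * w1 bias removed from the first-pass pixels by
     * XORI. */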
4539 offset = (offset0 + offset1) << rnd_val; 4540 weight0 = weight0 & 0x0000FFFF; 4541 weight = weight0 | (weight1 << 16); 4542 4543 const_vec = __msa_fill_w((128 * weight1)); 4544 const_vec <<= 6; 4545 offset_vec = __msa_fill_w(offset); 4546 weight_vec = (v8i16) __msa_fill_w(weight); 4547 rnd_vec = __msa_fill_w(rnd_val + 1); 4548 offset_vec += const_vec; 4549 4550 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4); 4551 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4552 4553 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1); 4554 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3); 4555 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5); 4556 4557 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4558 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4559 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4560 4561 ILVRL_H2_SH(dst31, dst20, dst10, dst32); 4562 ILVRL_H2_SH(dst42, dst31, dst21, dst43); 4563 4564 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 4565 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 4566 dst0 >>= 6; 4567 dst1 >>= 6; 4568 dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 4569 4570 LD2(src1_ptr, src2_stride, tp0, tp1); 4571 INSERT_D2_SH(tp0, tp1, in0); 4572 4573 ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 4574 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 4575 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 4576 SRAR_W2_SW(dst0, dst1, rnd_vec); 4577 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 4578 CLIP_SH_0_255(tmp); 4579 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); 4580 ST_W2(out, 0, 1, dst, dst_stride); 4581} 4582 4583static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, 4584 int32_t src_stride, 4585 int16_t *src1_ptr, 4586 int32_t src2_stride, 4587 uint8_t *dst, 4588 int32_t dst_stride, 4589 const int8_t *filter_x, 4590 const int8_t *filter_y, 4591 int32_t weight0, 4592 int32_t weight1, 4593 int32_t offset0, 4594 int32_t offset1, 4595 int32_t rnd_val) 4596{ 4597 uint64_t tp0, tp1; 4598 int32_t offset, weight; 4599 v16u8 out; 4600 v8i16 in0 = { 0 }, in1 = { 0 }; 4601 v16i8 src0, src1, src2, src3, src4, src5, src6; 4602 v8i16 filt0, filt1; 4603 v8i16 filt_h0, filt_h1; 4604 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4605 v16i8 mask1; 4606 v8i16 filter_vec, weight_vec; 4607 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4608 v8i16 tmp0, tmp1, tmp2, tmp3; 4609 v8i16 dst30, dst41, dst52, dst63; 4610 v8i16 dst10, dst32, dst54, dst21, dst43, dst65; 4611 v4i32 offset_vec, rnd_vec, const_vec; 4612 v4i32 dst0, dst1, dst2, dst3; 4613 4614 src0_ptr -= (src_stride + 1); 4615 4616 filter_vec = LD_SH(filter_x); 4617 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4618 4619 filter_vec = LD_SH(filter_y); 4620 UNPCK_R_SB_SH(filter_vec, filter_vec); 4621 4622 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4623 4624 mask1 = mask0 + 2; 4625 4626 offset = (offset0 + offset1) << rnd_val; 4627 weight0 = weight0 & 0x0000FFFF; 4628 weight = weight0 | (weight1 << 16); 4629 4630 const_vec = __msa_fill_w((128 * weight1)); 4631 const_vec <<= 6; 4632 offset_vec = __msa_fill_w(offset); 4633 weight_vec = (v8i16) __msa_fill_w(weight); 4634 rnd_vec = __msa_fill_w(rnd_val + 1); 4635 offset_vec += const_vec; 4636 4637 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 4638 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 4639 4640 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1); 4641 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3); 4642 
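    /* Source rows are paired three apart (0/3, 1/4, 2/5, 3/6), so the
     * four shuffles feed four horizontal passes that together cover all
     * seven input rows, two rows per vector (dst30..dst63). */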
VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5); 4643 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7); 4644 4645 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4646 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4647 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4648 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4649 4650 ILVRL_H2_SH(dst41, dst30, dst10, dst43); 4651 ILVRL_H2_SH(dst52, dst41, dst21, dst54); 4652 ILVRL_H2_SH(dst63, dst52, dst32, dst65); 4653 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 4654 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 4655 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1); 4656 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1); 4657 SRA_4V(dst0, dst1, dst2, dst3, 6); 4658 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 4659 4660 LD2(src1_ptr, src2_stride, tp0, tp1); 4661 INSERT_D2_SH(tp0, tp1, in0); 4662 src1_ptr += (2 * src2_stride); 4663 LD2(src1_ptr, src2_stride, tp0, tp1); 4664 INSERT_D2_SH(tp0, tp1, in1); 4665 4666 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 4667 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 4668 4669 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 4670 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 4671 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 4672 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 4673 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 4674 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 4675 CLIP_SH2_0_255(tmp0, tmp1); 4676 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 4677 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 4678} 4679 4680static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, 4681 int32_t src_stride, 4682 int16_t *src1_ptr, 4683 int32_t src2_stride, 4684 uint8_t *dst, 4685 int32_t dst_stride, 4686 const int8_t *filter_x, 4687 const int8_t *filter_y, 4688 int32_t height, 4689 int32_t weight0, 4690 int32_t weight1, 4691 int32_t offset0, 4692 int32_t offset1, 4693 int32_t rnd_val) 4694{ 4695 uint32_t loop_cnt; 4696 uint64_t tp0, tp1; 4697 int32_t offset, weight; 4698 v16u8 out0, out1; 4699 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 4700 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4701 v8i16 filt0, filt1; 4702 v8i16 filt_h0, filt_h1; 4703 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4704 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4705 v16i8 mask1; 4706 v8i16 filter_vec, weight_vec; 4707 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4708 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 4709 v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 4710 v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 4711 v8i16 dst98_r, dst109_r; 4712 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4713 v4i32 offset_vec, rnd_vec, const_vec; 4714 4715 src0_ptr -= (src_stride + 1); 4716 4717 filter_vec = LD_SH(filter_x); 4718 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4719 4720 filter_vec = LD_SH(filter_y); 4721 UNPCK_R_SB_SH(filter_vec, filter_vec); 4722 4723 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4724 4725 mask1 = mask0 + 2; 4726 4727 offset = (offset0 + offset1) << rnd_val; 4728 weight0 = weight0 & 0x0000FFFF; 4729 weight = weight0 | (weight1 << 16); 4730 4731 const_vec = __msa_fill_w((128 * weight1)); 4732 const_vec <<= 6; 4733 offset_vec = __msa_fill_w(offset); 4734 weight_vec = (v8i16) __msa_fill_w(weight); 4735 rnd_vec = __msa_fill_w(rnd_val + 1); 4736 offset_vec += const_vec; 4737 4738 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4739 src0_ptr += (3 * 
src_stride); 4740 XORI_B3_128_SB(src0, src1, src2); 4741 4742 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 4743 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3); 4744 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4745 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4746 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 4747 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 4748 4749 for (loop_cnt = height >> 3; loop_cnt--;) { 4750 LD_SB8(src0_ptr, src_stride, 4751 src3, src4, src5, src6, src7, src8, src9, src10); 4752 src0_ptr += (8 * src_stride); 4753 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4754 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1); 4755 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3); 4756 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5); 4757 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7); 4758 4759 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4760 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4761 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4762 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4763 4764 dst32_r = __msa_ilvr_h(dst73, dst22); 4765 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 4766 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4767 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4768 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 4769 dst76_r = __msa_ilvr_h(dst22, dst106); 4770 4771 LD2(src1_ptr, src2_stride, tp0, tp1); 4772 src1_ptr += 2 * src2_stride; 4773 INSERT_D2_SH(tp0, tp1, in0); 4774 LD2(src1_ptr, src2_stride, tp0, tp1); 4775 src1_ptr += 2 * src2_stride; 4776 INSERT_D2_SH(tp0, tp1, in1); 4777 4778 LD2(src1_ptr, src2_stride, tp0, tp1); 4779 src1_ptr += 2 * src2_stride; 4780 INSERT_D2_SH(tp0, tp1, in2); 4781 LD2(src1_ptr, src2_stride, tp0, tp1); 4782 src1_ptr += 2 * src2_stride; 4783 INSERT_D2_SH(tp0, tp1, in3); 4784 4785 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4786 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4787 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4788 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4789 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4790 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4791 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4792 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4793 SRA_4V(dst0, dst1, dst2, dst3, 6); 4794 SRA_4V(dst4, dst5, dst6, dst7, 6); 4795 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, 4796 dst2, dst3); 4797 ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 4798 ILVRL_H2_SH(dst1, in1, tmp2, tmp3); 4799 ILVRL_H2_SH(dst2, in2, tmp4, tmp5); 4800 ILVRL_H2_SH(dst3, in3, tmp6, tmp7); 4801 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 4802 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 4803 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 4804 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 4805 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec); 4806 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec); 4807 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec); 4808 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec); 4809 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 4810 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 4811 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, 4812 tmp2, tmp3); 4813 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4814 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4815 
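        /* out0/out1 now hold the eight 4-pixel rows of this iteration;
         * after the stores below, the last three vertical taps stay live
         * as history (dst98_r, dst109_r, and dst22 re-split from dst106)
         * for the next 8-row pass. */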
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4816 dst += (8 * dst_stride); 4817 4818 dst10_r = dst98_r; 4819 dst21_r = dst109_r; 4820 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4821 } 4822} 4823 4824static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr, 4825 int32_t src_stride, 4826 int16_t *src1_ptr, 4827 int32_t src2_stride, 4828 uint8_t *dst, 4829 int32_t dst_stride, 4830 const int8_t *filter_x, 4831 const int8_t *filter_y, 4832 int32_t height, 4833 int32_t weight0, 4834 int32_t weight1, 4835 int32_t offset0, 4836 int32_t offset1, 4837 int32_t rnd_val) 4838{ 4839 if (2 == height) { 4840 hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4841 dst, dst_stride, filter_x, filter_y, 4842 weight0, weight1, offset0, offset1, rnd_val); 4843 } else if (4 == height) { 4844 hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4845 dst, dst_stride, filter_x, filter_y, 4846 weight0, weight1, offset0, offset1, rnd_val); 4847 } else if (0 == (height % 8)) { 4848 hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride, 4849 src1_ptr, src2_stride, 4850 dst, dst_stride, filter_x, filter_y, 4851 height, weight0, weight1, 4852 offset0, offset1, rnd_val); 4853 } 4854} 4855 4856static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, 4857 int32_t src_stride, 4858 int16_t *src1_ptr, 4859 int32_t src2_stride, 4860 uint8_t *dst, 4861 int32_t dst_stride, 4862 const int8_t *filter_x, 4863 const int8_t *filter_y, 4864 int32_t height, 4865 int32_t weight0, 4866 int32_t weight1, 4867 int32_t offset0, 4868 int32_t offset1, 4869 int32_t rnd_val) 4870{ 4871 uint32_t tpw0, tpw1, tpw2, tpw3; 4872 uint64_t tp0, tp1; 4873 int32_t offset, weight; 4874 v16u8 out0, out1, out2; 4875 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4876 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 4877 v8i16 in4 = { 0 }, in5 = { 0 }; 4878 v8i16 filt0, filt1; 4879 v8i16 filt_h0, filt_h1, filter_vec; 4880 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4881 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4882 v16i8 mask1; 4883 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9; 4884 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec; 4885 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r; 4886 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l; 4887 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l; 4888 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l; 4889 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4890 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4891 v4i32 dst4_r, dst5_r, dst6_r, dst7_r; 4892 v4i32 offset_vec, rnd_vec, const_vec; 4893 4894 src0_ptr -= (src_stride + 1); 4895 4896 filter_vec = LD_SH(filter_x); 4897 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4898 4899 filter_vec = LD_SH(filter_y); 4900 UNPCK_R_SB_SH(filter_vec, filter_vec); 4901 4902 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4903 4904 mask1 = mask0 + 2; 4905 4906 offset = (offset0 + offset1) << rnd_val; 4907 weight0 = weight0 & 0x0000FFFF; 4908 weight = weight0 | (weight1 << 16); 4909 4910 const_vec = __msa_fill_w((128 * weight1)); 4911 const_vec <<= 6; 4912 offset_vec = __msa_fill_w(offset); 4913 weight_vec = (v8i16) __msa_fill_w(weight); 4914 rnd_vec = __msa_fill_w(rnd_val + 1); 4915 offset_vec += const_vec; 4916 4917 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4918 src0_ptr += (3 * src_stride); 4919 XORI_B3_128_SB(src0, src1, src2); 4920 
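    /* The 6-wide case is handled as one fully unrolled 8-row tile:
     * columns 0..3 travel through the right-interleaved (_r) vectors and
     * are stored with ST_W8, while the left-interleaved halves are packed
     * pairwise (dst1021_l, ...) so columns 4..5 can be filtered and
     * stored on their own via ST_H8. */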
4921 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4922 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4923 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4924 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4925 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4926 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4927 4928 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 4929 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 4930 4931 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9, 4932 src10); 4933 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4934 4935 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4936 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4937 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4938 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4939 4940 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4941 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4942 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4943 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4944 4945 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 4946 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3); 4947 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5); 4948 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7); 4949 4950 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4951 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4952 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4953 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4954 4955 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 4956 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 4957 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 4958 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 4959 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 4960 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 4961 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l); 4962 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l); 4963 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l); 4964 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l); 4965 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l); 4966 4967 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4968 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4969 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4970 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4971 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4972 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4973 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4974 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4975 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1); 4976 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1); 4977 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1); 4978 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1); 4979 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 4980 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 4981 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); 4982 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1); 4983 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3); 4984 4985 LD2(src1_ptr, src2_stride, tp0, tp1); 4986 INSERT_D2_SH(tp0, tp1, in0); 4987 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1); 4988 INSERT_D2_SH(tp0, 
tp1, in1); 4989 4990 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1); 4991 INSERT_D2_SH(tp0, tp1, in2); 4992 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1); 4993 INSERT_D2_SH(tp0, tp1, in3); 4994 4995 ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 4996 ILVRL_H2_SH(dst1, in1, tmp2, tmp3); 4997 ILVRL_H2_SH(dst2, in2, tmp4, tmp5); 4998 ILVRL_H2_SH(dst3, in3, tmp6, tmp7); 4999 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 5000 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 5001 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 5002 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 5003 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec); 5004 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec); 5005 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec); 5006 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec); 5007 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 5008 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 5009 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, 5010 tmp2, tmp3); 5011 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 5012 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 5013 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 5014 5015 PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5); 5016 5017 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3); 5018 src1_ptr += (4 * src2_stride); 5019 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4); 5020 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3); 5021 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5); 5022 5023 ILVRL_H2_SH(dst4, in4, tmp0, tmp1); 5024 ILVRL_H2_SH(dst5, in5, tmp2, tmp3); 5025 5026 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 5027 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 5028 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 5029 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 5030 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 5031 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5); 5032 5033 CLIP_SH2_0_255(tmp4, tmp5); 5034 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 5035 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); 5036} 5037 5038static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, 5039 int32_t src_stride, 5040 int16_t *src1_ptr, 5041 int32_t src2_stride, 5042 uint8_t *dst, 5043 int32_t dst_stride, 5044 const int8_t *filter_x, 5045 const int8_t *filter_y, 5046 int32_t weight0, 5047 int32_t weight1, 5048 int32_t offset0, 5049 int32_t offset1, 5050 int32_t rnd_val) 5051{ 5052 int32_t weight, offset; 5053 v16u8 out; 5054 v16i8 src0, src1, src2, src3, src4; 5055 v8i16 filt0, filt1; 5056 v8i16 filt_h0, filt_h1; 5057 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 5058 v16i8 mask1; 5059 v8i16 filter_vec, weight_vec; 5060 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 5061 v8i16 dst0, dst1, dst2, dst3, dst4; 5062 v8i16 in0, in1; 5063 v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 5064 v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 5065 v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 5066 v8i16 tmp0, tmp1, tmp2, tmp3; 5067 v4i32 offset_vec, rnd_vec, const_vec; 5068 5069 src0_ptr -= (src_stride + 1); 5070 5071 filter_vec = LD_SH(filter_x); 5072 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 5073 5074 filter_vec = LD_SH(filter_y); 5075 UNPCK_R_SB_SH(filter_vec, filter_vec); 5076 5077 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 5078 5079 mask1 = mask0 + 2; 5080 5081 offset = (offset0 + offset1) << rnd_val; 5082 weight0 = weight0 & 0x0000FFFF; 5083 weight = weight0 | (weight1 << 16); 5084 5085 const_vec = __msa_fill_w((128 * 
weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    LD_SH2(src1_ptr, src2_stride, in0, in1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);

    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);

    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}

static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
                                         int32_t src_stride,
                                         int16_t *src1_ptr,
                                         int32_t src2_stride,
                                         uint8_t *dst,
                                         int32_t dst_stride,
                                         const int8_t *filter_x,
                                         const int8_t *filter_y,
                                         int32_t weight0,
                                         int32_t weight1,
                                         int32_t offset0,
                                         int32_t offset1,
                                         int32_t rnd_val,
                                         int32_t width8mult)
{
    int32_t weight, offset;
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src0_ptr += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += 8;

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);

        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
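
/* Scalar model of the weighted bi-prediction combine performed by the
 * ILVRL_H2 / __msa_dpadd_s_w / SRAR_W4 sequences in this file (a reference
 * sketch only, not built; the function and parameter names are illustrative).
 * "p1" is one 16-bit sample from src1_ptr, "p0" the freshly filtered sample
 * derived from src0_ptr.  Because the 8-bit source is XORed with 128 before
 * filtering, p0 is short of its true value by 128 << 6; folding
 * const_vec = (128 * weight1) << 6 into offset_vec restores that bias.
 * SRAR adds 1 << rnd_val before shifting, which supplies the usual HEVC
 * rounding term.
 */
#if 0
static uint8_t bi_weight_pixel(int32_t p0, int32_t p1,
                               int32_t weight0, int32_t weight1,
                               int32_t offset0, int32_t offset1,
                               int32_t rnd_val)
{
    int32_t val = p1 * weight0 + p0 * weight1 +
                  ((offset0 + offset1) << rnd_val) + (1 << rnd_val);

    return av_clip_uint8(val >> (rnd_val + 1));
}
#endif
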
static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                dst0, dst1, dst2, dst3);

    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                tmp0, tmp1, tmp2, tmp3);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);

    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
    ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
    CLIP_SH2_0_255(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
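
/* Generic 8-column H+V engine: a three-row prologue fills the vertical
 * 4-tap window (the dst10/dst21 interleaved pairs), then each iteration
 * filters four new rows, combines them with four rows of 16-bit input and
 * slides the window (dst10 = dst54, dst21 = dst65, dsth2 = dsth6), so only
 * four horizontal filterings are needed per four output rows.
 */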
static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter_x,
                                             const int8_t *filter_y,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val,
                                             int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    int32_t offset, weight;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
            ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
            ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
            ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, dst0, dst1, dst2, dst3);
            ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
            ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
            ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
            ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
            dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
            dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
            dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
            dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
            dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
            dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
            dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
            dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
            SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
            SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
            PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                        tmp0, tmp1, tmp2, tmp3);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dsth2 = dsth6;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
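
/* 8-pixel-wide blocks dispatch on height: the fixed-size 8x2 and 8x6
 * kernels avoid loop overhead, height 4 reuses the 8multx4 kernel with
 * width8mult = 1, and any other multiple of 4 goes through the generic
 * 8multx4mult engine with width = 8.
 */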
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
    } else if (6 == height) {
        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 4)) {
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter_x, filter_y,
                                         height, weight0,
                                         weight1, offset0, offset1, rnd_val, 8);
    }
}
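
/* 12-pixel-wide blocks are split into an 8-wide column (same scheme as the
 * generic engine above; the loop counts here assume a height of 16) and a
 * 4-wide column that uses the second shuffle pattern in ff_hevc_mask_arr
 * (offset 16) to horizontally filter two rows per vector, producing eight
 * rows per iteration.
 */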
static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
    src0_ptr_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src0_ptr += 8;
    dst += 8;
    src1_ptr += 8;

    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    if (4 == height) {
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 2);
    } else {
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
                                         src2_stride, dst, dst_stride,
                                         filter_x, filter_y, height, weight0,
                                         weight1, offset0, offset1, rnd_val,
                                         16);
    }
}

static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 24);
}

static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 32);
}
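
/* Thin wrappers glueing the kernels above into the ff_hevc_put_hevc_bi_w_*
 * entry points.  All three macro families derive the same rounding factor
 * for 8-bit content: BI_W_MC_COPY spells it as denom + (14 + 1 - 8) - 1 and
 * the filtering wrappers as denom + 14 - 8, i.e. log2Wd = denom + 6 either
 * way.
 */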
#define BI_W_MC_COPY(WIDTH)                                                   \
void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,            \
                                                     ptrdiff_t dst_stride,    \
                                                     uint8_t *src,            \
                                                     ptrdiff_t src_stride,    \
                                                     int16_t *src_16bit,      \
                                                     int height,              \
                                                     int denom,               \
                                                     int weight0,             \
                                                     int weight1,             \
                                                     int offset0,             \
                                                     int offset1,             \
                                                     intptr_t mx,             \
                                                     intptr_t my,             \
                                                     int width)               \
{                                                                             \
    int shift = 14 + 1 - 8;                                                   \
    int log2Wd = denom + shift - 1;                                           \
                                                                              \
    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,   \
                                   dst, dst_stride, height,                   \
                                   weight0, weight1, offset0,                 \
                                   offset1, log2Wd);                          \
}

BI_W_MC_COPY(4);
BI_W_MC_COPY(6);
BI_W_MC_COPY(8);
BI_W_MC_COPY(12);
BI_W_MC_COPY(16);
BI_W_MC_COPY(24);
BI_W_MC_COPY(32);
BI_W_MC_COPY(48);
BI_W_MC_COPY(64);

#undef BI_W_MC_COPY

#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                         \
void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,         \
                                                        ptrdiff_t dst_stride, \
                                                        uint8_t *src,         \
                                                        ptrdiff_t src_stride, \
                                                        int16_t *src_16bit,   \
                                                        int height,           \
                                                        int denom,            \
                                                        int weight0,          \
                                                        int weight1,          \
                                                        int offset0,          \
                                                        int offset1,          \
                                                        intptr_t mx,          \
                                                        intptr_t my,          \
                                                        int width)            \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
    int log2Wd = denom + 14 - 8;                                              \
                                                                              \
    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \
                                                MAX_PB_SIZE, dst, dst_stride, \
                                                filter, height, weight0,      \
                                                weight1, offset0, offset1,    \
                                                log2Wd);                      \
}

BI_W_MC(qpel, h, 4, 8, hz, mx);
BI_W_MC(qpel, h, 8, 8, hz, mx);
BI_W_MC(qpel, h, 12, 8, hz, mx);
BI_W_MC(qpel, h, 16, 8, hz, mx);
BI_W_MC(qpel, h, 24, 8, hz, mx);
BI_W_MC(qpel, h, 32, 8, hz, mx);
BI_W_MC(qpel, h, 48, 8, hz, mx);
BI_W_MC(qpel, h, 64, 8, hz, mx);

BI_W_MC(qpel, v, 4, 8, vt, my);
BI_W_MC(qpel, v, 8, 8, vt, my);
BI_W_MC(qpel, v, 12, 8, vt, my);
BI_W_MC(qpel, v, 16, 8, vt, my);
BI_W_MC(qpel, v, 24, 8, vt, my);
BI_W_MC(qpel, v, 32, 8, vt, my);
BI_W_MC(qpel, v, 48, 8, vt, my);
BI_W_MC(qpel, v, 64, 8, vt, my);

BI_W_MC(epel, h, 4, 4, hz, mx);
BI_W_MC(epel, h, 8, 4, hz, mx);
BI_W_MC(epel, h, 6, 4, hz, mx);
BI_W_MC(epel, h, 12, 4, hz, mx);
BI_W_MC(epel, h, 16, 4, hz, mx);
BI_W_MC(epel, h, 24, 4, hz, mx);
BI_W_MC(epel, h, 32, 4, hz, mx);

BI_W_MC(epel, v, 4, 4, vt, my);
BI_W_MC(epel, v, 8, 4, vt, my);
BI_W_MC(epel, v, 6, 4, vt, my);
BI_W_MC(epel, v, 12, 4, vt, my);
BI_W_MC(epel, v, 16, 4, vt, my);
BI_W_MC(epel, v, 24, 4, vt, my);
BI_W_MC(epel, v, 32, 4, vt, my);

#undef BI_W_MC

#define BI_W_MC_HV(PEL, WIDTH, TAP)                                           \
void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,            \
                                                     ptrdiff_t dst_stride,    \
                                                     uint8_t *src,            \
                                                     ptrdiff_t src_stride,    \
                                                     int16_t *src_16bit,      \
                                                     int height,              \
                                                     int denom,               \
                                                     int weight0,             \
                                                     int weight1,             \
                                                     int offset0,             \
                                                     int offset1,             \
                                                     intptr_t mx,             \
                                                     intptr_t my,             \
                                                     int width)               \
{                                                                             \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
    int log2Wd = denom + 14 - 8;                                              \
                                                                              \
    hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,         \
                                          MAX_PB_SIZE, dst, dst_stride,       \
                                          filter_x, filter_y, height,         \
                                          weight0, weight1, offset0,          \
                                          offset1, log2Wd);                   \
}

BI_W_MC_HV(qpel, 4, 8);
BI_W_MC_HV(qpel, 8, 8);
BI_W_MC_HV(qpel, 12, 8);
BI_W_MC_HV(qpel, 16, 8);
BI_W_MC_HV(qpel, 24, 8);
BI_W_MC_HV(qpel, 32, 8);
BI_W_MC_HV(qpel, 48, 8);
BI_W_MC_HV(qpel, 64, 8);

BI_W_MC_HV(epel, 4, 4);
BI_W_MC_HV(epel, 8, 4);
BI_W_MC_HV(epel, 6, 4);
BI_W_MC_HV(epel, 12, 4);
BI_W_MC_HV(epel, 16, 4);
BI_W_MC_HV(epel, 24, 4);
BI_W_MC_HV(epel, 32, 4);

#undef BI_W_MC_HV