/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

/* Weight, round and clip two vectors of 16-bit intermediates:
 * out = clip_u8(((in * wgt) >> rnd) + offset). The multiply is widened
 * to 32 bits via dot products against the word-replicated weight. */
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
                                       out0_h, out1_h)                       \
{                                                                            \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                \
                                                                             \
    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                             \
    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                             \
    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,     \
                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                  \
    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                   \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);         \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);         \
    CLIP_SH2_0_255(out0_h, out1_h);                                          \
}

#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,    \
                                       offset_h, rnd_w, out0_h, out1_h,     \
                                       out2_h, out3_h)                      \
{                                                                           \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,    \
                                   out0_h, out1_h);                         \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,    \
                                   out2_h, out3_h);                         \
}

static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        v4i32 dst0_r, dst0_l;

        LW2(src, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        dst0 += offset_vec;
        CLIP_SH_0_255(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1,
                                       weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == (height % 8)) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += 8 * dst_stride;
        }
    }
}

static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec =
              __msa_fill_w(rnd_val);

    if (2 == height) {
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);

            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
                  dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 offset_vec;
    v16i8 zero = { 0 };
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero,
                   src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src4, src5);
        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
        src += (4 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 16, src_stride, src2, src3);
        src += (2 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_UB2(out2, out3, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
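        /* out0..out2 pack the first 48-pixel row, out3..out5 the second */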
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        ILVRL_B2_SH(zero, src6, dst12, dst13);
        ILVRL_B2_SH(zero, src7, dst14, dst15);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        SLLI_4V(dst12, dst13, dst14, dst15, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
                                       offset_vec, rnd_vec, dst12, dst13, dst14,
                                       dst15);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
        ST_UB4(out4, out5, out6, out7, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
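    /* the source bytes are XORed with 128 below so signed multiplies can
     * be used; with the filter taps summing to 64 that biases each result
     * by -(128 * 64), so (128 * weight) >> (rnd_val - 6) is added back
     * into the offset here */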
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);
        dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                  filt2, filt3);
        dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                  filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                 filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);
        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst +
              8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_UB2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
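    /* the eight 8-bit taps load as four 16-bit lanes; each filtN below
     * replicates one adjacent tap pair for the byte-wise dot products */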
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB4(src, 8, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 8, src4, src5, src6, src7);
        src += src_stride;
        XORI_B8_128_SB(src0,
                       src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
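        /* bytes 40..55 from the extra unaligned load give the last eight
         * output columns their full 8-tap neighbourhood */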
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);
            src_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec4, vec5, vec6, vec7);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec8, vec9, vec10, vec11);
            VSHF_B4_SB(src2, src2, mask0,
                       mask1, mask2, mask3,
                       vec12, vec13, vec14, vec15);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
                                     filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                     filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                     filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);

            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_UB2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src += src_stride;
        dst += dst_stride;
    }
}

static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);

    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    ILVR_D3_SB(src21_r, src10_r, src43_r,
               src32_r, src65_r, src54_r, src2110, src4332, src6554);

    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
        dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                  filt1, filt2, filt3);
        dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                  filt1, filt2, filt3);
        dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
                                  filt0, filt1, filt2, filt3);
        dst67 = HEVC_FILT_8TAP_SH(src8776, src10998,
                                  src12111110, src14131312,
                                  filt0, filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r,
          src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
        dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                 filt1, filt2, filt3);
        dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               const int8_t *filter,
                                               int32_t height,
                                               int32_t weight,
                                               int32_t offset,
                                               int32_t rnd_val,
                                               int32_t weightmul16)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    int32_t loop_cnt, cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src98_r, src109_r, src98_l, src109_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = weightmul16; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src7, src8, src9, src10);

            ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_r, src32_r, src54_r, src21_r);
            ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
            ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_l, src32_l, src54_l, src21_l);
            ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
            dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
            dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
            dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                     filt0, filt1, filt2, filt3);
            dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                     filt0, filt1, filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src0 = src4;
            src1 = src5;
            src2 = src6;
            src3 = src7;
            src4 = src8;
            src5 = src9;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 1);
}

static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, 32, weight,
                                       offset, rnd_val, 1);

    hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
                             filter, 32, weight, offset, rnd_val);
}

static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 2);
}

static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, 64, weight,
                                       offset, rnd_val, 3);
}

static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 4);
}

static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
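    /* HV path: weighting stays in 32 bits; the (128 * weight) correction
     * for the XOR-by-128 source bias is folded into offset_vec via
     * const_128 below */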
1797 weight_vec = __msa_fill_w(weight); 1798 offset_vec = __msa_fill_w(offset); 1799 rnd_vec = __msa_fill_w(rnd_val); 1800 denom_vec = rnd_vec - 6; 1801 1802 const_128 = __msa_ldi_w(128); 1803 const_128 *= weight_vec; 1804 offset_vec += __msa_srar_w(const_128, denom_vec); 1805 1806 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1807 src += (7 * src_stride); 1808 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1809 1810 /* row 0 row 1 row 2 row 3 */ 1811 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1812 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1813 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1814 vec8, vec9, vec10, vec11); 1815 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1816 vec12, vec13, vec14, vec15); 1817 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1818 filt3); 1819 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1820 filt3); 1821 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1822 filt3); 1823 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 1824 filt3); 1825 1826 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1827 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1828 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1829 1830 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1831 1832 for (loop_cnt = height >> 2; loop_cnt--;) { 1833 LD_SB4(src, src_stride, src7, src8, src9, src10); 1834 src += (4 * src_stride); 1835 XORI_B4_128_SB(src7, src8, src9, src10); 1836 1837 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3, 1838 vec0, vec1, vec2, vec3); 1839 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3, 1840 vec4, vec5, vec6, vec7); 1841 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1842 filt3); 1843 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1844 filt3); 1845 1846 dst76_r = __msa_ilvr_h(dst97, dst66); 1847 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 1848 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 1849 dst98_r = __msa_ilvr_h(dst66, dst108); 1850 1851 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1852 filt_h1, filt_h2, filt_h3); 1853 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 1854 filt_h1, filt_h2, filt_h3); 1855 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 1856 filt_h1, filt_h2, filt_h3); 1857 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 1858 filt_h1, filt_h2, filt_h3); 1859 1860 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1861 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 1862 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 1863 SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); 1864 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); 1865 ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); 1866 CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); 1867 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 1868 out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); 1869 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1870 dst += (4 * dst_stride); 1871 1872 dst10_r = dst54_r; 1873 dst32_r = dst76_r; 1874 dst54_r = dst98_r; 1875 dst21_r = dst65_r; 1876 dst43_r = dst87_r; 1877 dst65_r = dst109_r; 1878 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 1879 } 1880} 1881 1882static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src, 1883 int32_t src_stride, 1884 uint8_t *dst, 1885 
int32_t dst_stride, 1886 const int8_t *filter_x, 1887 const int8_t *filter_y, 1888 int32_t height, 1889 int32_t weight, 1890 int32_t offset, 1891 int32_t rnd_val, 1892 int32_t width) 1893{ 1894 uint32_t loop_cnt, cnt; 1895 uint8_t *src_tmp; 1896 uint8_t *dst_tmp; 1897 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1898 v8i16 filt0, filt1, filt2, filt3; 1899 v4i32 filt_h0, filt_h1, filt_h2, filt_h3; 1900 v16i8 mask1, mask2, mask3; 1901 v8i16 filter_vec; 1902 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1903 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1904 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 1905 v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 1906 v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 1907 v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 1908 v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 1909 v8i16 dst21_l, dst43_l, dst65_l, dst87_l; 1910 v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec; 1911 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 1912 1913 src -= ((3 * src_stride) + 3); 1914 1915 weight_vec = __msa_fill_w(weight); 1916 offset_vec = __msa_fill_w(offset); 1917 rnd_vec = __msa_fill_w(rnd_val); 1918 denom_vec = rnd_vec - 6; 1919 1920 const_128 = __msa_ldi_w(128); 1921 const_128 *= weight_vec; 1922 offset_vec += __msa_srar_w(const_128, denom_vec); 1923 1924 filter_vec = LD_SH(filter_x); 1925 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1926 1927 filter_vec = LD_SH(filter_y); 1928 UNPCK_R_SB_SH(filter_vec, filter_vec); 1929 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1930 1931 mask1 = mask0 + 2; 1932 mask2 = mask0 + 4; 1933 mask3 = mask0 + 6; 1934 1935 for (cnt = width >> 3; cnt--;) { 1936 src_tmp = src; 1937 dst_tmp = dst; 1938 1939 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1940 src_tmp += (7 * src_stride); 1941 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1942 1943 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1944 vec0, vec1, vec2, vec3); 1945 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1946 vec4, vec5, vec6, vec7); 1947 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1948 vec8, vec9, vec10, vec11); 1949 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1950 vec12, vec13, vec14, vec15); 1951 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1952 filt3); 1953 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1954 filt3); 1955 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1956 filt3); 1957 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1958 filt2, filt3); 1959 1960 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1961 vec0, vec1, vec2, vec3); 1962 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 1963 vec4, vec5, vec6, vec7); 1964 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 1965 vec8, vec9, vec10, vec11); 1966 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1967 filt3); 1968 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1969 filt3); 1970 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1971 filt3); 1972 1973 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, 1974 dst10_r, dst32_r, dst54_r, dst21_r); 1975 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); 1976 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, 1977 dst10_l, dst32_l, dst54_l, dst21_l); 1978 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); 1979 1980 for (loop_cnt = height >> 1; loop_cnt--;) { 
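            /* two output rows per iteration: the new row's horizontal 8-tap
             * result extends the 16-bit column history, the vertical 8-tap
             * runs over the interleaved columns, and >> 6 drops the
             * intermediate precision before the weight/round/offset stage */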
1981 LD_SB2(src_tmp, src_stride, src7, src8); 1982 src_tmp += 2 * src_stride; 1983 XORI_B2_128_SB(src7, src8); 1984 1985 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 1986 vec0, vec1, vec2, vec3); 1987 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1988 filt2, filt3); 1989 1990 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1991 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1992 filt_h0, filt_h1, filt_h2, filt_h3); 1993 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 1994 filt_h0, filt_h1, filt_h2, filt_h3); 1995 dst0_r >>= 6; 1996 dst0_l >>= 6; 1997 1998 /* row 8 */ 1999 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, 2000 vec0, vec1, vec2, vec3); 2001 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 2002 filt2, filt3); 2003 2004 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 2005 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 2006 filt_h0, filt_h1, filt_h2, filt_h3); 2007 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, 2008 filt_h0, filt_h1, filt_h2, filt_h3); 2009 dst1_r >>= 6; 2010 dst1_l >>= 6; 2011 2012 MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l); 2013 MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l); 2014 SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec); 2015 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); 2016 ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l); 2017 CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l); 2018 2019 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 2020 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); 2021 ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride); 2022 dst_tmp += (2 * dst_stride); 2023 2024 dst10_r = dst32_r; 2025 dst32_r = dst54_r; 2026 dst54_r = dst76_r; 2027 dst10_l = dst32_l; 2028 dst32_l = dst54_l; 2029 dst54_l = dst76_l; 2030 dst21_r = dst43_r; 2031 dst43_r = dst65_r; 2032 dst65_r = dst87_r; 2033 dst21_l = dst43_l; 2034 dst43_l = dst65_l; 2035 dst65_l = dst87_l; 2036 dst6 = dst8; 2037 } 2038 2039 src += 8; 2040 dst += 8; 2041 } 2042} 2043 2044static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src, 2045 int32_t src_stride, 2046 uint8_t *dst, 2047 int32_t dst_stride, 2048 const int8_t *filter_x, 2049 const int8_t *filter_y, 2050 int32_t height, 2051 int32_t weight, 2052 int32_t offset, 2053 int32_t rnd_val) 2054{ 2055 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 2056 filter_x, filter_y, height, weight, 2057 offset, rnd_val, 8); 2058} 2059 2060static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, 2061 int32_t src_stride, 2062 uint8_t *dst, 2063 int32_t dst_stride, 2064 const int8_t *filter_x, 2065 const int8_t *filter_y, 2066 int32_t height, 2067 int32_t weight, 2068 int32_t offset, 2069 int32_t rnd_val) 2070{ 2071 uint32_t loop_cnt; 2072 uint8_t *src_tmp, *dst_tmp; 2073 v16u8 out; 2074 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2075 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 2076 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2077 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 2078 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2079 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 2080 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 2081 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l; 2082 v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 2083 v8i16 dst76_l, filter_vec; 2084 v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r; 2085 v4i32 weight_vec, 
offset_vec, rnd_vec, const_128, denom_vec; 2086 2087 src -= ((3 * src_stride) + 3); 2088 2089 filter_vec = LD_SH(filter_x); 2090 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 2091 2092 filter_vec = LD_SH(filter_y); 2093 UNPCK_R_SB_SH(filter_vec, filter_vec); 2094 2095 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 2096 2097 weight_vec = __msa_fill_w(weight); 2098 offset_vec = __msa_fill_w(offset); 2099 rnd_vec = __msa_fill_w(rnd_val); 2100 denom_vec = rnd_vec - 6; 2101 2102 const_128 = __msa_ldi_w(128); 2103 const_128 *= weight_vec; 2104 offset_vec += __msa_srar_w(const_128, denom_vec); 2105 2106 mask0 = LD_SB(ff_hevc_mask_arr); 2107 mask1 = mask0 + 2; 2108 mask2 = mask0 + 4; 2109 mask3 = mask0 + 6; 2110 2111 src_tmp = src; 2112 dst_tmp = dst; 2113 2114 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 2115 src_tmp += (7 * src_stride); 2116 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2117 2118 /* row 0 row 1 row 2 row 3 */ 2119 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2120 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 2121 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2122 vec11); 2123 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 2124 vec15); 2125 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2126 filt3); 2127 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2128 filt3); 2129 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2130 filt3); 2131 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 2132 filt2, filt3); 2133 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2134 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 2135 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2136 vec11); 2137 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2138 filt3); 2139 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2140 filt3); 2141 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2142 filt3); 2143 2144 for (loop_cnt = 16; loop_cnt--;) { 2145 src7 = LD_SB(src_tmp); 2146 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 2147 src_tmp += src_stride; 2148 2149 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 2150 vec3); 2151 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2152 filt3); 2153 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 2154 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 2155 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 2156 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 2157 2158 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 2159 filt_h0, filt_h1, filt_h2, filt_h3); 2160 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 2161 filt_h0, filt_h1, filt_h2, filt_h3); 2162 dst0_r >>= 6; 2163 dst0_l >>= 6; 2164 2165 MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l); 2166 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 2167 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); 2168 CLIP_SW2_0_255(dst0_r, dst0_l); 2169 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 2170 out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); 2171 ST_D1(out, 0, dst_tmp); 2172 dst_tmp += dst_stride; 2173 2174 dst0 = dst1; 2175 dst1 = dst2; 2176 dst2 = dst3; 2177 dst3 = dst4; 2178 dst4 = dst5; 2179 dst5 = dst6; 2180 dst6 = dst7; 2181 } 
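    /* the left 8 of the 12 columns were produced above with the 8-width
     * shuffle masks; the remaining 4 columns are filtered below with the
     * 4-width mask pair (ff_hevc_mask_arr + 16), which pairs two rows per
     * vector so that four rows are completed per loop iteration */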
2182 2183 src += 8; 2184 dst += 8; 2185 2186 mask4 = LD_SB(ff_hevc_mask_arr + 16); 2187 mask5 = mask4 + 2; 2188 mask6 = mask4 + 4; 2189 mask7 = mask4 + 6; 2190 2191 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 2192 src += (7 * src_stride); 2193 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2194 2195 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 2196 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 2197 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 2198 vec11); 2199 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14, 2200 vec15); 2201 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2202 filt3); 2203 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2204 filt3); 2205 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2206 filt3); 2207 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 2208 filt3); 2209 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 2210 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 2211 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 2212 2213 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 2214 2215 for (loop_cnt = 4; loop_cnt--;) { 2216 LD_SB4(src, src_stride, src7, src8, src9, src10); 2217 src += (4 * src_stride); 2218 XORI_B4_128_SB(src7, src8, src9, src10); 2219 2220 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 2221 vec3); 2222 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 2223 vec7); 2224 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2225 filt3); 2226 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2227 filt3); 2228 2229 dst76_r = __msa_ilvr_h(dst97, dst66); 2230 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 2231 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 2232 dst98_r = __msa_ilvr_h(dst66, dst108); 2233 2234 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 2235 filt_h1, filt_h2, filt_h3); 2236 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 2237 filt_h1, filt_h2, filt_h3); 2238 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 2239 filt_h1, filt_h2, filt_h3); 2240 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 2241 filt_h1, filt_h2, filt_h3); 2242 2243 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 2244 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 2245 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 2246 SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); 2247 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); 2248 ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); 2249 CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); 2250 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 2251 out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); 2252 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2253 dst += (4 * dst_stride); 2254 2255 dst10_r = dst54_r; 2256 dst32_r = dst76_r; 2257 dst54_r = dst98_r; 2258 dst21_r = dst65_r; 2259 dst43_r = dst87_r; 2260 dst65_r = dst109_r; 2261 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 2262 } 2263} 2264 2265static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src, 2266 int32_t src_stride, 2267 uint8_t *dst, 2268 int32_t dst_stride, 2269 const int8_t *filter_x, 2270 const int8_t *filter_y, 2271 int32_t height, 2272 int32_t weight, 2273 int32_t offset, 2274 int32_t rnd_val) 2275{ 2276 
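    /* widths 16, 24, 32, 48 and 64 all reuse the 8-column HV kernel; the
     * last argument is the block width, i.e. the number of 8-column
     * stripes times eight */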
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 16);
}

static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 24);
}

static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 32);
}

static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 48);
}

static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 64);
}

static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v8i16 filt0, filt1;
    v16i8 src0, src1, vec0, vec1;
    v16i8 mask1;
    v8i16 dst0;
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    weight = weight & 0x0000FFFF;    /* only the low 16 bits are significant */

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    /* fold the XOR-128 bias compensation into the offset */
    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    dst0 = __msa_adds_s_h(dst0, offset_vec);
    CLIP_SH_0_255(dst0);
    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
    ST_W2(out, 0, 1, dst, dst_stride);
}
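/* Scalar model of the weighting step shared by the 4-tap paths below
 * (an illustrative sketch only, not part of the build; 'flt' stands for
 * the 16-bit filter output computed from the XOR-128 biased input, and
 * both shifts are rounded shifts, as done by the SRAR instructions):
 *
 *     val = ((flt * weight) >> rnd_val)
 *         + offset
 *         + ((128 * weight) >> (rnd_val - 6));   // undo the -128 bias
 *     dst[x] = av_clip_uint8(val);
 */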
static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src -= 1;

    /* load and splat the two 4-tap filter coefficients */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 =
HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2516 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2517 2518 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2519 weight_vec, offset_vec, rnd_vec, 2520 dst0, dst1, dst2, dst3); 2521 2522 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2523 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 2524 dst += (8 * dst_stride); 2525 } 2526} 2527 2528static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src, 2529 int32_t src_stride, 2530 uint8_t *dst, 2531 int32_t dst_stride, 2532 const int8_t *filter, 2533 int32_t height, 2534 int32_t weight, 2535 int32_t offset, 2536 int32_t rnd_val) 2537{ 2538 if (2 == height) { 2539 hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, 2540 filter, weight, offset, rnd_val); 2541 } else if (4 == height) { 2542 hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, 2543 filter, weight, offset, rnd_val); 2544 } else if (8 == height || 16 == height) { 2545 hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, 2546 filter, height, weight, 2547 offset, rnd_val); 2548 } 2549} 2550 2551static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src, 2552 int32_t src_stride, 2553 uint8_t *dst, 2554 int32_t dst_stride, 2555 const int8_t *filter, 2556 int32_t height, 2557 int32_t weight, 2558 int32_t offset, 2559 int32_t rnd_val) 2560{ 2561 v16u8 out0, out1, out2, out3; 2562 v8i16 filt0, filt1; 2563 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2564 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2565 v16i8 mask1; 2566 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2567 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2568 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2569 v4i32 weight_vec, rnd_vec; 2570 2571 src -= 1; 2572 2573 filter_vec = LD_SH(filter); 2574 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2575 2576 weight = weight & 0x0000FFFF; 2577 2578 weight_vec = __msa_fill_w(weight); 2579 rnd_vec = __msa_fill_w(rnd_val); 2580 2581 weight *= 128; 2582 rnd_val -= 6; 2583 2584 weight_vec_h = __msa_fill_h(weight); 2585 offset_vec = __msa_fill_h(offset); 2586 denom_vec = __msa_fill_h(rnd_val); 2587 2588 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2589 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2590 2591 mask1 = mask0 + 2; 2592 2593 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 2594 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2595 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2596 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2597 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2598 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2599 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2600 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2601 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2602 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2603 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 2604 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3); 2605 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5); 2606 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7); 2607 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2608 dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2609 dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2610 dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2611 2612 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2613 weight_vec, offset_vec, rnd_vec, 2614 dst0, dst1, 
dst2, dst3); 2615 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 2616 weight_vec, offset_vec, rnd_vec, 2617 dst4, dst5, dst6, dst7); 2618 2619 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2620 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 2621 ST_W2(out0, 0, 2, dst, dst_stride); 2622 ST_H2(out0, 2, 6, dst + 4, dst_stride); 2623 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 2624 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2625 dst += (4 * dst_stride); 2626 ST_W2(out2, 0, 2, dst, dst_stride); 2627 ST_H2(out2, 2, 6, dst + 4, dst_stride); 2628 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride); 2629 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2630} 2631 2632static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src, 2633 int32_t src_stride, 2634 uint8_t *dst, 2635 int32_t dst_stride, 2636 const int8_t *filter, 2637 int32_t weight, 2638 int32_t offset, 2639 int32_t rnd_val) 2640{ 2641 v16u8 out; 2642 v8i16 filt0, filt1, dst0, dst1; 2643 v16i8 src0, src1; 2644 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2645 v16i8 mask1; 2646 v16i8 vec0, vec1, vec2, vec3; 2647 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2648 v4i32 weight_vec, rnd_vec; 2649 2650 src -= 1; 2651 2652 filter_vec = LD_SH(filter); 2653 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2654 2655 weight = weight & 0x0000FFFF; 2656 2657 weight_vec = __msa_fill_w(weight); 2658 rnd_vec = __msa_fill_w(rnd_val); 2659 2660 weight *= 128; 2661 rnd_val -= 6; 2662 2663 weight_vec_h = __msa_fill_h(weight); 2664 offset_vec = __msa_fill_h(offset); 2665 denom_vec = __msa_fill_h(rnd_val); 2666 2667 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2668 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2669 2670 mask1 = mask0 + 2; 2671 2672 LD_SB2(src, src_stride, src0, src1); 2673 XORI_B2_128_SB(src0, src1); 2674 2675 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2676 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2677 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2678 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2679 2680 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec, 2681 dst0, dst1); 2682 2683 out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 2684 ST_D2(out, 0, 1, dst, dst_stride); 2685} 2686 2687static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src, 2688 int32_t src_stride, 2689 uint8_t *dst, 2690 int32_t dst_stride, 2691 const int8_t *filter, 2692 int32_t weight, 2693 int32_t offset, 2694 int32_t rnd_val) 2695{ 2696 v16u8 out0, out1; 2697 v16i8 src0, src1, src2, src3; 2698 v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2699 v8i16 filt0, filt1, dst0, dst1, dst2, dst3; 2700 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2701 v4i32 weight_vec, rnd_vec; 2702 2703 src -= 1; 2704 2705 filter_vec = LD_SH(filter); 2706 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2707 2708 weight = weight & 0x0000FFFF; 2709 weight_vec = __msa_fill_w(weight); 2710 rnd_vec = __msa_fill_w(rnd_val); 2711 2712 weight *= 128; 2713 rnd_val -= 6; 2714 2715 weight_vec_h = __msa_fill_h(weight); 2716 offset_vec = __msa_fill_h(offset); 2717 denom_vec = __msa_fill_h(rnd_val); 2718 2719 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2720 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2721 2722 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2723 mask1 = mask0 + 2; 2724 2725 LD_SB4(src, src_stride, src0, src1, src2, src3); 2726 XORI_B4_128_SB(src0, src1, src2, src3); 2727 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, 
vec0, vec1); 2728 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2729 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2730 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2731 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2732 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2733 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2734 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2735 2736 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2737 weight_vec, offset_vec, rnd_vec, 2738 dst0, dst1, dst2, dst3); 2739 2740 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2741 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2742} 2743 2744static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src, 2745 int32_t src_stride, 2746 uint8_t *dst, 2747 int32_t dst_stride, 2748 const int8_t *filter, 2749 int32_t weight, 2750 int32_t offset, 2751 int32_t rnd_val) 2752{ 2753 v16u8 out0, out1, out2; 2754 v8i16 filt0, filt1; 2755 v16i8 src0, src1, src2, src3, src4, src5; 2756 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2757 v16i8 mask1; 2758 v16i8 vec11; 2759 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 2760 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2761 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2762 v4i32 weight_vec, rnd_vec; 2763 2764 src -= 1; 2765 2766 filter_vec = LD_SH(filter); 2767 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2768 2769 weight = weight & 0x0000FFFF; 2770 2771 weight_vec = __msa_fill_w(weight); 2772 rnd_vec = __msa_fill_w(rnd_val); 2773 2774 weight *= 128; 2775 rnd_val -= 6; 2776 2777 weight_vec_h = __msa_fill_h(weight); 2778 offset_vec = __msa_fill_h(offset); 2779 denom_vec = __msa_fill_h(rnd_val); 2780 2781 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2782 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2783 2784 mask1 = mask0 + 2; 2785 2786 LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); 2787 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); 2788 2789 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2790 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2791 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2792 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2793 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 2794 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11); 2795 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2796 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2797 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2798 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2799 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 2800 dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 2801 2802 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2803 weight_vec, offset_vec, rnd_vec, 2804 dst0, dst1, dst2, dst3); 2805 2806 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec, 2807 dst4, dst5); 2808 2809 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 2810 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2811 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 2812} 2813 2814static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src, 2815 int32_t src_stride, 2816 uint8_t *dst, 2817 int32_t dst_stride, 2818 const int8_t *filter, 2819 int32_t height, 2820 int32_t weight, 2821 int32_t offset, 2822 int32_t rnd_val) 2823{ 2824 uint32_t loop_cnt; 2825 v8i16 filt0, filt1; 2826 v16u8 out0, out1, out2, out3; 2827 v16i8 src0, src1, 
src2, src3, src4, src5, src6, src7; 2828 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2829 v16i8 mask1; 2830 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2831 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2832 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2833 v4i32 weight_vec, rnd_vec; 2834 2835 src -= 1; 2836 2837 filter_vec = LD_SH(filter); 2838 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2839 2840 weight = weight & 0x0000FFFF; 2841 2842 weight_vec = __msa_fill_w(weight); 2843 rnd_vec = __msa_fill_w(rnd_val); 2844 2845 weight *= 128; 2846 rnd_val -= 6; 2847 2848 weight_vec_h = __msa_fill_h(weight); 2849 offset_vec = __msa_fill_h(offset); 2850 denom_vec = __msa_fill_h(rnd_val); 2851 2852 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2853 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2854 2855 mask1 = mask0 + 2; 2856 2857 for (loop_cnt = (height >> 3); loop_cnt--;) { 2858 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 2859 src += (8 * src_stride); 2860 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2861 2862 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2863 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2864 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2865 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2866 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2867 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2868 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2869 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2870 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 2871 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3); 2872 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5); 2873 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7); 2874 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2875 dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2876 dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2877 dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2878 2879 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2880 weight_vec, offset_vec, rnd_vec, 2881 dst0, dst1, dst2, dst3); 2882 2883 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 2884 weight_vec, offset_vec, rnd_vec, 2885 dst4, dst5, dst6, dst7); 2886 2887 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2888 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 2889 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2890 dst += (8 * dst_stride); 2891 } 2892} 2893 2894static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src, 2895 int32_t src_stride, 2896 uint8_t *dst, 2897 int32_t dst_stride, 2898 const int8_t *filter, 2899 int32_t height, 2900 int32_t weight, 2901 int32_t offset, 2902 int32_t rnd_val) 2903{ 2904 if (2 == height) { 2905 hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, 2906 filter, weight, offset, rnd_val); 2907 } else if (4 == height) { 2908 hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride, 2909 filter, weight, offset, rnd_val); 2910 } else if (6 == height) { 2911 hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, 2912 filter, weight, offset, rnd_val); 2913 } else { 2914 hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride, 2915 filter, height, weight, offset, 2916 rnd_val); 2917 } 2918} 2919 2920static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src, 2921 int32_t src_stride, 2922 uint8_t *dst, 2923 int32_t dst_stride, 2924 const int8_t *filter, 
2925 int32_t height, 2926 int32_t weight, 2927 int32_t offset, 2928 int32_t rnd_val) 2929{ 2930 uint32_t loop_cnt; 2931 v16u8 out0, out1, out2; 2932 v8i16 filt0, filt1; 2933 v16i8 src0, src1, src2, src3; 2934 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2935 v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 2936 }; 2937 v16i8 mask1; 2938 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 2939 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2940 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2941 v16i8 mask3, vec11; 2942 v4i32 weight_vec, rnd_vec; 2943 2944 src -= 1; 2945 2946 filter_vec = LD_SH(filter); 2947 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2948 2949 weight = weight & 0x0000FFFF; 2950 2951 weight_vec = __msa_fill_w(weight); 2952 rnd_vec = __msa_fill_w(rnd_val); 2953 2954 weight *= 128; 2955 rnd_val -= 6; 2956 2957 weight_vec_h = __msa_fill_h(weight); 2958 offset_vec = __msa_fill_h(offset); 2959 denom_vec = __msa_fill_h(rnd_val); 2960 2961 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2962 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2963 2964 mask1 = mask0 + 2; 2965 mask3 = mask2 + 2; 2966 2967 for (loop_cnt = 4; loop_cnt--;) { 2968 LD_SB4(src, src_stride, src0, src1, src2, src3); 2969 src += (4 * src_stride); 2970 2971 XORI_B4_128_SB(src0, src1, src2, src3); 2972 2973 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2974 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2975 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2976 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2977 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9); 2978 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11); 2979 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2980 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2981 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2982 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2983 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 2984 dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 2985 2986 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2987 weight_vec, offset_vec, rnd_vec, 2988 dst0, dst1, dst2, dst3); 2989 2990 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 2991 rnd_vec, dst4, dst5); 2992 2993 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 2994 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2995 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 2996 dst += (4 * dst_stride); 2997 } 2998} 2999 3000static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src, 3001 int32_t src_stride, 3002 uint8_t *dst, 3003 int32_t dst_stride, 3004 const int8_t *filter, 3005 int32_t height, 3006 int32_t weight, 3007 int32_t offset, 3008 int32_t rnd_val) 3009{ 3010 uint32_t loop_cnt; 3011 v16u8 out0, out1, out2, out3; 3012 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 3013 v8i16 filt0, filt1; 3014 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3015 v16i8 mask1; 3016 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3017 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3018 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3019 v4i32 weight_vec, rnd_vec; 3020 3021 src -= 1; 3022 3023 filter_vec = LD_SH(filter); 3024 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3025 3026 weight = weight & 0x0000FFFF; 3027 3028 weight_vec = __msa_fill_w(weight); 3029 rnd_vec = __msa_fill_w(rnd_val); 3030 3031 weight *= 128; 3032 rnd_val -= 6; 3033 3034 weight_vec_h = 
__msa_fill_h(weight); 3035 offset_vec = __msa_fill_h(offset); 3036 denom_vec = __msa_fill_h(rnd_val); 3037 3038 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3039 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3040 3041 mask1 = mask0 + 2; 3042 3043 for (loop_cnt = (height >> 2); loop_cnt--;) { 3044 LD_SB4(src, src_stride, src0, src2, src4, src6); 3045 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 3046 src += (4 * src_stride); 3047 3048 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3049 3050 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3051 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3052 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3053 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 3054 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3055 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3056 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3057 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3058 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 3059 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3); 3060 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5); 3061 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7); 3062 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3063 dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3064 dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3065 dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3066 3067 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3068 weight_vec, offset_vec, rnd_vec, 3069 dst0, dst1, dst2, dst3); 3070 3071 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 3072 weight_vec, offset_vec, rnd_vec, 3073 dst4, dst5, dst6, dst7); 3074 3075 PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 3076 out0, out1, out2, out3); 3077 3078 ST_UB4(out0, out1, out2, out3, dst, dst_stride); 3079 dst += (4 * dst_stride); 3080 } 3081} 3082 3083static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src, 3084 int32_t src_stride, 3085 uint8_t *dst, 3086 int32_t dst_stride, 3087 const int8_t *filter, 3088 int32_t height, 3089 int32_t weight, 3090 int32_t offset, 3091 int32_t rnd_val) 3092{ 3093 uint32_t loop_cnt; 3094 v16u8 out0, out1, out2; 3095 v16i8 src0, src1, src2, src3; 3096 v8i16 filt0, filt1; 3097 v16i8 mask0, mask1, mask2, mask3; 3098 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3099 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 3100 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3101 v4i32 weight_vec, rnd_vec; 3102 3103 src -= 1; 3104 3105 filter_vec = LD_SH(filter); 3106 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3107 3108 weight = weight & 0x0000FFFF; 3109 weight_vec = __msa_fill_w(weight); 3110 rnd_vec = __msa_fill_w(rnd_val); 3111 3112 weight *= 128; 3113 rnd_val -= 6; 3114 3115 weight_vec_h = __msa_fill_h(weight); 3116 offset_vec = __msa_fill_h(offset); 3117 denom_vec = __msa_fill_h(rnd_val); 3118 3119 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3120 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3121 3122 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3123 mask1 = mask0 + 2; 3124 mask2 = mask0 + 8; 3125 mask3 = mask0 + 10; 3126 3127 for (loop_cnt = 16; loop_cnt--;) { 3128 LD_SB2(src, src_stride, src0, src2); 3129 LD_SB2(src + 16, src_stride, src1, src3); 3130 src += (2 * src_stride); 3131 3132 XORI_B4_128_SB(src0, src1, src2, src3); 3133 3134 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3135 VSHF_B2_SB(src0, src1, src0, 
src1, mask2, mask3, vec2, vec3); 3136 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3137 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7); 3138 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3139 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3140 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3141 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3142 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3143 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3); 3144 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3145 dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3146 3147 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3148 weight_vec, offset_vec, rnd_vec, 3149 dst0, dst1, dst2, dst3); 3150 3151 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 3152 rnd_vec, dst4, dst5); 3153 3154 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 3155 ST_UB2(out0, out1, dst, dst_stride); 3156 ST_D2(out2, 0, 1, dst + 16, dst_stride); 3157 dst += (2 * dst_stride); 3158 } 3159} 3160 3161static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src, 3162 int32_t src_stride, 3163 uint8_t *dst, 3164 int32_t dst_stride, 3165 const int8_t *filter, 3166 int32_t height, 3167 int32_t weight, 3168 int32_t offset, 3169 int32_t rnd_val) 3170{ 3171 uint32_t loop_cnt; 3172 v16u8 out0, out1, out2, out3; 3173 v16i8 src0, src1, src2, src3, src4, src5; 3174 v8i16 filt0, filt1; 3175 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3176 v16i8 mask1, mask2, mask3; 3177 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3178 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3179 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3180 v4i32 weight_vec, rnd_vec; 3181 3182 src -= 1; 3183 3184 filter_vec = LD_SH(filter); 3185 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3186 3187 weight = weight & 0x0000FFFF; 3188 3189 weight_vec = __msa_fill_w(weight); 3190 rnd_vec = __msa_fill_w(rnd_val); 3191 3192 weight *= 128; 3193 rnd_val -= 6; 3194 3195 weight_vec_h = __msa_fill_h(weight); 3196 offset_vec = __msa_fill_h(offset); 3197 denom_vec = __msa_fill_h(rnd_val); 3198 3199 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3200 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3201 3202 mask1 = mask0 + 2; 3203 mask2 = mask0 + 8; 3204 mask3 = mask0 + 10; 3205 3206 for (loop_cnt = (height >> 1); loop_cnt--;) { 3207 LD_SB2(src, 16, src0, src1); 3208 src2 = LD_SB(src + 24); 3209 src += src_stride; 3210 LD_SB2(src, 16, src3, src4); 3211 src5 = LD_SB(src + 24); 3212 src += src_stride; 3213 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); 3214 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3215 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3); 3216 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5); 3217 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7); 3218 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3219 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3220 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3221 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3222 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3223 VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3); 3224 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5); 3225 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7); 3226 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3227 dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3228 dst6 = 
HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3229 dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3230 3231 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3232 weight_vec, offset_vec, rnd_vec, 3233 dst0, dst1, dst2, dst3); 3234 3235 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 3236 weight_vec, offset_vec, rnd_vec, 3237 dst4, dst5, dst6, dst7); 3238 3239 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3240 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 3241 ST_UB2(out0, out1, dst, 16); 3242 dst += dst_stride; 3243 ST_UB2(out2, out3, dst, 16); 3244 dst += dst_stride; 3245 } 3246} 3247 3248static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src, 3249 int32_t src_stride, 3250 uint8_t *dst, 3251 int32_t dst_stride, 3252 const int8_t *filter, 3253 int32_t weight, 3254 int32_t offset, 3255 int32_t rnd_val) 3256{ 3257 v16u8 out; 3258 v16i8 src0, src1, src2, src3, src4; 3259 v16i8 src10_r, src32_r, src21_r, src43_r; 3260 v16i8 src2110, src4332; 3261 v8i16 dst0; 3262 v4i32 dst0_r, dst0_l; 3263 v8i16 filt0, filt1; 3264 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3265 v4i32 weight_vec, rnd_vec; 3266 3267 src -= src_stride; 3268 3269 weight = weight & 0x0000FFFF; 3270 3271 weight_vec = __msa_fill_w(weight); 3272 rnd_vec = __msa_fill_w(rnd_val); 3273 3274 weight *= 128; 3275 rnd_val -= 6; 3276 3277 weight_vec_h = __msa_fill_h(weight); 3278 offset_vec = __msa_fill_h(offset); 3279 denom_vec = __msa_fill_h(rnd_val); 3280 3281 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3282 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3283 3284 filter_vec = LD_SH(filter); 3285 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3286 3287 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3288 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3289 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3290 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 3291 XORI_B2_128_SB(src2110, src4332); 3292 dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3293 ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l); 3294 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l); 3295 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 3296 dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 3297 dst0 = __msa_adds_s_h(dst0, offset_vec); 3298 CLIP_SH_0_255(dst0); 3299 out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); 3300 ST_W2(out, 0, 1, dst, dst_stride); 3301} 3302 3303static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src, 3304 int32_t src_stride, 3305 uint8_t *dst, 3306 int32_t dst_stride, 3307 const int8_t *filter, 3308 int32_t weight, 3309 int32_t offset, 3310 int32_t rnd_val) 3311{ 3312 v16u8 out; 3313 v16i8 src0, src1, src2, src3, src4, src5, src6; 3314 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 3315 v16i8 src2110, src4332, src6554; 3316 v8i16 dst0, dst1; 3317 v8i16 filt0, filt1; 3318 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3319 v4i32 weight_vec, rnd_vec; 3320 3321 src -= src_stride; 3322 3323 weight = weight & 0x0000FFFF; 3324 3325 weight_vec = __msa_fill_w(weight); 3326 rnd_vec = __msa_fill_w(rnd_val); 3327 3328 weight *= 128; 3329 rnd_val -= 6; 3330 3331 weight_vec_h = __msa_fill_h(weight); 3332 offset_vec = __msa_fill_h(offset); 3333 denom_vec = __msa_fill_h(rnd_val); 3334 3335 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3336 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3337 3338 filter_vec = LD_SH(filter); 3339 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3340 3341 LD_SB7(src, src_stride, src0, src1, src2, 
src3, src4, src5, src6); 3342 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3343 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3344 src32_r, src43_r, src54_r, src65_r); 3345 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 3346 src2110, src4332, src6554); 3347 XORI_B3_128_SB(src2110, src4332, src6554); 3348 dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3349 dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3350 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec, 3351 dst0, dst1); 3352 3353 out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 3354 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 3355} 3356 3357static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src, 3358 int32_t src_stride, 3359 uint8_t *dst, 3360 int32_t dst_stride, 3361 const int8_t *filter, 3362 int32_t height, 3363 int32_t weight, 3364 int32_t offset, 3365 int32_t rnd_val) 3366{ 3367 int32_t loop_cnt; 3368 v16u8 out0, out1; 3369 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3370 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 3371 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 3372 v16i8 src2110, src4332, src6554, src8776; 3373 v16i8 src10998; 3374 v8i16 dst0, dst1, dst2, dst3, filt0, filt1; 3375 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3376 v4i32 weight_vec, rnd_vec; 3377 3378 src -= src_stride; 3379 3380 weight = weight & 0x0000FFFF; 3381 3382 weight_vec = __msa_fill_w(weight); 3383 rnd_vec = __msa_fill_w(rnd_val); 3384 3385 weight *= 128; 3386 rnd_val -= 6; 3387 3388 weight_vec_h = __msa_fill_h(weight); 3389 offset_vec = __msa_fill_h(offset); 3390 denom_vec = __msa_fill_h(rnd_val); 3391 3392 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3393 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3394 3395 filter_vec = LD_SH(filter); 3396 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3397 3398 LD_SB3(src, src_stride, src0, src1, src2); 3399 src += (3 * src_stride); 3400 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3401 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 3402 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3403 3404 for (loop_cnt = (height >> 3); loop_cnt--;) { 3405 LD_SB8(src, src_stride, 3406 src3, src4, src5, src6, src7, src8, src9, src10); 3407 src += (8 * src_stride); 3408 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3409 src32_r, src43_r, src54_r, src65_r); 3410 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3411 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3412 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, 3413 src109_r, src98_r, src4332, src6554, src8776, src10998); 3414 XORI_B4_128_SB(src4332, src6554, src8776, src10998); 3415 dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3416 dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3417 dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1); 3418 dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1); 3419 3420 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3421 weight_vec, offset_vec, rnd_vec, 3422 dst0, dst1, dst2, dst3); 3423 3424 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3425 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 3426 dst += (8 * dst_stride); 3427 3428 src2 = src10; 3429 src2110 = src10998; 3430 } 3431} 3432 3433static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src, 3434 int32_t src_stride, 3435 uint8_t *dst, 3436 int32_t dst_stride, 3437 const int8_t *filter, 3438 int32_t 
height, 3439 int32_t weight, 3440 int32_t offset, 3441 int32_t rnd_val) 3442{ 3443 if (2 == height) { 3444 hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, 3445 filter, weight, offset, rnd_val); 3446 } else if (4 == height) { 3447 hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, 3448 filter, weight, offset, rnd_val); 3449 } else if (0 == (height % 8)) { 3450 hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, 3451 filter, height, weight, offset, 3452 rnd_val); 3453 } 3454} 3455 3456static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src, 3457 int32_t src_stride, 3458 uint8_t *dst, 3459 int32_t dst_stride, 3460 const int8_t *filter, 3461 int32_t height, 3462 int32_t weight, 3463 int32_t offset, 3464 int32_t rnd_val) 3465{ 3466 v16u8 out0, out1, out2, out3; 3467 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3468 v16i8 src10_r, src32_r, src21_r, src43_r; 3469 v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r; 3470 v8i16 filt0, filt1; 3471 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3472 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3473 v4i32 weight_vec, rnd_vec; 3474 3475 src -= src_stride; 3476 3477 weight = weight & 0x0000FFFF; 3478 3479 weight_vec = __msa_fill_w(weight); 3480 rnd_vec = __msa_fill_w(rnd_val); 3481 3482 weight *= 128; 3483 rnd_val -= 6; 3484 3485 weight_vec_h = __msa_fill_h(weight); 3486 offset_vec = __msa_fill_h(offset); 3487 denom_vec = __msa_fill_h(rnd_val); 3488 3489 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3490 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3491 3492 filter_vec = LD_SH(filter); 3493 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3494 3495 LD_SB3(src, src_stride, src0, src1, src2); 3496 src += (3 * src_stride); 3497 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 3498 XORI_B3_128_SB(src0, src1, src2); 3499 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3500 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3501 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3502 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 3503 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3504 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3505 dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3506 dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3507 dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3508 dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3509 dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1); 3510 dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 3511 dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 3512 dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 3513 3514 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3515 weight_vec, offset_vec, rnd_vec, 3516 dst0, dst1, dst2, dst3); 3517 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 3518 weight_vec, offset_vec, rnd_vec, 3519 dst4, dst5, dst6, dst7); 3520 3521 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3522 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 3523 ST_W2(out0, 0, 2, dst, dst_stride); 3524 ST_H2(out0, 2, 6, dst + 4, dst_stride); 3525 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 3526 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 3527 dst += (4 * dst_stride); 3528 ST_W2(out2, 0, 2, dst, dst_stride); 3529 ST_H2(out2, 2, 6, dst + 4, dst_stride); 3530 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride); 3531 ST_H2(out3, 2, 6, dst 
static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0, dst1;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);

    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(out, 0, 1, dst, dst_stride);
}

static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src5, src6, src54_r, src65_r;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (3 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                   offset_vec, rnd_vec, dst0, dst1, dst2,
                                   dst3);
    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);

    XORI_B3_128_SB(src0, src1, src2);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                   offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
                                   dst4, dst5);
    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
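/* Vertical 8-wide filter for heights that are a multiple of 8: the
 * three-row context is kept across iterations (src2 and the
 * src10_r/src21_r interleaves are rolled forward at the bottom of the
 * loop), so each new iteration only loads eight fresh rows. */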
static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height,
                                          int32_t weight,
                                          int32_t offset,
                                          int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2 = src10;
        src10_r = src98_r;
        src21_r = src109_r;
    }
}

static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {
        hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (6 == height) {
        hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else {
        hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
                                      filter, height, weight, offset,
                                      rnd_val);
    }
}
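/* 12-wide vertical filter: the left 8 columns go through the
 * right-interleaved row pairs, while the top halves of two consecutive
 * rows are packed into one vector (src2110, src4332, ...) so the
 * remaining 4 columns cost only one extra filter call per pair of rows. */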
static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
    v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        ILVRL_B2_SB(src7, src6, src76_r, src76_l);
        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
        ILVRL_B2_SB(src10, src9, src109_r, src109_l);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
        src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
        dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
        dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
        dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec,
                                       offset_vec, rnd_vec, dst6, dst7, dst8,
                                       dst9);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
                                       rnd_vec, dst10, dst11);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src2 = src10;
        src10_r = src98_r;
        src21_r = src109_r;
        src2110 = src10998;
    }
}
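/* 16-wide vertical filter: each iteration filters four full rows, using
 * the right and left byte-interleaves of adjacent rows for the low and
 * high eight columns respectively. */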
static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src54_r, src54_l, src65_r, src65_l, src6;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
                    out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
    }
}
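/* 24-wide vertical filter: handled as a 16-wide stripe (right and left
 * interleaves) plus an 8-wide stripe at column 16 that only needs the
 * right-interleaved row pairs. */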
static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    LD_SB3(src + 16, src_stride, src7, src8, src9);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    XORI_B3_128_SB(src7, src8, src9);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
        src += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        XORI_B4_128_SB(src10, src11, src12, src13);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
        ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
        ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
        dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1);
        dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1);
        dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
                    out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);

        src2 = src6;
        src9 = src13;
        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src87_r = src1211_r;
        src98_r = src1312_r;
    }
}
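/* 32-wide vertical filter: two 16-wide stripes per pass, two rows at a
 * time; the three-row context of both stripes is rolled forward at the
 * bottom of the loop. */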
static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    LD_SB3(src + 16, src_stride, src5, src6, src7);
    src += (3 * src_stride);
    XORI_B6_128_SB(src0, src1, src2, src5, src6, src7);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
    ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        LD_SB2(src + 16, src_stride, src8, src9);
        src += (2 * src_stride);
        XORI_B4_128_SB(src3, src4, src8, src9);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
                    out2, out3);
        ST_UB2(out0, out2, dst, 16);
        dst += dst_stride;
        ST_UB2(out1, out3, dst, 16);
        dst += dst_stride;

        src2 = src4;
        src7 = src9;
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src65_r = src87_r;
        src76_r = src98_r;
        src65_l = src87_l;
        src76_l = src98_l;
    }
}
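/* The hevc_hv_* functions below apply the separable 4-tap filter in two
 * dimensions: the horizontal pass (VSHF + HEVC_FILT_4TAP_SH) produces
 * 16-bit intermediates, the vertical pass widens to 32 bits, and the
 * results are shifted by 6, weighted, rounded, offset and clipped back
 * to 8 bits. The (128 * weight) >> (rnd_val - 6) term folded into
 * offset_vec compensates the XOR-with-128 input bias, as above. */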
static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
    SRAR_W2_SW(dst0, dst1, rnd_vec);
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp += offset_vec;
    CLIP_SH_0_255(tmp);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
    MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
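/* 4-wide hv filter for heights that are a multiple of 8: two 4-wide rows
 * four lines apart (e.g. src3/src7) share one vector through the
 * horizontal pass, so each loop iteration yields eight output rows. */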
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (4 == height) {
        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val);
    }
}
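/* 6-wide hv filter: computed as a full 8-wide block, then stored as a
 * 4-byte plus a 2-byte chunk per row (ST_W8 + ST_H8). */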
static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
    MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
    SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
    SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
    SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
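/* 8x2 hv filter: five input rows yield two output rows; the right and
 * left halves of the interleaved intermediates are filtered separately. */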
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}
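/* hv filter for 4-row blocks whose width is a multiple of 8; width8mult
 * selects how many 8-column stripes are processed per call. */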
static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t width8mult,
                                          int32_t weight,
                                          int32_t offset,
                                          int32_t rnd_val)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
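/* 8x6 hv filter: nine input rows are filtered horizontally in one batch,
 * then the vertical pass produces six output rows. */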
static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 offset_vec, const_128, denom_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
    MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
    SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
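/* General 8xN hv filter (height a multiple of 4): for each 8-column
 * stripe the three-row horizontal context is computed once and rolled
 * forward through the vertical loop. */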
static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val,
                                              int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
            MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
            MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
            MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
            SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
            SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, tmp0, tmp1, tmp2, tmp3);
            ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
            ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src += 8;
        dst += 8;
    }
}
static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (4 == height) {
        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, 1, weight,
                                      offset, rnd_val);
    } else if (6 == height) {
        hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (0 == (height % 4)) {
        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, 1);
    }
}
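/* 12-wide hv filter: the left 8 columns follow the 8-wide path below;
 * the remaining 4 columns reuse the packed two-rows-per-vector scheme
 * of the 4-wide path. */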

static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    /* XORI_B*_128_SB below filters (src - 128); fold the missing
     * 128 * weight term back into the offset */
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    /* first pass: the left 8 columns, four output rows per iteration */
    src_tmp = src;
    dst_tmp = dst;

    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src += 8;
    dst += 8;
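
    /* second pass: the remaining 4 columns; the 4-width masks at
     * ff_hevc_mask_arr + 16 shuffle rows four apart (src3 with src7, ...)
     * into one register, so eight output rows are filtered per iteration */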
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
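
/*
 * The 16, 24 and 32 width variants below reuse the 8-column-multiple
 * kernels, differing only in the trailing width8mult argument: 2, 3 and 4
 * stripes of 8 columns respectively.
 */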

static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    if (4 == height) {
        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, 2, weight, offset,
                                      rnd_val);
    } else {
        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, 2);
    }
}

static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 3);
}

static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 4);
}

#define UNIWGT_MC_COPY(WIDTH)                                                \
void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int height,            \
                                                      int denom,             \
                                                      int weight,            \
                                                      int offset,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    int shift = denom + 14 - 8;                                              \
    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
                                    height, weight, offset, shift);          \
}

UNIWGT_MC_COPY(4);
UNIWGT_MC_COPY(6);
UNIWGT_MC_COPY(8);
UNIWGT_MC_COPY(12);
UNIWGT_MC_COPY(16);
UNIWGT_MC_COPY(24);
UNIWGT_MC_COPY(32);
UNIWGT_MC_COPY(48);
UNIWGT_MC_COPY(64);

#undef UNIWGT_MC_COPY

#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                       \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,       \
                                                         ptrdiff_t           \
                                                         dst_stride,         \
                                                         uint8_t *src,       \
                                                         ptrdiff_t           \
                                                         src_stride,         \
                                                         int height,         \
                                                         int denom,          \
                                                         int weight,         \
                                                         int offset,         \
                                                         intptr_t mx,        \
                                                         intptr_t my,        \
                                                         int width)          \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
    int shift = denom + 14 - 8;                                              \
                                                                             \
    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,       \
                                                 dst_stride, filter, height, \
                                                 weight, offset, shift);     \
}

UNI_W_MC(qpel, h, 4, 8, hz, mx);
UNI_W_MC(qpel, h, 8, 8, hz, mx);
UNI_W_MC(qpel, h, 12, 8, hz, mx);
UNI_W_MC(qpel, h, 16, 8, hz, mx);
UNI_W_MC(qpel, h, 24, 8, hz, mx);
UNI_W_MC(qpel, h, 32, 8, hz, mx);
UNI_W_MC(qpel, h, 48, 8, hz, mx);
UNI_W_MC(qpel, h, 64, 8, hz, mx);

UNI_W_MC(qpel, v, 4, 8, vt, my);
UNI_W_MC(qpel, v, 8, 8, vt, my);
UNI_W_MC(qpel, v, 12, 8, vt, my);
UNI_W_MC(qpel, v, 16, 8, vt, my);
UNI_W_MC(qpel, v, 24, 8, vt, my);
UNI_W_MC(qpel, v, 32, 8, vt, my);
UNI_W_MC(qpel, v, 48, 8, vt, my);
UNI_W_MC(qpel, v, 64, 8, vt, my);

UNI_W_MC(epel, h, 4, 4, hz, mx);
UNI_W_MC(epel, h, 6, 4, hz, mx);
UNI_W_MC(epel, h, 8, 4, hz, mx);
UNI_W_MC(epel, h, 12, 4, hz, mx);
UNI_W_MC(epel, h, 16, 4, hz, mx);
UNI_W_MC(epel, h, 24, 4, hz, mx);
UNI_W_MC(epel, h, 32, 4, hz, mx);

UNI_W_MC(epel, v, 4, 4, vt, my);
UNI_W_MC(epel, v, 6, 4, vt, my);
UNI_W_MC(epel, v, 8, 4, vt, my);
UNI_W_MC(epel, v, 12, 4, vt, my);
UNI_W_MC(epel, v, 16, 4, vt, my);
UNI_W_MC(epel, v, 24, 4, vt, my);
UNI_W_MC(epel, v, 32, 4, vt, my);

#undef UNI_W_MC

#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int height,            \
                                                      int denom,             \
                                                      int weight,            \
                                                      int offset,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                \
    int shift = denom + 14 - 8;                                              \
                                                                             \
    hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                           filter_x, filter_y, height,       \
                                           weight, offset, shift);           \
}

UNI_W_MC_HV(qpel, 4, 8);
UNI_W_MC_HV(qpel, 8, 8);
UNI_W_MC_HV(qpel, 12, 8);
UNI_W_MC_HV(qpel, 16, 8);
UNI_W_MC_HV(qpel, 24, 8);
UNI_W_MC_HV(qpel, 32, 8);
UNI_W_MC_HV(qpel, 48, 8);
UNI_W_MC_HV(qpel, 64, 8);

UNI_W_MC_HV(epel, 4, 4);
UNI_W_MC_HV(epel, 6, 4);
UNI_W_MC_HV(epel, 8, 4);
UNI_W_MC_HV(epel, 12, 4);
UNI_W_MC_HV(epel, 16, 4);
UNI_W_MC_HV(epel, 24, 4);
UNI_W_MC_HV(epel, 32, 4);

#undef UNI_W_MC_HV
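
/*
 * Illustrative scalar equivalent of the uni-weighted copy wrappers above,
 * guarded by #if 0 so it is never built. UNIWGT_MC_COPY passes
 * shift = denom + 14 - 8 for the 8-bit functions; the MSA kernels promote
 * each pixel by << 6, multiply by weight, apply a rounding right shift by
 * that amount, add the offset and clip, which is what this sketch spells
 * out. The function name is an invention of this sketch.
 */
#if 0
static void hevc_uniwgt_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int width, int height,
                                 int denom, int weight, int offset)
{
    int shift = denom + 14 - 8;   /* as computed in UNIWGT_MC_COPY */
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int val = src[x] << 6;                              /* SLLI_*V       */
            val = (val * weight + (1 << (shift - 1))) >> shift; /* DOTP + SRAR   */
            val += offset;                                      /* ADDS          */
            dst[x] = av_clip_uint8(val);                        /* CLIP_*_0_255  */
        }
        src += src_stride;
        dst += dst_stride;
    }
}
#endif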