/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}

#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
                          vec0, vec1, vec2, vec3, rnd_val,         \
                          out0, out1, out2, out3)                  \
{                                                                  \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
}

#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,  \
                                   out0, out1)                     \
{                                                                  \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                 \
    SRARI_H2_SH(out0, out1, rnd_val);                              \
    CLIP_SH2_0_255(out0, out1);                                    \
}

#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,    \
                                   vec3, rnd_val, out0, out1, out2, out3)   \
{                                                                           \
    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
}

static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3;

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;
        dst0 += in0;
        dst0 = __msa_srari_h(dst0, 7);
        CLIP_SH_0_255(dst0);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(dst0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
    }
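    /* remaining heights are a multiple of 8: four 32-bit rows are packed
     * per source vector and eight rows are written per pass */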
    else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
                   in7);
            src1_ptr += (8 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
                                       dst7, 7, dst4, dst5, dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
                  dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
                   dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v16i8 zero = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
        ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
        ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
        ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
                                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
                                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

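    /* 24-wide rows are split 16 + 8: full-vector stores (ST_UB4) cover the
     * left 16 columns, doubleword stores (ST_D4) the right 8. The loop is
     * fixed at 8 passes of 4 rows (32 rows), independent of the height
     * argument. */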
    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SB3(src0_ptr, 16, src3, src4, src5);
        src0_ptr += src_stride;

        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);

        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    int32_t tmp0, tmp1;
    int64_t tmp2, tmp3;
    v16i8 src0, src1, src2, src3;
    v16i8 vec0, vec1, vec2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
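    /* Two mask sets are needed for 12-wide rows: mask0..mask3 (the 8-width
     * pattern) gather taps for the left 8 columns, while mask4..mask7 use
     * the two-source pattern at ff_hevc_mask_arr[16] to gather the right
     * 4 columns of two rows into a single vector. */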
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;

        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);

        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(in2, dst2);
        dst2 = __msa_srari_h(dst2, 7);
        CLIP_SH_0_255(dst2);
        PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);

        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
        SD(tmp2, dst);
        SW(tmp0, dst + 8);
        dst += dst_stride;
        SD(tmp3, dst);
        SW(tmp1, dst + 8);
        dst += dst_stride;
    }
}

static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    v16i8 src0, src1, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
                     dst1, dst2, dst0);
        VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
                     dst2, dst0, dst1);
        VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
                     dst0, dst1, dst2);

        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        CLIP_SH_0_255(dst2);

        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        ST_SB(tmp0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }
}

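/* In the 32- and 64-wide horizontal filters, mask4..mask7 (mask0 + 8..14)
 * select taps that straddle the boundary between two adjacent 16-byte
 * loads; an extra unaligned load at offset 24 (and 56 for the 64-wide
 * case) supplies the rightmost columns of each 32-byte half. */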
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

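    /* Note: the row loop is hardcoded to 64 iterations for the 48-wide
     * case; the height argument is not consulted here. */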
    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src3 = LD_SB(src0_ptr + 40);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB(tmp0, dst);
        ST_SB(tmp1, dst + 16);

        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;

        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
                     dst5, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
                     dst5, dst4, dst5);

        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 3;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SB2(src0_ptr + 32, 16, src3, src4);
        src5 = LD_SB(src0_ptr + 56);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);

        src0 = src3;
        src1 = src4;
        src2 = src5;

        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst + 32, 16);
        src1_ptr += src2_stride;
        src0_ptr += src_stride;
        dst += dst_stride;
    }
}

static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

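    /* Prime the vertical filter with the first 7 rows. For 4-wide output
     * the right-interleaved row pairs are packed two per vector (src2110
     * holds the row 0-1 and 1-2 interleaves), so each dot-product chain
     * below filters two output rows at once. */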
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

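    /* Four output rows per pass; the six interleaved row pairs set up
     * above form a sliding window that is shifted at the end of the loop. */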
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}

static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                           int32_t src_stride,
                                           int16_t *src1_ptr,
                                           int32_t src2_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter,
                                           int32_t height, int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

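        /* Two rows per pass: each 16-wide row is split into right (_r) and
         * left (_l) byte interleaves so both 8-column halves run the same
         * 8-tap dot products. */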
        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);
            XORI_B2_128_SB(src7, src8);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);

            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}

static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);
}

static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);
    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, filter, height);
}

static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 32);
}

static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 48);
}

static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 64);
}

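/* 2-D (horizontal + vertical) 8-tap filters: rows are first filtered
 * horizontally into 16-bit intermediates, then the vertical filter
 * accumulates those at 32-bit precision and shifts right by 6 before the
 * bi-prediction add, rounding (by 7) and clip to 0..255. const_vec
 * (128 << 6) restores the bias removed by XORI-ing the source bytes with
 * 128 (the 8-tap coefficients sum to 64). */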
static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 out0, out1;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v4i32 dst0, dst1, dst2, dst3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src0_ptr -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

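    /* The seven rows filtered above leave dst10..dst65 (plus dst66)
     * holding the vertical filter history; each pass below computes four
     * new rows and slides the window forward. */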
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        SRARI_H2_SH(out0, out1, 7);
        CLIP_SH2_0_255(out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
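
/* Generic 2-D 8-tap bi-prediction core for widths that are multiples of 8:
 * the block is walked in 8-pixel columns.  For each column the first seven
 * horizontally filtered rows are kept in registers and the vertical filter
 * then produces one output row per iteration, sliding that seven-row
 * window down by one row at a time. */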
static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height, int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, tmp;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;

    src0_ptr -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = LD_SB(src0_ptr_tmp);
            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
            src0_ptr_tmp += src_stride;

            in0 = LD_SH(src1_ptr_tmp);
            src1_ptr_tmp += src2_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
            tmp = __msa_srari_h(tmp, 7);
            CLIP_SH_0_255(tmp);
            out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
            ST_D1(out, 0, dst_tmp);
            dst_tmp += dst_stride;

            dst0 = dst1;
            dst1 = dst2;
            dst2 = dst3;
            dst3 = dst4;
            dst4 = dst5;
            dst5 = dst6;
            dst6 = dst7;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}

static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);
}
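
/* The 12-wide case is split into an 8-pixel column that repeats the generic
 * core logic inline, followed by a second pass over the remaining 4 pixels
 * that reuses the 4-wide shuffle masks (ff_hevc_mask_arr + 16) and emits
 * four rows per iteration.  The loop counts (16 and 4) appear to be
 * hardcoded for the 12x16 block size this path is dispatched for. */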
static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;

    src0_ptr -= ((3 * src_stride) + 3);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
           src6);
    src0_ptr_tmp += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);
    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                             filt2, filt3);
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);

    for (loop_cnt = 16; loop_cnt--;) {
        src7 = LD_SB(src0_ptr_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src0_ptr_tmp += src_stride;

        in0 = LD_SH(src1_ptr_tmp);
        src1_ptr_tmp += src2_stride;

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                 filt2, filt3);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_r >>= 6;
        dst0_l >>= 6;

        tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
        tmp = __msa_srari_h(tmp, 7);
        CLIP_SH_0_255(tmp);
        out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
        ST_D1(out, 0, dst_tmp);
        dst_tmp += dst_stride;

        dst0 = dst1;
        dst1 = dst2;
        dst2 = dst3;
        dst3 = dst4;
        dst4 = dst5;
        dst5 = dst6;
        dst6 = dst7;
    }

    src0_ptr += 8;
    dst += 8;
    src1_ptr += 8;

    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
               vec12, vec13, vec14, vec15);
    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        SRARI_H2_SH(out0, out1, 7);
        CLIP_SH2_0_255(out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}

static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 16);
}
static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}

static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}

static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 48);
}

static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 64);
}
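
/* 4-tap (chroma) horizontal bi-prediction paths follow.  Per output pixel
 * the combination step is, in scalar terms, roughly the following sketch
 * (illustrative only, not a literal transcription of the vector code;
 * f[] are the 4 chroma taps, p points at the bytes of the second
 * reference and src1_val is the co-located 16-bit first prediction):
 *
 *     sum = f[0] * p[-1] + f[1] * p[0] + f[2] * p[1] + f[3] * p[2];
 *     dst = av_clip_uint8((sum + src1_val + 64) >> 7);
 *
 * The vector code biases the source bytes by 128 (XORI) so that signed
 * byte dot products can be used, and pre-loads const_vec = 128 << 6 into
 * the accumulators to cancel that bias. */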
static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, dst0, vec0, vec1;
    v8i16 in0, in1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v8i16 tmp0;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    tmp0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);

    tmp0 = __msa_adds_s_h(tmp0, in0);
    tmp0 = __msa_srari_h(tmp0, 7);
    CLIP_SH_0_255(tmp0);
    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);

    ST_W2(dst0, 0, 1, dst, dst_stride);
}

static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v16i8 vec2, vec3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v8i16 tmp0, tmp1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    XORI_B4_128_SB(src0, src1, src2, src3);

    tmp0 = const_vec;
    tmp1 = const_vec;
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
                 tmp0, tmp1);
    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);

    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}

static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 dst0, dst1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        tmp0 = const_vec;
        tmp1 = const_vec;
        tmp2 = const_vec;
        tmp3 = const_vec;
        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
                     tmp1, tmp2, tmp3);
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
                     tmp1, tmp2, tmp3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (4 == height) {
        hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (8 == height || 16 == height) {
        hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
}
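
/* 6-wide rows do not map onto a whole vector store, so each output row is
 * written as one 4-byte word followed by one 2-byte halfword (the ST_W2 /
 * ST_H2 pairs below). */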
static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_W2(dst0, 0, 2, dst, dst_stride);
        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v8i16 in0, in1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src0, src1);

    dst0 = const_vec;
    dst1 = const_vec;
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
                 dst0, dst1);
    HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);

    dst0 = const_vec;
    dst1 = const_vec;
    dst2 = const_vec;
    dst3 = const_vec;
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
                 dst2, dst3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
                 dst2, dst3);
    dst4 = const_vec;
    dst5 = const_vec;

    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
                 dst4, dst5);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
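
/* Generic 8-wide horizontal loop: four rows per iteration, for any height
 * that is a multiple of 4. */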
static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (6 == height) {
        hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (0 == (height % 4)) {
        hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
}

static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 mask1, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src0, src2);
        LD_SB2(src0_ptr + 8, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        src1_ptr += (2 * src2_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
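
/* 24-wide rows are split into a 16-pixel part and an 8-pixel part; the
 * 8-pixel remainder is written through dst_tmp, 16 bytes into each output
 * row, from the second half of the same loop iteration. */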
static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    dst_tmp = dst + 16;
    src1_ptr_tmp = src1_ptr + 16;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        dst4 = const_vec;
        dst5 = const_vec;
        dst6 = const_vec;
        dst7 = const_vec;
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
                     dst5, dst6, dst7);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
                     dst5, dst6, dst7);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }
}

static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 1;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);
        dst += dst_stride;
    }
}
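
/* 4-tap (chroma) vertical bi-prediction paths follow.  They keep the most
 * recent rows byte-interleaved with their successors (the srcNM_r pairs),
 * so each dot product consumes two adjacent rows at once and the window
 * slides by simple register renaming. */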
static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v8i16 dst10;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst10 = __msa_adds_s_h(dst10, in0);
    dst10 = __msa_srari_h(dst10, 7);
    CLIP_SH_0_255(dst10);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
    ST_W2(dst10, 0, 1, dst, dst_stride);
}

static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 dst10, dst32;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    XORI_B2_128_SB(src4332, src6554);

    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst32 = const_vec;
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
}
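
/* Eight rows per iteration: pairs of interleaved rows are packed into
 * single vectors (src2110, src4332, ...) so that two 4-wide rows share one
 * dot product. */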
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);

        dst10 = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        dst76 = const_vec;
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    if (2 == height) {
        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (4 == height) {
        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else {
        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
}
static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src5, src6);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src7, src8);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src9, src10);
    src0_ptr += (2 * src_stride);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);

    XORI_B3_128_SB(src0, src1, src2);
    XORI_B2_128_SB(src3, src4);
    XORI_B2_128_SB(src5, src6);
    XORI_B2_128_SB(src7, src8);
    XORI_B2_128_SB(src9, src10);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);

    dst0_r = const_vec;
    DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);

    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);
}
static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst0_r, dst1_r;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);

    ST_D2(dst0_r, 0, 1, dst, dst_stride);
}

static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    dst4_r = const_vec;
    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
    dst5_r = const_vec;
    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);
    HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
    ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    if (2 == height) {
        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (6 == height) {
        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else {
        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
}
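
/* 12-wide: the left 8 columns use the right (low-half) interleaves of each
 * row pair, while the remaining 4 columns come from the high-half
 * interleaves packed two rows per vector (src2110 and friends) and are
 * stored with 4-byte writes at dst + 8. */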
static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (1 * src_stride);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SB2(src0_ptr, src_stride, src5, src6);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B2_128_SB(src3, src4);
        XORI_B2_128_SB(src5, src6);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src2110 = src6554;
    }
}
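
/* 16-wide: four rows per iteration, in two two-row stages; the second
 * stage recycles src2 as the newest row so the next iteration's sliding
 * window is already in place without extra moves. */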
const_vec; 3510 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3511 dst0_l = const_vec; 3512 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3513 dst1_l = const_vec; 3514 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3515 HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3516 dst0_r, dst1_r, dst0_l, dst1_l, 7, 3517 dst0_r, dst1_r, dst0_l, dst1_l); 3518 3519 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3520 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 3521 dst += (2 * dst_stride); 3522 3523 LD_SB2(src0_ptr, src_stride, src5, src2); 3524 src0_ptr += (2 * src_stride); 3525 LD_SH2(src1_ptr, src2_stride, in0, in1); 3526 LD_SH2((src1_ptr + 8), src2_stride, in2, in3); 3527 src1_ptr += (2 * src2_stride); 3528 XORI_B2_128_SB(src5, src2); 3529 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3530 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3531 3532 dst0_r = const_vec; 3533 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 3534 dst0_l = const_vec; 3535 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3536 dst1_r = const_vec; 3537 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3538 dst1_l = const_vec; 3539 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3540 HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3541 dst0_r, dst1_r, dst0_l, dst1_l, 7, 3542 dst0_r, dst1_r, dst0_l, dst1_l); 3543 3544 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3545 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 3546 dst += (2 * dst_stride); 3547 } 3548} 3549 3550static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr, 3551 int32_t src_stride, 3552 int16_t *src1_ptr, 3553 int32_t src2_stride, 3554 uint8_t *dst, 3555 int32_t dst_stride, 3556 const int8_t *filter, 3557 int32_t height) 3558{ 3559 uint32_t loop_cnt; 3560 v16i8 src0, src1, src2, src3, src4, src5; 3561 v16i8 src6, src7, src8, src9, src10, src11; 3562 v8i16 in0, in1, in2, in3, in4, in5; 3563 v16i8 src10_r, src32_r, src76_r, src98_r; 3564 v16i8 src21_r, src43_r, src87_r, src109_r; 3565 v16i8 src10_l, src32_l, src21_l, src43_l; 3566 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3567 v8i16 dst0_l, dst1_l; 3568 v8i16 filt0, filt1; 3569 v8i16 filter_vec, const_vec; 3570 3571 src0_ptr -= src_stride; 3572 3573 const_vec = __msa_ldi_h(128); 3574 const_vec <<= 6; 3575 3576 filter_vec = LD_SH(filter); 3577 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3578 3579 /* 16width */ 3580 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3581 XORI_B3_128_SB(src0, src1, src2); 3582 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3583 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3584 /* 8width */ 3585 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); 3586 src0_ptr += (3 * src_stride); 3587 XORI_B3_128_SB(src6, src7, src8); 3588 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3589 3590 for (loop_cnt = (height >> 2); loop_cnt--;) { 3591 /* 16width */ 3592 LD_SB2(src0_ptr, src_stride, src3, src4); 3593 LD_SH2(src1_ptr, src2_stride, in0, in1); 3594 LD_SH2((src1_ptr + 8), src2_stride, in2, in3); 3595 LD_SH2((src1_ptr + 16), src2_stride, in4, in5); 3596 src1_ptr += (2 * src2_stride); 3597 XORI_B2_128_SB(src3, src4); 3598 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3599 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3600 /* 8width */ 3601 LD_SB2(src0_ptr + 16, src_stride, src9, src10); 3602 src0_ptr += (2 * src_stride); 3603 XORI_B2_128_SB(src9, src10); 3604 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3605 /* 16width */ 3606 dst0_r = 
const_vec; 3607 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3608 dst0_l = const_vec; 3609 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3610 dst1_r = const_vec; 3611 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3612 dst1_l = const_vec; 3613 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3614 /* 8width */ 3615 dst2_r = const_vec; 3616 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); 3617 dst3_r = const_vec; 3618 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); 3619 /* 16width */ 3620 HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3621 dst0_r, dst1_r, dst0_l, dst1_l, 7, 3622 dst0_r, dst1_r, dst0_l, dst1_l); 3623 3624 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r); 3625 3626 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3627 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r); 3628 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 3629 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride); 3630 dst += (2 * dst_stride); 3631 3632 /* 16width */ 3633 LD_SB2(src0_ptr, src_stride, src5, src2); 3634 LD_SH2(src1_ptr, src2_stride, in0, in1); 3635 LD_SH2((src1_ptr + 8), src2_stride, in2, in3); 3636 LD_SH2((src1_ptr + 16), src2_stride, in4, in5); 3637 src1_ptr += (2 * src2_stride); 3638 XORI_B2_128_SB(src5, src2); 3639 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3640 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3641 /* 8width */ 3642 LD_SB2(src0_ptr + 16, src_stride, src11, src8); 3643 src0_ptr += (2 * src_stride); 3644 XORI_B2_128_SB(src11, src8); 3645 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 3646 /* 16width */ 3647 dst0_r = const_vec; 3648 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 3649 dst0_l = const_vec; 3650 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3651 dst1_r = const_vec; 3652 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3653 dst1_l = const_vec; 3654 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3655 /* 8width */ 3656 dst2_r = const_vec; 3657 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); 3658 dst3_r = const_vec; 3659 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); 3660 3661 HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3662 dst0_r, dst1_r, dst0_l, dst1_l, 7, 3663 dst0_r, dst1_r, dst0_l, dst1_l); 3664 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r); 3665 3666 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3667 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r); 3668 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 3669 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride); 3670 dst += (2 * dst_stride); 3671 } 3672} 3673 3674static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr, 3675 int32_t src_stride, 3676 int16_t *src1_ptr, 3677 int32_t src2_stride, 3678 uint8_t *dst, 3679 int32_t dst_stride, 3680 const int8_t *filter, 3681 int32_t height) 3682{ 3683 uint32_t loop_cnt; 3684 uint8_t *dst_tmp = dst + 16; 3685 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; 3686 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3687 v16i8 src10_r, src32_r, src76_r, src98_r; 3688 v16i8 src21_r, src43_r, src87_r, src109_r; 3689 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3690 v16i8 src10_l, src32_l, src76_l, src98_l; 3691 v16i8 src21_l, src43_l, src87_l, src109_l; 3692 v8i16 dst0_l, dst1_l, dst2_l, dst3_l; 3693 v8i16 filt0, filt1; 3694 v8i16 filter_vec, const_vec; 3695 3696 src0_ptr -= src_stride; 3697 3698 const_vec = __msa_ldi_h(128); 3699 const_vec <<= 6; 
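    /* const_vec = 128 << 6 compensates for the XORI_B*_128_SB below:
     * source bytes are XORed with 128 so that signed dot products can
     * be used, and since the 4-tap filter coefficients sum to 64, that
     * bias shows up in the filtered output as -(128 * 64). */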
3700 3701 filter_vec = LD_SH(filter); 3702 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3703 3704 /* 16width */ 3705 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3706 XORI_B3_128_SB(src0, src1, src2); 3707 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3708 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3709 3710 /* next 16width */ 3711 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); 3712 src0_ptr += (3 * src_stride); 3713 XORI_B3_128_SB(src6, src7, src8); 3714 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3715 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 3716 3717 for (loop_cnt = (height >> 1); loop_cnt--;) { 3718 /* 16width */ 3719 LD_SB2(src0_ptr, src_stride, src3, src4); 3720 LD_SH2(src1_ptr, src2_stride, in0, in1); 3721 LD_SH2((src1_ptr + 8), src2_stride, in2, in3); 3722 LD_SH2((src1_ptr + 16), src2_stride, in4, in5); 3723 LD_SH2((src1_ptr + 24), src2_stride, in6, in7); 3724 src1_ptr += (2 * src2_stride); 3725 XORI_B2_128_SB(src3, src4); 3726 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3727 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3728 /* 16width */ 3729 dst0_r = const_vec; 3730 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3731 dst0_l = const_vec; 3732 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3733 dst1_r = const_vec; 3734 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3735 dst1_l = const_vec; 3736 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3737 /* 16width */ 3738 HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3739 dst0_r, dst1_r, dst0_l, dst1_l, 7, 3740 dst0_r, dst1_r, dst0_l, dst1_l); 3741 3742 src10_r = src32_r; 3743 src21_r = src43_r; 3744 src10_l = src32_l; 3745 src21_l = src43_l; 3746 src2 = src4; 3747 3748 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3749 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 3750 dst += (2 * dst_stride); 3751 3752 /* next 16width */ 3753 LD_SB2(src0_ptr + 16, src_stride, src9, src10); 3754 src0_ptr += (2 * src_stride); 3755 XORI_B2_128_SB(src9, src10); 3756 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3757 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); 3758 /* next 16width */ 3759 dst2_r = const_vec; 3760 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); 3761 dst2_l = const_vec; 3762 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l); 3763 dst3_r = const_vec; 3764 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); 3765 dst3_l = const_vec; 3766 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l); 3767 /* next 16width */ 3768 HEVC_BI_RND_CLIP4(in4, in5, in6, in7, 3769 dst2_r, dst3_r, dst2_l, dst3_l, 7, 3770 dst2_r, dst3_r, dst2_l, dst3_l); 3771 3772 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r); 3773 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride); 3774 dst_tmp += (2 * dst_stride); 3775 3776 src76_r = src98_r; 3777 src87_r = src109_r; 3778 src76_l = src98_l; 3779 src87_l = src109_l; 3780 src8 = src10; 3781 } 3782} 3783 3784static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr, 3785 int32_t src_stride, 3786 int16_t *src1_ptr, 3787 int32_t src2_stride, 3788 uint8_t *dst, 3789 int32_t dst_stride, 3790 const int8_t *filter_x, 3791 const int8_t *filter_y) 3792{ 3793 uint64_t tp0, tp1; 3794 v16u8 out; 3795 v8i16 in0 = { 0 }; 3796 v16i8 src0, src1, src2, src3, src4; 3797 v8i16 filt0, filt1; 3798 v8i16 filt_h0, filt_h1; 3799 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3800 v16i8 mask1; 3801 v8i16 filter_vec, const_vec; 3802 v16i8 vec0, vec1, vec2, vec3, 
vec4, vec5; 3803 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp; 3804 v4i32 dst0, dst1; 3805 3806 src0_ptr -= (src_stride + 1); 3807 3808 filter_vec = LD_SH(filter_x); 3809 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3810 3811 filter_vec = LD_SH(filter_y); 3812 UNPCK_R_SB_SH(filter_vec, filter_vec); 3813 3814 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3815 3816 mask1 = mask0 + 2; 3817 3818 const_vec = __msa_ldi_h(128); 3819 const_vec <<= 6; 3820 3821 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4); 3822 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3823 3824 LD2(src1_ptr, src2_stride, tp0, tp1); 3825 INSERT_D2_SH(tp0, tp1, in0); 3826 in0 = __msa_adds_s_h(in0, const_vec); 3827 3828 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1); 3829 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3); 3830 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5); 3831 3832 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3833 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3834 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3835 3836 ILVRL_H2_SH(dst31, dst20, dst10, dst32); 3837 ILVRL_H2_SH(dst42, dst31, dst21, dst43); 3838 3839 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3840 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3841 dst0 >>= 6; 3842 dst1 >>= 6; 3843 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 3844 tmp = __msa_adds_s_h(tmp, in0); 3845 tmp = __msa_srari_h(tmp, 7); 3846 CLIP_SH_0_255(tmp); 3847 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); 3848 ST_W2(out, 0, 1, dst, dst_stride); 3849} 3850 3851static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr, 3852 int32_t src_stride, 3853 int16_t *src1_ptr, 3854 int32_t src2_stride, 3855 uint8_t *dst, 3856 int32_t dst_stride, 3857 const int8_t *filter_x, 3858 const int8_t *filter_y) 3859{ 3860 uint64_t tp0, tp1; 3861 v16u8 out; 3862 v16i8 src0, src1, src2, src3, src4, src5, src6; 3863 v8i16 filt0, filt1; 3864 v8i16 filt_h0, filt_h1; 3865 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3866 v16i8 mask1; 3867 v8i16 filter_vec, const_vec; 3868 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3869 v8i16 tmp0, tmp1; 3870 v8i16 in0 = { 0 }, in1 = { 0 }; 3871 v8i16 dst30, dst41, dst52, dst63; 3872 v8i16 dst10, dst32, dst54, dst21, dst43, dst65; 3873 v4i32 dst0, dst1, dst2, dst3; 3874 3875 src0_ptr -= (src_stride + 1); 3876 3877 filter_vec = LD_SH(filter_x); 3878 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3879 3880 filter_vec = LD_SH(filter_y); 3881 UNPCK_R_SB_SH(filter_vec, filter_vec); 3882 3883 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3884 3885 mask1 = mask0 + 2; 3886 3887 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 3888 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 3889 3890 const_vec = __msa_ldi_h(128); 3891 const_vec <<= 6; 3892 3893 LD2(src1_ptr, src2_stride, tp0, tp1); 3894 src1_ptr += 2 * src2_stride; 3895 INSERT_D2_SH(tp0, tp1, in0); 3896 LD2(src1_ptr, src2_stride, tp0, tp1); 3897 INSERT_D2_SH(tp0, tp1, in1); 3898 3899 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1); 3900 3901 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1); 3902 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3); 3903 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5); 3904 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7); 3905 3906 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3907 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3908 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, 
filt1); 3909 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3910 3911 ILVRL_H2_SH(dst41, dst30, dst10, dst43); 3912 ILVRL_H2_SH(dst52, dst41, dst21, dst54); 3913 ILVRL_H2_SH(dst63, dst52, dst32, dst65); 3914 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3915 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3916 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1); 3917 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1); 3918 SRA_4V(dst0, dst1, dst2, dst3, 6); 3919 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 3920 ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1); 3921 SRARI_H2_SH(tmp0, tmp1, 7); 3922 CLIP_SH2_0_255(tmp0, tmp1); 3923 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 3924 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 3925} 3926 3927static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr, 3928 int32_t src_stride, 3929 int16_t *src1_ptr, 3930 int32_t src2_stride, 3931 uint8_t *dst, 3932 int32_t dst_stride, 3933 const int8_t *filter_x, 3934 const int8_t *filter_y, 3935 int32_t height) 3936{ 3937 uint32_t loop_cnt; 3938 uint64_t tp0, tp1; 3939 v16u8 out0, out1; 3940 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3941 v8i16 filt0, filt1; 3942 v8i16 filt_h0, filt_h1; 3943 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3944 v16i8 mask1; 3945 v8i16 filter_vec, const_vec; 3946 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3947 v8i16 tmp0, tmp1, tmp2, tmp3; 3948 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 3949 v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 3950 v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 3951 v8i16 dst98_r, dst109_r; 3952 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 3953 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; 3954 3955 src0_ptr -= (src_stride + 1); 3956 3957 filter_vec = LD_SH(filter_x); 3958 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3959 3960 filter_vec = LD_SH(filter_y); 3961 UNPCK_R_SB_SH(filter_vec, filter_vec); 3962 3963 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3964 3965 mask1 = mask0 + 2; 3966 3967 const_vec = __msa_ldi_h(128); 3968 const_vec <<= 6; 3969 3970 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3971 src0_ptr += (3 * src_stride); 3972 XORI_B3_128_SB(src0, src1, src2); 3973 3974 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 3975 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3); 3976 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3977 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3978 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 3979 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 3980 3981 3982 for (loop_cnt = height >> 3; loop_cnt--;) { 3983 LD_SB8(src0_ptr, src_stride, 3984 src3, src4, src5, src6, src7, src8, src9, src10); 3985 src0_ptr += (8 * src_stride); 3986 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3987 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1); 3988 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3); 3989 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5); 3990 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7); 3991 3992 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3993 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3994 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3995 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3996 3997 dst32_r = __msa_ilvr_h(dst73, dst22); 3998 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 3999 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4000 
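        /* each dstNM vector packs the horizontal filter results of rows
         * M (low half) and N (high half), four samples each; the
         * ilvr/ilvl pairs here rebuild consecutive-row operands for the
         * vertical taps, with row 7 splatted out of dst73 to complete
         * the (6,7) pair. */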
ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4001 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 4002 dst76_r = __msa_ilvr_h(dst22, dst106); 4003 4004 LD2(src1_ptr, src2_stride, tp0, tp1); 4005 src1_ptr += 2 * src2_stride; 4006 INSERT_D2_SH(tp0, tp1, in0); 4007 LD2(src1_ptr, src2_stride, tp0, tp1); 4008 src1_ptr += 2 * src2_stride; 4009 INSERT_D2_SH(tp0, tp1, in1); 4010 4011 LD2(src1_ptr, src2_stride, tp0, tp1); 4012 src1_ptr += 2 * src2_stride; 4013 INSERT_D2_SH(tp0, tp1, in2); 4014 LD2(src1_ptr, src2_stride, tp0, tp1); 4015 src1_ptr += 2 * src2_stride; 4016 INSERT_D2_SH(tp0, tp1, in3); 4017 4018 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4019 const_vec, in0, in1, in2, in3); 4020 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4021 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4022 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4023 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4024 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4025 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4026 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4027 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4028 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 4029 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 4030 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r, 4031 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3); 4032 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, 4033 tmp2, tmp3); 4034 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4035 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4036 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4037 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4038 dst += (8 * dst_stride); 4039 4040 dst10_r = dst98_r; 4041 dst21_r = dst109_r; 4042 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4043 } 4044} 4045 4046static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr, 4047 int32_t src_stride, 4048 int16_t *src1_ptr, 4049 int32_t src2_stride, 4050 uint8_t *dst, 4051 int32_t dst_stride, 4052 const int8_t *filter_x, 4053 const int8_t *filter_y, 4054 int32_t height) 4055{ 4056 if (2 == height) { 4057 hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4058 dst, dst_stride, filter_x, filter_y); 4059 } else if (4 == height) { 4060 hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4061 dst, dst_stride, filter_x, filter_y); 4062 } else if (0 == (height % 8)) { 4063 hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride, 4064 src1_ptr, src2_stride, 4065 dst, dst_stride, 4066 filter_x, filter_y, height); 4067 } 4068} 4069 4070static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, 4071 int32_t src_stride, 4072 int16_t *src1_ptr, 4073 int32_t src2_stride, 4074 uint8_t *dst, 4075 int32_t dst_stride, 4076 const int8_t *filter_x, 4077 const int8_t *filter_y, 4078 int32_t height) 4079{ 4080 uint32_t tpw0, tpw1, tpw2, tpw3; 4081 uint64_t tp0, tp1; 4082 v16u8 out0, out1, out2; 4083 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4084 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4085 v8i16 filt0, filt1; 4086 v8i16 filt_h0, filt_h1; 4087 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4088 v16i8 mask1; 4089 v8i16 filter_vec, const_vec; 4090 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9; 4091 v8i16 dsth10, tmp4, tmp5; 4092 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4093 v4i32 dst4_r, dst5_r, dst6_r, dst7_r; 4094 v8i16 tmp0, tmp1, tmp2, tmp3; 4095 
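    /* the dstNM_r/_l vectors below hold right/left interleaves of the
     * horizontally filtered rows N and M, i.e. the operands consumed by
     * the vertical 4-tap filter. */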
v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 4096 v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 4097 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r; 4098 v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l; 4099 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l; 4100 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 4101 v8i16 in4 = { 0 }, in5 = { 0 }; 4102 4103 src0_ptr -= (src_stride + 1); 4104 4105 filter_vec = LD_SH(filter_x); 4106 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4107 4108 filter_vec = LD_SH(filter_y); 4109 UNPCK_R_SB_SH(filter_vec, filter_vec); 4110 4111 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4112 4113 mask1 = mask0 + 2; 4114 4115 const_vec = __msa_ldi_h(128); 4116 const_vec <<= 6; 4117 4118 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4119 src0_ptr += (3 * src_stride); 4120 XORI_B3_128_SB(src0, src1, src2); 4121 4122 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4123 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4124 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4125 4126 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4127 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4128 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4129 4130 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 4131 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 4132 4133 LD_SB8(src0_ptr, src_stride, 4134 src3, src4, src5, src6, src7, src8, src9, src10); 4135 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4136 4137 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4138 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4139 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4140 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4141 4142 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4143 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4144 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4145 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4146 4147 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 4148 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3); 4149 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5); 4150 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7); 4151 4152 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4153 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4154 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4155 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4156 4157 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 4158 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 4159 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 4160 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 4161 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 4162 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 4163 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l); 4164 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l); 4165 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l); 4166 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l); 4167 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l); 4168 4169 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4170 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4171 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4172 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4173 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4174 dst5_r = 
HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4175 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4176 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4177 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1); 4178 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1); 4179 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1); 4180 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1); 4181 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 4182 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 4183 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); 4184 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); 4185 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3); 4186 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5); 4187 4188 LD2(src1_ptr, src2_stride, tp0, tp1); 4189 INSERT_D2_SH(tp0, tp1, in0); 4190 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1); 4191 INSERT_D2_SH(tp0, tp1, in1); 4192 4193 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1); 4194 INSERT_D2_SH(tp0, tp1, in2); 4195 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1); 4196 INSERT_D2_SH(tp0, tp1, in3); 4197 4198 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec, 4199 in0, in1, in2, in3); 4200 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2, 4201 tmp3); 4202 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4203 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4204 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4205 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4206 4207 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3); 4208 src1_ptr += (4 * src2_stride); 4209 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4); 4210 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3); 4211 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5); 4212 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5); 4213 ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); 4214 SRARI_H2_SH(tmp4, tmp5, 7); 4215 CLIP_SH2_0_255(tmp4, tmp5); 4216 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 4217 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); 4218} 4219 4220static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr, 4221 int32_t src_stride, 4222 int16_t *src1_ptr, 4223 int32_t src2_stride, 4224 uint8_t *dst, 4225 int32_t dst_stride, 4226 const int8_t *filter_x, 4227 const int8_t *filter_y) 4228{ 4229 v16u8 out; 4230 v16i8 src0, src1, src2, src3, src4; 4231 v8i16 filt0, filt1; 4232 v8i16 filt_h0, filt_h1; 4233 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4234 v16i8 mask1; 4235 v8i16 filter_vec, const_vec; 4236 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 4237 v8i16 dst0, dst1, dst2, dst3, dst4; 4238 v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 4239 v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 4240 v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 4241 v8i16 tmp0, tmp1; 4242 v8i16 in0, in1; 4243 4244 src0_ptr -= (src_stride + 1); 4245 4246 filter_vec = LD_SH(filter_x); 4247 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4248 4249 filter_vec = LD_SH(filter_y); 4250 UNPCK_R_SB_SH(filter_vec, filter_vec); 4251 4252 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4253 4254 mask1 = mask0 + 2; 4255 4256 const_vec = __msa_ldi_h(128); 4257 const_vec <<= 6; 4258 4259 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4); 4260 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4261 4262 LD_SH2(src1_ptr, src2_stride, in0, in1); 4263 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1); 4264 4265 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, 
vec1); 4266 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4267 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4268 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 4269 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 4270 4271 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4272 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4273 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4274 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4275 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 4276 4277 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4278 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4279 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4280 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4281 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4282 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4283 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4284 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4285 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4286 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); 4287 ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1); 4288 SRARI_H2_SH(tmp0, tmp1, 7); 4289 CLIP_SH2_0_255(tmp0, tmp1); 4290 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 4291 ST_D2(out, 0, 1, dst, dst_stride); 4292} 4293 4294static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr, 4295 int32_t src_stride, 4296 int16_t *src1_ptr, 4297 int32_t src2_stride, 4298 uint8_t *dst, 4299 int32_t dst_stride, 4300 const int8_t *filter_x, 4301 const int8_t *filter_y, 4302 int32_t width8mult) 4303{ 4304 uint32_t cnt; 4305 v16u8 out0, out1; 4306 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1; 4307 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4308 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec; 4309 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3; 4310 v8i16 in0, in1, in2, in3; 4311 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4312 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4313 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4314 4315 src0_ptr -= (src_stride + 1); 4316 4317 filter_vec = LD_SH(filter_x); 4318 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4319 4320 filter_vec = LD_SH(filter_y); 4321 UNPCK_R_SB_SH(filter_vec, filter_vec); 4322 4323 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4324 4325 mask0 = LD_SB(ff_hevc_mask_arr); 4326 mask1 = mask0 + 2; 4327 4328 const_vec = __msa_ldi_h(128); 4329 const_vec <<= 6; 4330 4331 for (cnt = width8mult; cnt--;) { 4332 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 4333 src0_ptr += 8; 4334 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 4335 4336 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 4337 src1_ptr += 8; 4338 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4339 const_vec, in0, in1, in2, in3); 4340 4341 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4342 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4343 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4344 4345 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4346 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4347 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4348 4349 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4350 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4351 4352 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4353 VSHF_B2_SB(src4, 
src4, src4, src4, mask0, mask1, vec2, vec3); 4354 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4355 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4356 4357 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4358 dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4359 dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4360 dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4361 4362 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4363 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4364 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4365 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4366 4367 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4368 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4369 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4370 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4371 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4372 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4373 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4374 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4375 4376 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4377 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4378 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4379 dst3_r, tmp0, tmp1, tmp2, tmp3); 4380 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4381 tmp0, tmp1, tmp2, tmp3); 4382 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4383 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4384 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4385 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 4386 dst += 8; 4387 } 4388} 4389 4390static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr, 4391 int32_t src_stride, 4392 int16_t *src1_ptr, 4393 int32_t src2_stride, 4394 uint8_t *dst, 4395 int32_t dst_stride, 4396 const int8_t *filter_x, 4397 const int8_t *filter_y) 4398{ 4399 v16u8 out0, out1, out2; 4400 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4401 v8i16 in0, in1, in2, in3, in4, in5; 4402 v8i16 filt0, filt1; 4403 v8i16 filt_h0, filt_h1; 4404 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4405 v16i8 mask1; 4406 v8i16 filter_vec, const_vec; 4407 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 4408 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 4409 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4410 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 4411 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4412 v4i32 dst4_r, dst4_l, dst5_r, dst5_l; 4413 v8i16 dst10_r, dst32_r, dst10_l, dst32_l; 4414 v8i16 dst21_r, dst43_r, dst21_l, dst43_l; 4415 v8i16 dst54_r, dst54_l, dst65_r, dst65_l; 4416 v8i16 dst76_r, dst76_l, dst87_r, dst87_l; 4417 4418 src0_ptr -= (src_stride + 1); 4419 4420 filter_vec = LD_SH(filter_x); 4421 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4422 4423 filter_vec = LD_SH(filter_y); 4424 UNPCK_R_SB_SH(filter_vec, filter_vec); 4425 4426 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4427 4428 mask1 = mask0 + 2; 4429 4430 const_vec = __msa_ldi_h(128); 4431 const_vec <<= 6; 4432 4433 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4); 4434 src0_ptr += (5 * src_stride); 4435 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8); 4436 4437 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4438 XORI_B4_128_SB(src5, src6, src7, src8); 4439 4440 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 4441 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec, 4442 in0, in1, 
in2, in3); 4443 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5); 4444 4445 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4446 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4447 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4448 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 4449 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 4450 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11); 4451 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13); 4452 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15); 4453 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17); 4454 4455 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4456 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4457 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4458 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4459 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 4460 dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 4461 dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1); 4462 dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1); 4463 dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1); 4464 4465 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4466 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4467 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4468 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4469 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4470 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4471 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 4472 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 4473 4474 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4475 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4476 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4477 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4478 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4479 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4480 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4481 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4482 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4483 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); 4484 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4485 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); 4486 4487 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4488 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4489 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6); 4490 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r, 4491 tmp0, tmp1, tmp2, tmp3); 4492 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5); 4493 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4494 tmp0, tmp1, tmp2, tmp3); 4495 ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); 4496 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4497 SRARI_H2_SH(tmp4, tmp5, 7); 4498 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4499 CLIP_SH2_0_255(tmp4, tmp5); 4500 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4501 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 4502 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 4503 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 4504} 4505 4506static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr, 4507 int32_t src_stride, 4508 int16_t *src1_ptr, 4509 int32_t src2_stride, 4510 uint8_t *dst, 4511 int32_t dst_stride, 4512 const int8_t *filter_x, 4513 const int8_t *filter_y, 4514 int32_t height, 4515 int32_t width) 4516{ 4517 
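    /* common worker for the 8-, 16-, 24- and 32-wide hv cases: the
     * block is walked in 8-column strips (width >> 3 of them), each
     * strip running the horizontal + vertical 4-tap pipeline four rows
     * at a time. */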
uint32_t loop_cnt, cnt; 4518 uint8_t *src0_ptr_tmp; 4519 int16_t *src1_ptr_tmp; 4520 uint8_t *dst_tmp; 4521 v16u8 out0, out1; 4522 v16i8 src0, src1, src2, src3, src4, src5, src6; 4523 v8i16 in0, in1, in2, in3; 4524 v8i16 filt0, filt1; 4525 v8i16 filt_h0, filt_h1; 4526 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4527 v16i8 mask1; 4528 v8i16 filter_vec, const_vec; 4529 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4530 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 4531 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4532 v8i16 tmp0, tmp1, tmp2, tmp3; 4533 v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 4534 v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 4535 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6; 4536 4537 src0_ptr -= (src_stride + 1); 4538 4539 filter_vec = LD_SH(filter_x); 4540 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4541 4542 filter_vec = LD_SH(filter_y); 4543 UNPCK_R_SB_SH(filter_vec, filter_vec); 4544 4545 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4546 4547 mask1 = mask0 + 2; 4548 4549 const_vec = __msa_ldi_h(128); 4550 const_vec <<= 6; 4551 4552 for (cnt = width >> 3; cnt--;) { 4553 src0_ptr_tmp = src0_ptr; 4554 dst_tmp = dst; 4555 src1_ptr_tmp = src1_ptr; 4556 4557 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); 4558 src0_ptr_tmp += (3 * src_stride); 4559 XORI_B3_128_SB(src0, src1, src2); 4560 4561 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4562 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4563 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4564 4565 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4566 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4567 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4568 4569 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4570 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4571 4572 for (loop_cnt = height >> 2; loop_cnt--;) { 4573 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); 4574 src0_ptr_tmp += (4 * src_stride); 4575 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); 4576 src1_ptr_tmp += (4 * src2_stride); 4577 XORI_B4_128_SB(src3, src4, src5, src6); 4578 4579 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4580 const_vec, in0, in1, in2, in3); 4581 4582 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4583 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4584 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4585 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4586 4587 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4588 dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4589 dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4590 dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4591 4592 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4593 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4594 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4595 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4596 4597 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4598 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4599 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4600 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4601 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4602 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4603 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4604 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4605 4606 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4607 SRA_4V(dst2_r, 
dst2_l, dst3_r, dst3_l, 6); 4608 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4609 dst3_r, tmp0, tmp1, tmp2, tmp3); 4610 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4611 tmp0, tmp1, tmp2, tmp3); 4612 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4613 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4614 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4615 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 4616 dst_tmp += (4 * dst_stride); 4617 4618 dst10_r = dst54_r; 4619 dst10_l = dst54_l; 4620 dst21_r = dst65_r; 4621 dst21_l = dst65_l; 4622 dst2 = dst6; 4623 } 4624 4625 src0_ptr += 8; 4626 dst += 8; 4627 src1_ptr += 8; 4628 } 4629} 4630 4631static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, 4632 int32_t src_stride, 4633 int16_t *src1_ptr, 4634 int32_t src2_stride, 4635 uint8_t *dst, 4636 int32_t dst_stride, 4637 const int8_t *filter_x, 4638 const int8_t *filter_y, 4639 int32_t height) 4640{ 4641 if (2 == height) { 4642 hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4643 dst, dst_stride, filter_x, filter_y); 4644 } else if (4 == height) { 4645 hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4646 dst, dst_stride, filter_x, filter_y, 1); 4647 } else if (6 == height) { 4648 hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4649 dst, dst_stride, filter_x, filter_y); 4650 } else { 4651 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, 4652 src1_ptr, src2_stride, 4653 dst, dst_stride, 4654 filter_x, filter_y, height, 8); 4655 } 4656} 4657 4658static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, 4659 int32_t src_stride, 4660 int16_t *src1_ptr, 4661 int32_t src2_stride, 4662 uint8_t *dst, 4663 int32_t dst_stride, 4664 const int8_t *filter_x, 4665 const int8_t *filter_y, 4666 int32_t height) 4667{ 4668 uint32_t loop_cnt; 4669 uint64_t tp0, tp1; 4670 uint8_t *src0_ptr_tmp, *dst_tmp; 4671 int16_t *src1_ptr_tmp; 4672 v16u8 out0, out1; 4673 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4674 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4675 v16i8 mask0, mask1, mask2, mask3; 4676 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3; 4677 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec; 4678 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 4679 v8i16 dst76_r, dst98_r, dst87_r, dst109_r; 4680 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 4681 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4682 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4683 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4684 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4685 4686 src0_ptr -= (src_stride + 1); 4687 4688 filter_vec = LD_SH(filter_x); 4689 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4690 4691 filter_vec = LD_SH(filter_y); 4692 UNPCK_R_SB_SH(filter_vec, filter_vec); 4693 4694 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4695 4696 mask0 = LD_SB(ff_hevc_mask_arr); 4697 mask1 = mask0 + 2; 4698 4699 const_vec = __msa_ldi_h(128); 4700 const_vec <<= 6; 4701 4702 src0_ptr_tmp = src0_ptr; 4703 dst_tmp = dst; 4704 src1_ptr_tmp = src1_ptr; 4705 4706 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); 4707 src0_ptr_tmp += (3 * src_stride); 4708 4709 XORI_B3_128_SB(src0, src1, src2); 4710 4711 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4712 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4713 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4714 4715 
dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4716 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4717 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4718 4719 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 4720 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 4721 4722 for (loop_cnt = 4; loop_cnt--;) { 4723 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); 4724 src0_ptr_tmp += (4 * src_stride); 4725 XORI_B4_128_SB(src3, src4, src5, src6); 4726 4727 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); 4728 src1_ptr_tmp += (4 * src2_stride); 4729 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4730 const_vec, in0, in1, in2, in3); 4731 4732 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4733 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4734 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4735 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4736 4737 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4738 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4739 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4740 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4741 4742 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 4743 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 4744 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 4745 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 4746 4747 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4748 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4749 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4750 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4751 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4752 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4753 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4754 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4755 4756 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4757 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4758 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4759 dst3_r, tmp0, tmp1, tmp2, tmp3); 4760 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4761 tmp0, tmp1, tmp2, tmp3); 4762 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4763 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4764 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4765 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 4766 dst_tmp += (4 * dst_stride); 4767 4768 dst10_r = dst54_r; 4769 dst10_l = dst54_l; 4770 dst21_r = dst65_r; 4771 dst21_l = dst65_l; 4772 dsth2 = dsth6; 4773 } 4774 4775 src0_ptr += 8; 4776 dst += 8; 4777 src1_ptr += 8; 4778 4779 mask2 = LD_SB(ff_hevc_mask_arr + 16); 4780 mask3 = mask2 + 2; 4781 4782 LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4783 src0_ptr += (3 * src_stride); 4784 XORI_B3_128_SB(src0, src1, src2); 4785 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 4786 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3); 4787 4788 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4789 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4790 4791 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 4792 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 4793 4794 for (loop_cnt = 2; loop_cnt--;) { 4795 LD_SB8(src0_ptr, src_stride, 4796 src3, src4, src5, src6, src7, src8, src9, src10); 4797 src0_ptr += (8 * src_stride); 4798 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4799 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1); 4800 VSHF_B2_SB(src4, src8, 
src4, src8, mask2, mask3, vec2, vec3); 4801 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5); 4802 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7); 4803 4804 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4805 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4806 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4807 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4808 4809 dst32_r = __msa_ilvr_h(dst73, dst22); 4810 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 4811 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4812 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4813 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 4814 dst76_r = __msa_ilvr_h(dst22, dst106); 4815 4816 LD2(src1_ptr, src2_stride, tp0, tp1); 4817 src1_ptr += 2 * src2_stride; 4818 INSERT_D2_SH(tp0, tp1, in0); 4819 LD2(src1_ptr, src2_stride, tp0, tp1); 4820 src1_ptr += 2 * src2_stride; 4821 INSERT_D2_SH(tp0, tp1, in1); 4822 4823 LD2(src1_ptr, src2_stride, tp0, tp1); 4824 src1_ptr += 2 * src2_stride; 4825 INSERT_D2_SH(tp0, tp1, in2); 4826 LD2(src1_ptr, src2_stride, tp0, tp1); 4827 src1_ptr += 2 * src2_stride; 4828 INSERT_D2_SH(tp0, tp1, in3); 4829 4830 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4831 const_vec, in0, in1, in2, in3); 4832 4833 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4834 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4835 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4836 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4837 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4838 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4839 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4840 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4841 4842 SRA_4V(dst0, dst1, dst2, dst3, 6); 4843 SRA_4V(dst4, dst5, dst6, dst7, 6); 4844 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 4845 tmp0, tmp1, tmp2, tmp3); 4846 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4847 tmp0, tmp1, tmp2, tmp3); 4848 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4849 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4850 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4851 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4852 dst += (8 * dst_stride); 4853 4854 dst10_r = dst98_r; 4855 dst21_r = dst109_r; 4856 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4857 } 4858} 4859 4860static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, 4861 int32_t src_stride, 4862 int16_t *src1_ptr, 4863 int32_t src2_stride, 4864 uint8_t *dst, 4865 int32_t dst_stride, 4866 const int8_t *filter_x, 4867 const int8_t *filter_y, 4868 int32_t height) 4869{ 4870 if (4 == height) { 4871 hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4872 dst, dst_stride, filter_x, filter_y, 2); 4873 } else { 4874 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, 4875 src2_stride, dst, dst_stride, filter_x, 4876 filter_y, height, 16); 4877 } 4878} 4879 4880static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, 4881 int32_t src_stride, 4882 int16_t *src1_ptr, 4883 int32_t src2_stride, 4884 uint8_t *dst, 4885 int32_t dst_stride, 4886 const int8_t *filter_x, 4887 const int8_t *filter_y, 4888 int32_t height) 4889{ 4890 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4891 dst, dst_stride, filter_x, filter_y, 4892 height, 24); 4893} 4894 4895static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, 4896 int32_t src_stride, 4897 int16_t *src1_ptr, 4898 int32_t 
src2_stride, 4899 uint8_t *dst, 4900 int32_t dst_stride, 4901 const int8_t *filter_x, 4902 const int8_t *filter_y, 4903 int32_t height) 4904{ 4905 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4906 dst, dst_stride, filter_x, filter_y, 4907 height, 32); 4908} 4909 4910#define BI_MC_COPY(WIDTH) \ 4911void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ 4912 ptrdiff_t dst_stride, \ 4913 uint8_t *src, \ 4914 ptrdiff_t src_stride, \ 4915 int16_t *src_16bit, \ 4916 int height, \ 4917 intptr_t mx, \ 4918 intptr_t my, \ 4919 int width) \ 4920{ \ 4921 hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \ 4922 dst, dst_stride, height); \ 4923} 4924 4925BI_MC_COPY(4); 4926BI_MC_COPY(6); 4927BI_MC_COPY(8); 4928BI_MC_COPY(12); 4929BI_MC_COPY(16); 4930BI_MC_COPY(24); 4931BI_MC_COPY(32); 4932BI_MC_COPY(48); 4933BI_MC_COPY(64); 4934 4935#undef BI_MC_COPY 4936 4937#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 4938void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ 4939 ptrdiff_t dst_stride, \ 4940 uint8_t *src, \ 4941 ptrdiff_t src_stride, \ 4942 int16_t *src_16bit, \ 4943 int height, \ 4944 intptr_t mx, \ 4945 intptr_t my, \ 4946 int width) \ 4947{ \ 4948 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 4949 \ 4950 hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 4951 MAX_PB_SIZE, dst, dst_stride, \ 4952 filter, height); \ 4953} 4954 4955BI_MC(qpel, h, 4, 8, hz, mx); 4956BI_MC(qpel, h, 8, 8, hz, mx); 4957BI_MC(qpel, h, 12, 8, hz, mx); 4958BI_MC(qpel, h, 16, 8, hz, mx); 4959BI_MC(qpel, h, 24, 8, hz, mx); 4960BI_MC(qpel, h, 32, 8, hz, mx); 4961BI_MC(qpel, h, 48, 8, hz, mx); 4962BI_MC(qpel, h, 64, 8, hz, mx); 4963 4964BI_MC(qpel, v, 4, 8, vt, my); 4965BI_MC(qpel, v, 8, 8, vt, my); 4966BI_MC(qpel, v, 12, 8, vt, my); 4967BI_MC(qpel, v, 16, 8, vt, my); 4968BI_MC(qpel, v, 24, 8, vt, my); 4969BI_MC(qpel, v, 32, 8, vt, my); 4970BI_MC(qpel, v, 48, 8, vt, my); 4971BI_MC(qpel, v, 64, 8, vt, my); 4972 4973BI_MC(epel, h, 4, 4, hz, mx); 4974BI_MC(epel, h, 8, 4, hz, mx); 4975BI_MC(epel, h, 6, 4, hz, mx); 4976BI_MC(epel, h, 12, 4, hz, mx); 4977BI_MC(epel, h, 16, 4, hz, mx); 4978BI_MC(epel, h, 24, 4, hz, mx); 4979BI_MC(epel, h, 32, 4, hz, mx); 4980 4981BI_MC(epel, v, 4, 4, vt, my); 4982BI_MC(epel, v, 8, 4, vt, my); 4983BI_MC(epel, v, 6, 4, vt, my); 4984BI_MC(epel, v, 12, 4, vt, my); 4985BI_MC(epel, v, 16, 4, vt, my); 4986BI_MC(epel, v, 24, 4, vt, my); 4987BI_MC(epel, v, 32, 4, vt, my); 4988 4989#undef BI_MC 4990 4991#define BI_MC_HV(PEL, WIDTH, TAP) \ 4992void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \ 4993 ptrdiff_t dst_stride, \ 4994 uint8_t *src, \ 4995 ptrdiff_t src_stride, \ 4996 int16_t *src_16bit, \ 4997 int height, \ 4998 intptr_t mx, \ 4999 intptr_t my, \ 5000 int width) \ 5001{ \ 5002 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 5003 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 5004 \ 5005 hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 5006 MAX_PB_SIZE, dst, dst_stride, \ 5007 filter_x, filter_y, height); \ 5008} 5009 5010BI_MC_HV(qpel, 4, 8); 5011BI_MC_HV(qpel, 8, 8); 5012BI_MC_HV(qpel, 12, 8); 5013BI_MC_HV(qpel, 16, 8); 5014BI_MC_HV(qpel, 24, 8); 5015BI_MC_HV(qpel, 32, 8); 5016BI_MC_HV(qpel, 48, 8); 5017BI_MC_HV(qpel, 64, 8); 5018 5019BI_MC_HV(epel, 4, 4); 5020BI_MC_HV(epel, 8, 4); 5021BI_MC_HV(epel, 6, 4); 5022BI_MC_HV(epel, 12, 4); 5023BI_MC_HV(epel, 16, 4); 5024BI_MC_HV(epel, 24, 4); 5025BI_MC_HV(epel, 32, 4); 5026 
5027#undef BI_MC_HV 5028
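/* For reference, every bi-prediction kernel above reduces per pixel (8-bit
 * case) to: sum the two 14-bit intermediate predictions, round and shift
 * right by 7, then clip to [0, 255]. A minimal scalar sketch of that step
 * (kept in a comment since it is not part of this file's build;
 * bi_round_clip is a hypothetical helper name, av_clip_uint8 is the
 * libavutil clamp):
 *
 *     static uint8_t bi_round_clip(int16_t pred0, int16_t pred1)
 *     {
 *         int sum = pred0 + pred1 + (1 << 6);  // rounding bias for >> 7
 *         return av_clip_uint8(sum >> 7);      // clamp to [0, 255]
 *     }
 *
 * This mirrors the ADDS/SRARI/CLIP sequence of HEVC_BI_RND_CLIP2, except
 * that the vector macros use saturating 16-bit additions. */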