/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * VP9 sub-pel motion-compensation filters, MIPS MSA SIMD implementation:
 * 8-tap horizontal, vertical and horizontal+vertical paths for block
 * widths 4..64, plus destination-averaging variants.
 *
 * Throughout this file the input pixels are XOR'ed with 128 so the
 * unsigned bytes can be fed to the *signed* dot-product instructions;
 * the bias is removed again when packing the results (PCKEV_XORI128_*).
 * Filter results are rounded with a 7-bit shift (SRARI_*, 7 — VP9's
 * FILTER_BITS) and saturated (SAT_*, 7) before packing back to bytes.
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp9dsp_mips.h"

/* VSHF byte-shuffle masks that gather the sliding 8-tap windows for each
 * output pixel. Offset 0 serves the 8/16-wide paths (one row per vector);
 * offsets 16 and 32 serve the 4-wide paths, where two source rows are
 * packed into one vector (indices >= 16 select from the second operand). */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

/* 2-tap bilinear coefficient pairs, one row per sub-pel position 1..15.
 * Not referenced in this part of the file; presumably consumed by the
 * bilinear MC paths further down — verify against the rest of the file. */
static const int8_t vp9_bilinear_filters_msa[15][2] = {
    {120, 8},
    {112, 16},
    {104, 24},
    {96, 32},
    {88, 40},
    {80, 48},
    {72, 56},
    {64, 64},
    {56, 72},
    {48, 80},
    {40, 88},
    {32, 96},
    {24, 104},
    {16, 112},
    {8, 120}
};

/* Core 8-tap filter: the 8 taps are applied as four 2-tap dot products
 * (dotp + dpadd pairs) accumulated into two halfword vectors, which are
 * then combined with a saturating add. Evaluates to the v8i16 result. */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                 \
                            filt0, filt1, filt2, filt3)             \
( {                                                                 \
    v8i16 tmp0, tmp1;                                               \
                                                                    \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);             \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);      \
    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);             \
    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);      \
    tmp0 = __msa_adds_s_h(tmp0, tmp1);                              \
                                                                    \
    tmp0;                                                           \
} )

/* Horizontally filter one row pair: shuffle (src0, src1) through the four
 * tap masks, run the 8-tap filter, then round (srari by 7) and saturate
 * (sat_s_h, 7). Evaluates to the filtered v8i16 row. */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,     \
                        filt_h0, filt_h1, filt_h2, filt_h3)         \
( {                                                                 \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                           \
    v8i16 hz_out_m;                                                 \
                                                                    \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,              \
               vec0_m, vec1_m, vec2_m, vec3_m);                     \
    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,  \
                                   filt_h0, filt_h1, filt_h2, filt_h3); \
                                                                    \
    hz_out_m = __msa_srari_h(hz_out_m, 7);                          \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                          \
                                                                    \
    hz_out_m;                                                       \
} )

/* Horizontal 8-tap filter for the 4-wide path: four input rows arrive as
 * two row-pairs (src0/src1 and src2/src3 shuffled together), producing
 * two v8i16 outputs of 8 pixels (= 2 rows of 4) each. No rounding here —
 * callers apply SRARI/SAT afterwards. */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,              \
                                   mask0, mask1, mask2, mask3,          \
                                   filt0, filt1, filt2, filt3,          \
                                   out0, out1)                          \
{                                                                       \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v8i16 res0_m, res1_m, res2_m, res3_m;                               \
                                                                        \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);   \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);          \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);   \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);         \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);   \
    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);          \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);   \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);         \
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);            \
}

/* Horizontal 8-tap filter for the 8-wide path: four independent rows
 * (each shuffled against itself), producing four v8i16 outputs of 8
 * pixels. Taps 0/2 use dotp, taps 1/3 accumulate with dpadd; the two
 * partial sums are combined with a saturating add. */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,              \
                                   mask0, mask1, mask2, mask3,          \
                                   filt0, filt1, filt2, filt3,          \
                                   out0, out1, out2, out3)              \
{                                                                       \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
                                                                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);   \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                res0_m, res1_m, res2_m, res3_m);                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);   \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                res4_m, res5_m, res6_m, res7_m);                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);   \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 res0_m, res1_m, res2_m, res3_m);                       \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);   \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 res4_m, res5_m, res6_m, res7_m);                       \
    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
                res7_m, out0, out1, out2, out3);                        \
}

/* Pack even bytes of in0/in1, undo the 128 sign bias (xori), average
 * with the existing dst pixels and store 16 bytes at pdst. */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)    \
{                                                       \
    v16u8 tmp_m;                                        \
                                                        \
    tmp_m = PCKEV_XORI128_UB(in1, in0);                 \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);         \
    ST_UB(tmp_m, (pdst));                               \
}

/* Pack even bytes of in0/in1 (no bias removal), average with dst and
 * store 16 bytes at pdst. */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                    \
{                                                               \
    v16u8 tmp_m;                                                \
                                                                \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);    \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);                 \
    ST_UB(tmp_m, (pdst));                                       \
}

/* Pack four halfword vectors into two byte vectors, average with
 * dst0/dst1 and store as four 8-byte rows at pdst with the given stride. */
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1,          \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m;                                           \
    uint8_t *pdst_m = (uint8_t *) (pdst);                           \
                                                                    \
    PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);                \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);        \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);              \
}

/* Horizontal 8-tap filter, 4x4 block. */
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    /* 4-wide shuffle masks (offset 16 in the table) */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;  /* back up to the first of the 8 taps */

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);  /* bias for signed dot products */
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 7);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

/* Horizontal 8-tap filter, 4x8 block (two 4x4 passes). */
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 7);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

/* Horizontal 8-tap filter, width 4 — dispatch on height (only 4 and 8
 * are handled; other heights are silently ignored). */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    }
}

/* Horizontal 8-tap filter, 8x4 block. */
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    /* 8-wide shuffle masks (offset 0 in the table) */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 7);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
}

/* Horizontal 8-tap filter, width 8, height a multiple of 4. */
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

/* Horizontal 8-tap filter, width 8 — dispatch on height. */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}

/* Horizontal 8-tap filter, width 16: each row is processed as two
 * overlapping 16-byte loads (offsets 0 and 8), two rows per iteration. */
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

/* Horizontal 8-tap filter, width 32: three loads + one sldi build the
 * four 8-wide lanes of a row; two rows per iteration, with the next row's
 * loads issued before the current row's stores to hide latency. */
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);  /* bytes 8..23 of the row */
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        /* prefetch next row while the first row's results are stored */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

/* Horizontal 8-tap filter, width 64: one row per iteration, processed as
 * two 32-pixel halves (offsets 0 and 32). */
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        /* left half: pixels 0..31 */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);

        /* right half: pixels 32..63 */
        src0 = LD_SB(src + 32);
        src2 = LD_SB(src + 48);
        src3 = LD_SB(src + 56);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

/* Vertical 8-tap filter, width 4: keeps a 7-row history interleaved into
 * double-row vectors (srcNNMM names = ilvr of row pairs); 4 new rows are
 * consumed and 4 output rows produced per iteration. */
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v16u8 out;
    v8i16 filt, out10, out32;

    src -= (3 * src_stride);  /* back up 3 rows for the first taps */

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        XORI_B2_128_SB(src8776, src10998);
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        SRARI_H2_SH(out10, out32, 7);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the 7-row history window forward by 4 rows */
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

/* Vertical 8-tap filter, width 8: history kept as right-interleaved
 * (ilvr) row pairs; 4 output rows per iteration. */
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* advance the history by 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

/* Vertical 8-tap filter, width 16: like the 8-wide path but the left
 * (ilvl) halves of each row pair are filtered as well. */
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);  /* remove the 128 bias */
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

/* Vertical 8-tap filter for widths that are multiples of 16: runs the
 * 16-wide algorithm over (width >> 4) column strips. */
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {  /* one 16-pixel column strip */
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

/* Vertical 8-tap filter, width 32: two 16-wide strips. */
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

/* Vertical 8-tap filter, width 64: four 16-wide strips. */
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

/* Combined horizontal + vertical 8-tap filter, width 4: rows are first
 * horizontally filtered (two rows per HORIZ_8TAP_FILT for the 4-wide
 * case), then the intermediate rows are interleaved (ilvev) and run
 * through the vertical 8-tap filter. */
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= (3 + 3 * src_stride);  /* 3 columns and 3 rows of filter context */

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    /* odd intermediate rows are recovered by shifting the even results */
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the vertical-filter history forward */
        hz_out5 = hz_out9;
        out0 = out2;
        out1 = out3;
        out2 = out4;
    }
}

/* Combined horizontal + vertical 8-tap filter, width 8: one horizontal
 * pass per row, intermediate rows interleaved (ilvev) for the vertical
 * pass; 4 output rows per iteration. */
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= (3 + 3 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);

        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
                                   filt_vt1, filt_vt2, filt_vt3);

        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the vertical-filter history forward */
        hz_out6 = hz_out10;
        out0 = out2;
        out1 = out3;
        out2 = out8;
        out4 = out6;
        out5 = out7;
        out6 = out9;
    }
}

/* Combined H+V 8-tap filter, width 16: two 8-wide strips. */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                                 filter_vert, height);

        src += 8;
        dst += 8;
    }
}

/* Combined H+V 8-tap filter, width 32: four 8-wide strips. */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                                 filter_vert, height);

        src += 8;
        dst += 8;
    }
}

/* Combined H+V 8-tap filter, width 64: eight 8-wide strips. */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 8; multiple8_cnt--;) {
        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                                 filter_vert, height);

        src += 8;
        dst += 8;
    }
}

/* Horizontal 8-tap filter, 4x4 block, averaged with the existing dst
 * pixels (aver_u_b) before storing. */
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst0, res;
    v16u8 mask0, mask1, mask2, mask3;
    v8i16 filt, res0, res1;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, res0, res1);
    /* gather the four 4-byte dst rows into one vector for averaging */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    SRARI_H2_SH(res0, res1, 7);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    res = (v16u8) __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}

/* Horizontal 8-tap filter, 4x8 block, averaged with dst. */
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
    v16u8 dst0, dst1;
    v8i16 filt, vec0, vec1, vec2, vec3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
    SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
    SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                res0, res1, res2, res3);
    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
    XORI_B2_128_UB(res0, res2);
    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
    ST_W8(res0, res2, 0,
1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1064} 1065 1066static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, 1067 int32_t src_stride, 1068 uint8_t *dst, int32_t dst_stride, 1069 const int8_t *filter, 1070 int32_t height) 1071{ 1072 if (4 == height) { 1073 common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, 1074 filter); 1075 } else if (8 == height) { 1076 common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, 1077 filter); 1078 } 1079} 1080 1081static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, 1082 int32_t src_stride, 1083 uint8_t *dst, int32_t dst_stride, 1084 const int8_t *filter, 1085 int32_t height) 1086{ 1087 int32_t loop_cnt; 1088 int64_t tp0, tp1, tp2, tp3; 1089 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 1090 v16u8 mask0, mask1, mask2, mask3, dst0, dst1; 1091 v8i16 filt, out0, out1, out2, out3; 1092 1093 mask0 = LD_UB(&mc_filt_mask_arr[0]); 1094 src -= 3; 1095 1096 /* rearranging filter */ 1097 filt = LD_SH(filter); 1098 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1099 1100 mask1 = mask0 + 2; 1101 mask2 = mask0 + 4; 1102 mask3 = mask0 + 6; 1103 1104 for (loop_cnt = (height >> 2); loop_cnt--;) { 1105 LD_SB4(src, src_stride, src0, src1, src2, src3); 1106 XORI_B4_128_SB(src0, src1, src2, src3); 1107 src += (4 * src_stride); 1108 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 1109 mask3, filt0, filt1, filt2, filt3, out0, 1110 out1, out2, out3); 1111 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 1112 INSERT_D2_UB(tp0, tp1, dst0); 1113 INSERT_D2_UB(tp2, tp3, dst1); 1114 SRARI_H4_SH(out0, out1, out2, out3, 7); 1115 SAT_SH4_SH(out0, out1, out2, out3, 7); 1116 CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, 1117 dst, dst_stride); 1118 dst += (4 * dst_stride); 1119 } 1120} 1121 1122static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, 1123 int32_t src_stride, 1124 uint8_t *dst, int32_t dst_stride, 1125 const int8_t *filter, 1126 int32_t 
height) 1127{ 1128 int32_t loop_cnt; 1129 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 1130 v16u8 mask0, mask1, mask2, mask3, dst0, dst1; 1131 v8i16 filt, out0, out1, out2, out3; 1132 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1133 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1134 1135 mask0 = LD_UB(&mc_filt_mask_arr[0]); 1136 src -= 3; 1137 1138 /* rearranging filter */ 1139 filt = LD_SH(filter); 1140 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1141 1142 mask1 = mask0 + 2; 1143 mask2 = mask0 + 4; 1144 mask3 = mask0 + 6; 1145 1146 for (loop_cnt = height >> 1; loop_cnt--;) { 1147 LD_SB2(src, src_stride, src0, src2); 1148 LD_SB2(src + 8, src_stride, src1, src3); 1149 src += (2 * src_stride); 1150 1151 XORI_B4_128_SB(src0, src1, src2, src3); 1152 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, 1153 vec12); 1154 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, 1155 vec13); 1156 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, 1157 vec14); 1158 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, 1159 vec15); 1160 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, 1161 vec1, vec2, vec3); 1162 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, 1163 vec9, vec10, vec11); 1164 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, 1165 vec1, vec2, vec3); 1166 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, 1167 vec8, vec9, vec10, vec11); 1168 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, 1169 out1, out2, out3); 1170 LD_UB2(dst, dst_stride, dst0, dst1); 1171 SRARI_H4_SH(out0, out1, out2, out3, 7); 1172 SAT_SH4_SH(out0, out1, out2, out3, 7); 1173 PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); 1174 dst += dst_stride; 1175 PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); 1176 dst += dst_stride; 1177 } 1178} 1179 1180static void 
common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, 1181 int32_t src_stride, 1182 uint8_t *dst, int32_t dst_stride, 1183 const int8_t *filter, 1184 int32_t height) 1185{ 1186 uint32_t loop_cnt; 1187 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 1188 v16u8 dst1, dst2, mask0, mask1, mask2, mask3; 1189 v8i16 filt, out0, out1, out2, out3; 1190 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1191 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1192 1193 mask0 = LD_UB(&mc_filt_mask_arr[0]); 1194 src -= 3; 1195 1196 /* rearranging filter */ 1197 filt = LD_SH(filter); 1198 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1199 1200 mask1 = mask0 + 2; 1201 mask2 = mask0 + 4; 1202 mask3 = mask0 + 6; 1203 1204 for (loop_cnt = height; loop_cnt--;) { 1205 src0 = LD_SB(src); 1206 src2 = LD_SB(src + 16); 1207 src3 = LD_SB(src + 24); 1208 src1 = __msa_sldi_b(src2, src0, 8); 1209 src += src_stride; 1210 1211 XORI_B4_128_SB(src0, src1, src2, src3); 1212 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, 1213 vec12); 1214 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, 1215 vec13); 1216 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, 1217 vec14); 1218 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, 1219 vec15); 1220 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, 1221 vec1, vec2, vec3); 1222 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, 1223 vec9, vec10, vec11); 1224 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, 1225 vec1, vec2, vec3); 1226 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, 1227 vec8, vec9, vec10, vec11); 1228 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, 1229 out1, out2, out3); 1230 SRARI_H4_SH(out0, out1, out2, out3, 7); 1231 SAT_SH4_SH(out0, out1, out2, out3, 7); 1232 LD_UB2(dst, 16, dst1, dst2); 1233 PCKEV_XORI128_AVG_ST_UB(out1, 
out0, dst1, dst); 1234 PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); 1235 dst += dst_stride; 1236 } 1237} 1238 1239static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, 1240 int32_t src_stride, 1241 uint8_t *dst, int32_t dst_stride, 1242 const int8_t *filter, 1243 int32_t height) 1244{ 1245 uint32_t loop_cnt, cnt; 1246 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 1247 v16u8 dst1, dst2, mask0, mask1, mask2, mask3; 1248 v8i16 filt, out0, out1, out2, out3; 1249 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1250 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1251 1252 mask0 = LD_UB(&mc_filt_mask_arr[0]); 1253 src -= 3; 1254 1255 /* rearranging filter */ 1256 filt = LD_SH(filter); 1257 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1258 1259 mask1 = mask0 + 2; 1260 mask2 = mask0 + 4; 1261 mask3 = mask0 + 6; 1262 1263 for (loop_cnt = height; loop_cnt--;) { 1264 for (cnt = 0; cnt < 2; ++cnt) { 1265 src0 = LD_SB(&src[cnt << 5]); 1266 src2 = LD_SB(&src[16 + (cnt << 5)]); 1267 src3 = LD_SB(&src[24 + (cnt << 5)]); 1268 src1 = __msa_sldi_b(src2, src0, 8); 1269 1270 XORI_B4_128_SB(src0, src1, src2, src3); 1271 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, 1272 vec12); 1273 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, 1274 vec13); 1275 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, 1276 vec10, vec14); 1277 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, 1278 vec11, vec15); 1279 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1280 vec0, vec1, vec2, vec3); 1281 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, 1282 vec8, vec9, vec10, vec11); 1283 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, 1284 vec0, vec1, vec2, vec3); 1285 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, 1286 vec8, vec9, vec10, vec11); 1287 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, 
vec11, out0, 1288 out1, out2, out3); 1289 SRARI_H4_SH(out0, out1, out2, out3, 7); 1290 SAT_SH4_SH(out0, out1, out2, out3, 7); 1291 LD_UB2(&dst[cnt << 5], 16, dst1, dst2); 1292 PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); 1293 PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); 1294 } 1295 1296 src += src_stride; 1297 dst += dst_stride; 1298 } 1299} 1300 1301static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, 1302 int32_t src_stride, 1303 uint8_t *dst, int32_t dst_stride, 1304 const int8_t *filter, 1305 int32_t height) 1306{ 1307 uint32_t loop_cnt; 1308 uint32_t tp0, tp1, tp2, tp3; 1309 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1310 v16u8 dst0, out; 1311 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 1312 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; 1313 v16i8 src10998, filt0, filt1, filt2, filt3; 1314 v8i16 filt, out10, out32; 1315 1316 src -= (3 * src_stride); 1317 1318 filt = LD_SH(filter); 1319 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1320 1321 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1322 src += (7 * src_stride); 1323 1324 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, 1325 src54_r, src21_r); 1326 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1327 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, 1328 src4332, src6554); 1329 XORI_B3_128_SB(src2110, src4332, src6554); 1330 1331 for (loop_cnt = (height >> 2); loop_cnt--;) { 1332 LD_SB4(src, src_stride, src7, src8, src9, src10); 1333 src += (4 * src_stride); 1334 1335 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 1336 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 1337 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 1338 src87_r, src98_r, src109_r); 1339 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); 1340 XORI_B2_128_SB(src8776, src10998); 1341 out10 = 
FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        /* Round (>> 7), saturate, pack, average with dst rows, store. */
        SRARI_H2_SH(out10, out32, 7);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        out = __msa_aver_u_b(out, dst0);

        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the vertical-filter history down by 4 rows. */
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

/* 8-tap vertical filter, 8 columns wide, averaged with dst;
   4 output rows per iteration over an interleaved 7-row history. */
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 dst0, dst1;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
                                dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

/* 8-tap vertical filter + average with dst for widths that are a
   multiple of 16: outer loop walks 16-column strips, inner loop keeps
   separate right (_r) and left (_l) interleaved histories per strip. */
static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter,
                                                   int32_t height,
                                                   int32_t width)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 filt0, filt1, filt2, filt3;
    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
            XORI_B4_128_SB(src7, src8, src9, src10);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            /* Four output rows, low and high byte lanes filtered apart. */
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                        dst0, dst1, dst2, dst3);
            ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* Slide both histories down by 4 rows. */
            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

/* 16-wide vertical filter + average: single 16-column strip. */
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, 16);
}

/* 32-wide vertical filter + average: two 16-column strips. */
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, 32);
}

/* 64-wide vertical filter + average: four 16-column strips. */
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, 64);
}

/* 8-tap horizontal + 8-tap vertical filter, 4 columns wide, result
   averaged with dst; 4 output rows per loop iteration. */
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt;
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 dst0, res, mask0, mask1, mask2, mask3;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9,
res0, res1, vec0, vec1, vec2, vec3, vec4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* Step back 3 columns and 3 rows to center both 8-tap windows. */
    src -= (3 + 3 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* Prime: horizontally filter the first 7 rows (two rows per call
       for the 4-wide case). */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        /* Round, saturate, pack and average with the destination rows. */
        SRARI_H2_SH(res0, res1, 7);
        SAT_SH2_SH(res0, res1, 7);
        res = PCKEV_XORI128_UB(res0, res1);
        res = (v16u8) __msa_aver_u_b(res, dst0);
        ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the vertical window down by 4 rows. */
        hz_out5 = hz_out9;
        vec0 = vec2;
        vec1 = vec3;
        vec2 = vec4;
    }
}

/* 8-tap horizontal + 8-tap vertical filter, 8 columns wide, result
   averaged with dst; 4 output rows per loop iteration. */
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= (3 + 3 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* Prime: horizontally filter the first 7 rows, one row per call. */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    /* Interleave consecutive horizontal results for the vertical pass. */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);

        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1,
                                dst, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the vertical window down by 4 rows. */
        hz_out6 = hz_out10;
        out0 = out2;
        out1 = out3;
        out2 = out8;
        out4 = out6;
        out5 = out7;
        out6 = out9;
    }
}

/* 16-wide H+V filter with averaging: two 8-wide column strips. */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

        src += 8;
        dst += 8;
    }
}

/* 32-wide H+V filter with averaging: four 8-wide column strips. */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

        src += 8;
        dst += 8;
    }
}

/* 64-wide H+V filter with averaging: eight 8-wide column strips. */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t
*src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 8; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

        src += 8;
        dst += 8;
    }
}

/* Bilinear (2-tap) horizontal filter for a 4x4 block. */
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, vec0, vec1, res0, res1;
    v8u16 vec2, vec3, filt;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    /* Both taps fit in one splatted halfword (unsigned dot product). */
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    SRARI_H2_UH(vec2, vec3, 7); /* round: >> 7 */
    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}

/* Bilinear (2-tap) horizontal filter for a 4x8 block. */
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 vec0, vec1, vec2, vec3, filt0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16i8 res0, res1, res2, res3;
    v8u16 vec4, vec5, vec6, vec7, filt;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec4, vec5, vec6, vec7);
    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                res0, res1, res2, res3);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}

/* Public entry: 4-wide horizontal bilinear put.
 * mx selects the tap pair from vp9_bilinear_filters_msa (1-based). */
void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
{
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];

    if (4 == height) {
        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    }
}

/* Bilinear (2-tap) horizontal filter for an 8x4 block. */
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 filt0;
    v16i8 src0, src1, src2, src3, mask;
    v8u16 vec0, vec1, vec2, vec3, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
    ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
}

/* Bilinear (2-tap) horizontal filter, 8 columns wide, for heights that
   are a multiple of 8 (explicit 16-row tail below). */
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    v16u8 filt0;
    v16i8 src0, src1, src2, src3, mask, out0, out1;
    v8u16 vec0, vec1, vec2, vec3, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    if (16 == height) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    vec0, vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    vec0, vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2,
vec3, 7); 1941 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1942 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1943 } 1944} 1945 1946void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1947 const uint8_t *src, ptrdiff_t src_stride, 1948 int height, int mx, int my) 1949{ 1950 const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; 1951 1952 if (4 == height) { 1953 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); 1954 } else { 1955 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, 1956 height); 1957 } 1958} 1959 1960void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1961 const uint8_t *src, ptrdiff_t src_stride, 1962 int height, int mx, int my) 1963{ 1964 uint32_t loop_cnt; 1965 const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; 1966 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 1967 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1968 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; 1969 1970 mask = LD_SB(&mc_filt_mask_arr[0]); 1971 1972 loop_cnt = (height >> 2) - 1; 1973 1974 /* rearranging filter */ 1975 filt = LD_UH(filter); 1976 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1977 1978 LD_SB4(src, src_stride, src0, src2, src4, src6); 1979 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 1980 src += (4 * src_stride); 1981 1982 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 1983 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 1984 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 1985 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 1986 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1987 out0, out1, out2, out3); 1988 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 1989 out4, out5, out6, out7); 1990 SRARI_H4_UH(out0, out1, out2, out3, 7); 1991 SRARI_H4_UH(out4, out5, out6, out7, 7); 1992 PCKEV_ST_SB(out0, out1, dst); 1993 dst += dst_stride; 1994 PCKEV_ST_SB(out2, 
out3, dst); 1995 dst += dst_stride; 1996 PCKEV_ST_SB(out4, out5, dst); 1997 dst += dst_stride; 1998 PCKEV_ST_SB(out6, out7, dst); 1999 dst += dst_stride; 2000 2001 for (; loop_cnt--;) { 2002 LD_SB4(src, src_stride, src0, src2, src4, src6); 2003 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 2004 src += (4 * src_stride); 2005 2006 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 2007 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 2008 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 2009 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 2010 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 2011 out0, out1, out2, out3); 2012 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 2013 out4, out5, out6, out7); 2014 SRARI_H4_UH(out0, out1, out2, out3, 7); 2015 SRARI_H4_UH(out4, out5, out6, out7, 7); 2016 PCKEV_ST_SB(out0, out1, dst); 2017 dst += dst_stride; 2018 PCKEV_ST_SB(out2, out3, dst); 2019 dst += dst_stride; 2020 PCKEV_ST_SB(out4, out5, dst); 2021 dst += dst_stride; 2022 PCKEV_ST_SB(out6, out7, dst); 2023 dst += dst_stride; 2024 } 2025} 2026 2027void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, 2028 const uint8_t *src, ptrdiff_t src_stride, 2029 int height, int mx, int my) 2030{ 2031 uint32_t loop_cnt; 2032 const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; 2033 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 2034 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2035 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; 2036 2037 mask = LD_SB(&mc_filt_mask_arr[0]); 2038 2039 /* rearranging filter */ 2040 filt = LD_UH(filter); 2041 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 2042 2043 for (loop_cnt = height >> 1; loop_cnt--;) { 2044 src0 = LD_SB(src); 2045 src2 = LD_SB(src + 16); 2046 src3 = LD_SB(src + 24); 2047 src1 = __msa_sldi_b(src2, src0, 8); 2048 src += src_stride; 2049 src4 = LD_SB(src); 2050 src6 = LD_SB(src + 16); 2051 src7 = 
LD_SB(src + 24); 2052 src5 = __msa_sldi_b(src6, src4, 8); 2053 src += src_stride; 2054 2055 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 2056 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 2057 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 2058 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 2059 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 2060 out0, out1, out2, out3); 2061 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 2062 out4, out5, out6, out7); 2063 SRARI_H4_UH(out0, out1, out2, out3, 7); 2064 SRARI_H4_UH(out4, out5, out6, out7, 7); 2065 PCKEV_ST_SB(out0, out1, dst); 2066 PCKEV_ST_SB(out2, out3, dst + 16); 2067 dst += dst_stride; 2068 PCKEV_ST_SB(out4, out5, dst); 2069 PCKEV_ST_SB(out6, out7, dst + 16); 2070 dst += dst_stride; 2071 } 2072} 2073 2074void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, 2075 const uint8_t *src, ptrdiff_t src_stride, 2076 int height, int mx, int my) 2077{ 2078 uint32_t loop_cnt; 2079 const int8_t *filter = vp9_bilinear_filters_msa[mx - 1]; 2080 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 2081 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2082 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; 2083 2084 mask = LD_SB(&mc_filt_mask_arr[0]); 2085 2086 /* rearranging filter */ 2087 filt = LD_UH(filter); 2088 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 2089 2090 for (loop_cnt = height; loop_cnt--;) { 2091 src0 = LD_SB(src); 2092 src2 = LD_SB(src + 16); 2093 src4 = LD_SB(src + 32); 2094 src6 = LD_SB(src + 48); 2095 src7 = LD_SB(src + 56); 2096 SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5); 2097 src += src_stride; 2098 2099 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 2100 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 2101 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 2102 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        SRARI_H4_UH(out0, out1, out2, out3, 7);
        SRARI_H4_UH(out4, out5, out6, out7, 7);
        PCKEV_ST_SB(out0, out1, dst);
        PCKEV_ST_SB(out2, out3, dst + 16);
        PCKEV_ST_SB(out4, out5, dst + 32);
        PCKEV_ST_SB(out6, out7, dst + 48);
        dst += dst_stride;
    }
}

/* 4x4 vertical 2-tap (bilinear) filter: interleaves consecutive rows
 * (ILVR_B4) so the dot product filters along the vertical axis. */
static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v16u8 filt0;
    v8i16 filt;
    v8u16 tmp0, tmp1;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    /* 5 rows in: 4 output rows need one row of lookahead */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, 7);
    SAT_UH2_UH(tmp0, tmp1, 7);
    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
}

/* 4x8 vertical 2-tap (bilinear) filter */
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v16u8 filt0;
    v8i16 filt;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    src8 = LD_SB(src);
    src += src_stride;

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src87_r, src76_r, src2110, src4332, src6554, src8776);
    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}

/* dispatch: 4-wide vertical bilinear put (height 4 or 8 only) */
void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
{
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];

    if (4 == height) {
        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    }
}

/* 8x4 vertical 2-tap (bilinear) filter */
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
    v16i8 out0, out1;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}

/* 8-wide vertical 2-tap filter, 8 rows per loop iteration; src8 is
 * carried into the next iteration as the new src0. */
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v16i8 out0, out1;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   vec0, vec1, vec2, vec3);
        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
                   vec4, vec5, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
        dst += (8 * dst_stride);

        src0 = src8;
    }
}

/* dispatch: 8-wide vertical bilinear put */
void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
{
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];

    if (4 == height) {
        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}

/* 16-wide vertical bilinear put; ILVR/ILVL split each 16-byte row
 * pair into low/high halves for the widening dot product. */
void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        dst += dst_stride;

        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst);
        dst += dst_stride;

        src0 = src4;
    }
}

/* 32-wide vertical bilinear put: processed as two 16-wide columns,
 * with src0/src5 carried between iterations as row history. */
void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    src0 = LD_UB(src);
    src5 = LD_UB(src + 16);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);

        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
        src += (4 * src_stride);

        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);

        /* right 16-wide column */
        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 16);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);

        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
        src5 = src9;
    }
}

/* 64-wide vertical bilinear put: four 16-wide columns, two rows per
 * iteration; src0/src3/src6/src9 carry the previous row per column. */
void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_UB4(src, 16, src0, src3, src6, src9);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        LD_UB2(src + 16, src_stride, src4, src5);
        LD_UB2(src + 32, src_stride, src7, src8);
        LD_UB2(src + 48, src_stride, src10, src11);
        src += (2 * src_stride);

        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_ST_SB(tmp4, tmp5, dst + 16);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);

        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 32);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);

        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_ST_SB(tmp4, tmp5, dst + 48);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
        dst += (2 * dst_stride);

        src0 = src2;
        src3 = src5;
        src6 = src8;
        src9 = src11;
    }
}

/* 4x4 horizontal+vertical 2-tap (bilinear) filter: horizontal pass
 * produces intermediate rows hz_out*, vertical pass filters between
 * consecutive intermediates. */
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz, const int8_t *filter_vert)
{
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    /* odd intermediates are recovered by shifting/packing the even ones */
    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);

    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, 7);
    SAT_UH2_UH(tmp0, tmp1, 7);
    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}

/* 4x8 horizontal+vertical 2-tap (bilinear) filter */
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz, const int8_t *filter_vert)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
    v16i8 res0, res1, res2, res3;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_SB(src);

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
    /* odd intermediates recovered from the even ones by byte shifts */
    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
               hz_out3, hz_out5);
    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);

    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
                vec4, vec5, vec6, vec7);
    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                res0, res1, res2, res3);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}

/* dispatch: 4-wide horizontal+vertical bilinear put (height 4 or 8) */
void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];

    if (4 == height) {
        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert);
    } else if (8 == height) {
        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert);
    }
}

/* 8x4 horizontal+vertical 2-tap filter; hz_out0/hz_out1 ping-pong as
 * the vertical filter's row history. */
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz, const int8_t *filter_vert)
{
    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp1 = __msa_dotp_u_h(vec1, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp2 = __msa_dotp_u_h(vec2, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp3 = __msa_dotp_u_h(vec3, filt_vt);

    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}

/* 8-wide horizontal+vertical 2-tap filter, 8 rows per iteration;
 * the last horizontal intermediate is carried across iterations. */
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride,
                                          const int8_t *filter_horiz, const int8_t *filter_vert,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
    v16u8 filt_hz, filt_vt, vec0;
    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    v8i16 filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    src0 = LD_SB(src);
    src += src_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp1 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp2 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);

        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp3 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        /* prefetch the next 4 rows while the current ones are filtered */
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp4 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H2_UH(tmp3, tmp4, 7);
        SAT_UH2_UH(tmp3, tmp4, 7);
        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp5 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp6 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp7 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp8 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
        dst += (8 * dst_stride);
    }
}

/* dispatch: 8-wide horizontal+vertical bilinear put */
void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];

    if (4 == height) {
        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert);
    } else {
        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                      filter_horiz, filter_vert, height);
    }
}

/* 16-wide horizontal+vertical bilinear put; hz_out0/hz_out2 hold the
 * previous row's horizontal results for the two 8-pixel halves. */
void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1;
    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    LD_SB2(src, 8, src0, src1);
    src += src_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);


    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;

        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;

        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;
    }
}

/* 32-wide hv put: two independent 16-wide passes */
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);

        src += 16;
        dst += 16;
    }
}

/* 64-wide hv put: four independent 16-wide passes */
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);

        src += 16;
        dst += 16;
    }
}

/* 4x4 horizontal 2-tap filter + average with existing dst contents
 * (the "avg" motion-compensation variant). */
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, dst0, vec0, vec1, res;
    v8u16 vec2, vec3, filt;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* gather the 4 destination rows into one vector for averaging */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    SRARI_H2_UH(vec2, vec3, 7);

    res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
    res = (v16u8) __msa_aver_u_b(res, dst0);

    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}

/* 4x8 horizontal 2-tap filter + average with dst */
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v16u8 dst0, dst1;
    v8u16 vec4, vec5, vec6, vec7, filt;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
                vec6, vec7);
    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
    PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
                res2, res3);
    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}

/* dispatch: 4-wide horizontal bilinear avg (height 4 or 8 only) */
void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
{
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];

    if (4 == height) {
        common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                          filter);
    } else if (8 == height) {
        common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                          filter);
    }
}

/* 8x4 horizontal 2-tap filter + average with dst */
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    int64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, dst0, dst1;
    v8u16 vec0, vec1, vec2, vec3, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
}

static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter,
                                                  int32_t height)
{
    int64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, dst0, dst1;
    v8u16 vec0, vec1, vec2, vec3, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* first 4 rows */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    /* prefetch next 4 source rows before storing */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* rows 4-7 */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* optional second 8-row pass for 8x16 */
    if (16 == height) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                    vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                    vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
    }
}

/* 8-wide horizontal bilinear "avg" entry point; dispatches on height. */
void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
{
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];

    if (4 == height) {
        common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                          filter);
    } else {
        common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                              filter, height);
    }
}

/* 16-wide horizontal bilinear filter, averaged with dst; processes 4 rows
 * per iteration (first iteration peeled before the loop). */
void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* even srcN = left half of a row, odd srcN = bytes 8.. of the same row */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, 7);
    SRARI_H4_UH(res4, res5, res6, res7, 7);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
    dst += dst_stride;

    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
                    res5, res6, res7);
        SRARI_H4_UH(res0, res1, res2, res3, 7);
        SRARI_H4_UH(res4, res5, res6, res7, 7);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
        dst += dst_stride;
    }
}

/* 32-wide horizontal bilinear filter, averaged with dst; 2 rows per
 * iteration, each row handled as two 16-byte halves (sldi builds the
 * unaligned middle vector). */
void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;
        src4 = LD_SB(src);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src5 = __msa_sldi_b(src6, src4, 8);
        src += src_stride;

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    res0, res1, res2, res3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    res4, res5, res6, res7);
        SRARI_H4_UH(res0, res1, res2, res3, 7);
        SRARI_H4_UH(res4, res5, res6, res7, 7);
        LD_UB2(dst, 16, dst0, dst1);
        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
        PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
        dst += dst_stride;
        LD_UB2(dst, 16, dst2, dst3);
        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
        PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
        dst += dst_stride;
    }
}

/* 64-wide horizontal bilinear filter, averaged with dst; one row per
 * iteration. */
void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    for (loop_cnt = height; loop_cnt--;) {
        /* even vectors at 16-byte offsets; odd ones built by byte-shifting */
        LD_SB4(src, 16, src0, src2, src4, src6);
        src7 = LD_SB(src + 56);
        SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
        src += src_stride;

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        SRARI_H4_UH(out0, out1, out2, out3, 7);
        SRARI_H4_UH(out4, out5, out6, out7, 7);
        LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
        PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
        PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
        PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
        PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
        dst += dst_stride;
    }
}

/* Vertical 2-tap bilinear filter on a 4x4 block, averaged with dst.
 * Needs 5 input rows to produce 4 output rows. */
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 dst0, out, filt0, src2110, src4332;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt;
    v8u16 tmp0, tmp1;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    src4 = LD_SB(src);
    src += src_stride;

    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    /* interleave adjacent rows so one dot product covers both filter taps */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, 7);
    SAT_UH2_UH(tmp0, tmp1, 7);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    out = __msa_aver_u_b(out, dst0);

    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

/* Vertical 2-tap bilinear filter on a 4x8 block, averaged with dst.
 * Needs 9 input rows to produce 8 output rows. */
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0, dst1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16u8 src2110, src4332, src6554, src8776, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_SB(src);

    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src87_r, src76_r, src2110, src4332, src6554, src8776);
    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
    AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}

/* 4-wide vertical bilinear "avg" entry point; dispatches on height. */
void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
{
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];

    if (4 == height) {
        common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                          filter);
    } else if (8 == height) {
        common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                          filter);
    }
}

/* Vertical 2-tap bilinear filter on an 8x4 block, averaged with dst. */
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter)
{
    int64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}

/* Vertical 2-tap bilinear filter on 8-wide blocks, 8 rows per iteration,
 * averaged with dst; the last row of each pass is carried into the next. */
static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter,
                                                  int32_t height)
{
    uint32_t loop_cnt;
    int64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        /* dst rows 0-3 / 4-7 gathered pairwise into 128-bit vectors */
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst2);
        INSERT_D2_UB(tp2, tp3, dst3);

        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   vec0, vec1, vec2, vec3);
        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
                   vec4, vec5, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);

        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry last row to seed the next 8-row group */
        src0 = src8;
    }
}

/* 8-wide vertical bilinear "avg" entry point; dispatches on height. */
void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
{
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];

    if (4 == height) {
        common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                          filter);
    } else {
        common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                              filter, height);
    }
}

/* 16-wide vertical bilinear filter, averaged with dst; 4 rows per
 * iteration with the last row carried over. */
void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 tmp0, tmp1, tmp2, tmp3, filt;

    /* rearranging filter_y */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* right/left interleaves give low and high 8 pixels of each row pair */
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
        dst += dst_stride;

        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
        dst += dst_stride;

        src0 = src4;
    }
}

/* 32-wide vertical bilinear filter, averaged with dst; 4 rows per
 * iteration, handled as two 16-wide columns (src0/src5 carry the previous
 * row of each column). */
void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3, filt;

    /* rearranging filter_y */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_UB2(src, 16, src0, src5);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);

        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
        LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
        src += (4 * src_stride);

        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);

        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);

        /* right 16-wide column */
        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);

        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
        dst += (4 * dst_stride);

        /* carry the last row of each column into the next iteration */
        src0 = src4;
        src5 = src9;
    }
}

/* 64-wide vertical bilinear filter, averaged with dst; 2 rows per
 * iteration, handled as four 16-wide columns (src0/src3/src6/src9 carry
 * the previous row of each column). */
void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5;
    v16u8 src6, src7, src8, src9, src10, src11, filt0;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8u16 filt;

    /* rearranging filter_y */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_UB4(src, 16, src0, src3, src6, src9);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        LD_UB2(dst, dst_stride, dst0, dst1);
        LD_UB2(src + 16, src_stride, src4, src5);
        LD_UB2(dst + 16, dst_stride, dst2, dst3);
        LD_UB2(src + 32, src_stride, src7, src8);
        LD_UB2(dst + 32, dst_stride, dst4, dst5);
        LD_UB2(src + 48, src_stride, src10, src11);
        LD_UB2(dst + 48, dst_stride, dst6, dst7);
        src += (2 * src_stride);

        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);

        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);

        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);

        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
        dst += (2 * dst_stride);

        /* carry the last row of each of the four columns */
        src0 = src2;
        src3 = src5;
        src6 = src8;
        src9 = src11;
    }
}

/* Combined horizontal + vertical 2-tap bilinear filter on a 4x4 block,
 * averaged with dst; the intermediate horizontal output is kept at 16-bit
 * precision before the vertical pass. */
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1;
    v16u8 dst0, out;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    /* shift/pack to align each row with its successor for the vertical taps */
    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);

    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, 7);
    SAT_UH2_UH(tmp0, tmp1, 7);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    out = __msa_aver_u_b(out, dst0);

    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

/* Combined horizontal + vertical 2-tap bilinear filter on a 4x8 block,
 * averaged with dst. */
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert)
{
    uint32_t tp0, tp1,
             tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
    v16u8 dst0, dst1;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* 9 input rows are needed for 8 vertically filtered output rows */
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_SB(src);

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
    /* build the odd intermediate rows by shifting/packing the even ones */
    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
               hz_out3, hz_out5);
    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);

    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}

/* 4-wide H+V bilinear "avg" entry point; dispatches on height. */
void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];

    if (4 == height) {
        common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                               filter_horiz, filter_vert);
    } else if (8 == height) {
        common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                               filter_horiz, filter_vert);
    }
}

/* Combined horizontal + vertical 2-tap bilinear filter on an 8x4 block,
 * averaged with dst; hz_out0/hz_out1 ping-pong as the vertical pair. */
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp1 = __msa_dotp_u_h(vec1, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp2 = __msa_dotp_u_h(vec2, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp3 = __msa_dotp_u_h(vec3, filt_vt);

    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}

/* Combined horizontal + vertical 2-tap bilinear filter on 8-wide blocks,
 * 4 rows per iteration, averaged with dst; hz_out0 carries the previous
 * row's horizontal output across iterations. */
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       const int8_t *filter_horiz,
                                                       const int8_t *filter_vert,
                                                       int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    src0 = LD_SB(src);
    src += src_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp0 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp1 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);

        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 3792 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 3793 tmp3 = __msa_dotp_u_h(vec0, filt_vt); 3794 3795 SRARI_H2_UH(tmp2, tmp3, 7); 3796 SAT_UH2_UH(tmp2, tmp3, 7); 3797 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 3798 INSERT_D2_UB(tp0, tp1, dst0); 3799 INSERT_D2_UB(tp2, tp3, dst1); 3800 PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); 3801 dst += (4 * dst_stride); 3802 } 3803} 3804 3805void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 3806 const uint8_t *src, ptrdiff_t src_stride, 3807 int height, int mx, int my) 3808{ 3809 const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; 3810 const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; 3811 3812 if (4 == height) { 3813 common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, 3814 filter_horiz, filter_vert); 3815 } else { 3816 common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, 3817 dst, dst_stride, 3818 filter_horiz, filter_vert, 3819 height); 3820 } 3821} 3822 3823void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 3824 const uint8_t *src, ptrdiff_t src_stride, 3825 int height, int mx, int my) 3826{ 3827 uint32_t loop_cnt; 3828 const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1]; 3829 const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1]; 3830 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 3831 v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; 3832 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; 3833 v8i16 filt; 3834 3835 mask = LD_SB(&mc_filt_mask_arr[0]); 3836 3837 /* rearranging filter */ 3838 filt = LD_SH(filter_horiz); 3839 filt_hz = (v16u8) __msa_splati_h(filt, 0); 3840 3841 filt = LD_SH(filter_vert); 3842 filt_vt = (v16u8) __msa_splati_h(filt, 0); 3843 3844 LD_SB2(src, 8, src0, src1); 3845 src += src_stride; 3846 3847 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 3848 hz_out2 = 
HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 3849 3850 for (loop_cnt = (height >> 2); loop_cnt--;) { 3851 LD_SB4(src, src_stride, src0, src2, src4, src6); 3852 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 3853 src += (4 * src_stride); 3854 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 3855 3856 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 3857 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 3858 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 3859 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 3860 SRARI_H2_UH(tmp0, tmp1, 7); 3861 SAT_UH2_UH(tmp0, tmp1, 7); 3862 PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); 3863 dst += dst_stride; 3864 3865 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 3866 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 3867 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 3868 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 3869 SRARI_H2_UH(tmp0, tmp1, 7); 3870 SAT_UH2_UH(tmp0, tmp1, 7); 3871 PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); 3872 dst += dst_stride; 3873 3874 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 3875 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7); 3876 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 3877 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 3878 SRARI_H2_UH(tmp0, tmp1, 7); 3879 SAT_UH2_UH(tmp0, tmp1, 7); 3880 PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); 3881 dst += dst_stride; 3882 3883 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7); 3884 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7); 3885 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 3886 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 3887 SRARI_H2_UH(tmp0, tmp1, 7); 3888 SAT_UH2_UH(tmp0, tmp1, 7); 3889 PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); 3890 dst += dst_stride; 3891 } 3892} 3893 3894void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 3895 const uint8_t *src, 
ptrdiff_t src_stride, 3896 int height, int mx, int my) 3897{ 3898 int32_t multiple8_cnt; 3899 3900 for (multiple8_cnt = 2; multiple8_cnt--;) { 3901 ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my); 3902 3903 src += 16; 3904 dst += 16; 3905 } 3906} 3907 3908void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 3909 const uint8_t *src, ptrdiff_t src_stride, 3910 int height, int mx, int my) 3911{ 3912 int32_t multiple8_cnt; 3913 3914 for (multiple8_cnt = 4; multiple8_cnt--;) { 3915 ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my); 3916 3917 src += 16; 3918 dst += 16; 3919 } 3920} 3921 3922static void copy_width8_msa(const uint8_t *src, int32_t src_stride, 3923 uint8_t *dst, int32_t dst_stride, 3924 int32_t height) 3925{ 3926 int32_t cnt; 3927 uint64_t out0, out1, out2, out3, out4, out5, out6, out7; 3928 3929 if (0 == height % 8) { 3930 for (cnt = height >> 3; cnt--;) { 3931 LD4(src, src_stride, out0, out1, out2, out3); 3932 src += (4 * src_stride); 3933 LD4(src, src_stride, out4, out5, out6, out7); 3934 src += (4 * src_stride); 3935 3936 SD4(out0, out1, out2, out3, dst, dst_stride); 3937 dst += (4 * dst_stride); 3938 SD4(out4, out5, out6, out7, dst, dst_stride); 3939 dst += (4 * dst_stride); 3940 } 3941 } else if (0 == height % 4) { 3942 for (cnt = (height / 4); cnt--;) { 3943 LD4(src, src_stride, out0, out1, out2, out3); 3944 src += (4 * src_stride); 3945 3946 SD4(out0, out1, out2, out3, dst, dst_stride); 3947 dst += (4 * dst_stride); 3948 } 3949 } 3950} 3951 3952static void copy_width16_msa(const uint8_t *src, int32_t src_stride, 3953 uint8_t *dst, int32_t dst_stride, 3954 int32_t height) 3955{ 3956 int32_t cnt; 3957 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 3958 3959 if (8 == height) { 3960 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 3961 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 3962 } else if (16 == height) { 3963 LD_UB8(src, src_stride, 
src0, src1, src2, src3, src4, src5, src6, src7); 3964 src += (8 * src_stride); 3965 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 3966 dst += (8 * dst_stride); 3967 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 3968 src += (8 * src_stride); 3969 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 3970 dst += (8 * dst_stride); 3971 } else if (32 == height) { 3972 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 3973 src += (8 * src_stride); 3974 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 3975 dst += (8 * dst_stride); 3976 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 3977 src += (8 * src_stride); 3978 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 3979 dst += (8 * dst_stride); 3980 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 3981 src += (8 * src_stride); 3982 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 3983 dst += (8 * dst_stride); 3984 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 3985 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 3986 } else if (0 == height % 4) { 3987 for (cnt = (height >> 2); cnt--;) { 3988 LD_UB4(src, src_stride, src0, src1, src2, src3); 3989 src += (4 * src_stride); 3990 ST_UB4(src0, src1, src2, src3, dst, dst_stride); 3991 dst += (4 * dst_stride); 3992 } 3993 } 3994} 3995 3996static void copy_width32_msa(const uint8_t *src, int32_t src_stride, 3997 uint8_t *dst, int32_t dst_stride, 3998 int32_t height) 3999{ 4000 int32_t cnt; 4001 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 4002 4003 if (0 == height % 8) { 4004 for (cnt = (height >> 3); cnt--;) { 4005 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 4006 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 4007 LD_UB8(src + 16, src_stride, src0, src1, 
src2, src3, src4, src5, src6, 4008 src7); 4009 src += (8 * src_stride); 4010 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16, 4011 dst_stride); 4012 dst += (8 * dst_stride); 4013 } 4014 } else if (0 == height % 4) { 4015 for (cnt = (height >> 2); cnt--;) { 4016 LD_UB4(src, src_stride, src0, src1, src2, src3); 4017 LD_UB4(src + 16, src_stride, src4, src5, src6, src7); 4018 src += (4 * src_stride); 4019 ST_UB4(src0, src1, src2, src3, dst, dst_stride); 4020 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); 4021 dst += (4 * dst_stride); 4022 } 4023 } 4024} 4025 4026static void copy_width64_msa(const uint8_t *src, int32_t src_stride, 4027 uint8_t *dst, int32_t dst_stride, 4028 int32_t height) 4029{ 4030 int32_t cnt; 4031 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 4032 v16u8 src8, src9, src10, src11, src12, src13, src14, src15; 4033 4034 for (cnt = (height >> 2); cnt--;) { 4035 LD_UB4(src, 16, src0, src1, src2, src3); 4036 src += src_stride; 4037 LD_UB4(src, 16, src4, src5, src6, src7); 4038 src += src_stride; 4039 LD_UB4(src, 16, src8, src9, src10, src11); 4040 src += src_stride; 4041 LD_UB4(src, 16, src12, src13, src14, src15); 4042 src += src_stride; 4043 4044 ST_UB4(src0, src1, src2, src3, dst, 16); 4045 dst += dst_stride; 4046 ST_UB4(src4, src5, src6, src7, dst, 16); 4047 dst += dst_stride; 4048 ST_UB4(src8, src9, src10, src11, dst, 16); 4049 dst += dst_stride; 4050 ST_UB4(src12, src13, src14, src15, dst, 16); 4051 dst += dst_stride; 4052 } 4053} 4054 4055static void avg_width4_msa(const uint8_t *src, int32_t src_stride, 4056 uint8_t *dst, int32_t dst_stride, 4057 int32_t height) 4058{ 4059 uint32_t tp0, tp1, tp2, tp3; 4060 v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 }; 4061 4062 if (8 == height) { 4063 LW4(src, src_stride, tp0, tp1, tp2, tp3); 4064 src += 4 * src_stride; 4065 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 4066 LW4(src, src_stride, tp0, tp1, tp2, tp3); 4067 INSERT_W4_UB(tp0, tp1, tp2, tp3, src1); 4068 LW4(dst, 
dst_stride, tp0, tp1, tp2, tp3); 4069 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 4070 LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); 4071 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 4072 AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); 4073 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4074 } else if (4 == height) { 4075 LW4(src, src_stride, tp0, tp1, tp2, tp3); 4076 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 4077 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 4078 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 4079 dst0 = __msa_aver_u_b(src0, dst0); 4080 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); 4081 } 4082} 4083 4084static void avg_width8_msa(const uint8_t *src, int32_t src_stride, 4085 uint8_t *dst, int32_t dst_stride, 4086 int32_t height) 4087{ 4088 int32_t cnt; 4089 uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; 4090 v16u8 src0, src1, src2, src3; 4091 v16u8 dst0, dst1, dst2, dst3; 4092 4093 if (0 == (height % 8)) { 4094 for (cnt = (height >> 3); cnt--;) { 4095 LD4(src, src_stride, tp0, tp1, tp2, tp3); 4096 src += 4 * src_stride; 4097 LD4(src, src_stride, tp4, tp5, tp6, tp7); 4098 src += 4 * src_stride; 4099 INSERT_D2_UB(tp0, tp1, src0); 4100 INSERT_D2_UB(tp2, tp3, src1); 4101 INSERT_D2_UB(tp4, tp5, src2); 4102 INSERT_D2_UB(tp6, tp7, src3); 4103 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 4104 LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7); 4105 INSERT_D2_UB(tp0, tp1, dst0); 4106 INSERT_D2_UB(tp2, tp3, dst1); 4107 INSERT_D2_UB(tp4, tp5, dst2); 4108 INSERT_D2_UB(tp6, tp7, dst3); 4109 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, 4110 dst1, dst2, dst3); 4111 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 4112 dst += 8 * dst_stride; 4113 } 4114 } else if (4 == height) { 4115 LD4(src, src_stride, tp0, tp1, tp2, tp3); 4116 INSERT_D2_UB(tp0, tp1, src0); 4117 INSERT_D2_UB(tp2, tp3, src1); 4118 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 4119 INSERT_D2_UB(tp0, tp1, dst0); 4120 INSERT_D2_UB(tp2, tp3, dst1); 4121 
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); 4122 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 4123 } 4124} 4125 4126static void avg_width16_msa(const uint8_t *src, int32_t src_stride, 4127 uint8_t *dst, int32_t dst_stride, 4128 int32_t height) 4129{ 4130 int32_t cnt; 4131 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 4132 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4133 4134 if (0 == (height % 8)) { 4135 for (cnt = (height / 8); cnt--;) { 4136 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 4137 src += (8 * src_stride); 4138 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 4139 4140 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, 4141 dst0, dst1, dst2, dst3); 4142 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, 4143 dst4, dst5, dst6, dst7); 4144 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); 4145 dst += (8 * dst_stride); 4146 } 4147 } else if (0 == (height % 4)) { 4148 for (cnt = (height / 4); cnt--;) { 4149 LD_UB4(src, src_stride, src0, src1, src2, src3); 4150 src += (4 * src_stride); 4151 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 4152 4153 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, 4154 dst0, dst1, dst2, dst3); 4155 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); 4156 dst += (4 * dst_stride); 4157 } 4158 } 4159} 4160 4161static void avg_width32_msa(const uint8_t *src, int32_t src_stride, 4162 uint8_t *dst, int32_t dst_stride, 4163 int32_t height) 4164{ 4165 int32_t cnt; 4166 uint8_t *dst_dup = dst; 4167 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 4168 v16u8 src8, src9, src10, src11, src12, src13, src14, src15; 4169 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4170 v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; 4171 4172 if (0 == (height % 8)) { 4173 for (cnt = (height / 8); cnt--;) { 4174 LD_UB4(src, src_stride, src0, src2, src4, src6); 4175 LD_UB4(src + 16, src_stride, src1, src3, 
src5, src7); 4176 src += (4 * src_stride); 4177 LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); 4178 LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); 4179 dst_dup += (4 * dst_stride); 4180 LD_UB4(src, src_stride, src8, src10, src12, src14); 4181 LD_UB4(src + 16, src_stride, src9, src11, src13, src15); 4182 src += (4 * src_stride); 4183 LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); 4184 LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); 4185 dst_dup += (4 * dst_stride); 4186 4187 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, 4188 dst0, dst1, dst2, dst3); 4189 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, 4190 dst4, dst5, dst6, dst7); 4191 AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, 4192 dst8, dst9, dst10, dst11); 4193 AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, 4194 dst12, dst13, dst14, dst15); 4195 4196 ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); 4197 ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); 4198 dst += (4 * dst_stride); 4199 ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); 4200 ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); 4201 dst += (4 * dst_stride); 4202 } 4203 } else if (0 == (height % 4)) { 4204 for (cnt = (height / 4); cnt--;) { 4205 LD_UB4(src, src_stride, src0, src2, src4, src6); 4206 LD_UB4(src + 16, src_stride, src1, src3, src5, src7); 4207 src += (4 * src_stride); 4208 LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); 4209 LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); 4210 dst_dup += (4 * dst_stride); 4211 4212 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, 4213 dst0, dst1, dst2, dst3); 4214 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, 4215 dst4, dst5, dst6, dst7); 4216 4217 ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); 4218 ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); 4219 dst += (4 * dst_stride); 4220 } 4221 } 4222} 4223 4224static void avg_width64_msa(const 
uint8_t *src, int32_t src_stride, 4225 uint8_t *dst, int32_t dst_stride, 4226 int32_t height) 4227{ 4228 int32_t cnt; 4229 uint8_t *dst_dup = dst; 4230 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 4231 v16u8 src8, src9, src10, src11, src12, src13, src14, src15; 4232 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4233 v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; 4234 4235 for (cnt = (height / 4); cnt--;) { 4236 LD_UB4(src, 16, src0, src1, src2, src3); 4237 src += src_stride; 4238 LD_UB4(src, 16, src4, src5, src6, src7); 4239 src += src_stride; 4240 LD_UB4(src, 16, src8, src9, src10, src11); 4241 src += src_stride; 4242 LD_UB4(src, 16, src12, src13, src14, src15); 4243 src += src_stride; 4244 4245 LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); 4246 dst_dup += dst_stride; 4247 LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); 4248 dst_dup += dst_stride; 4249 LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); 4250 dst_dup += dst_stride; 4251 LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); 4252 dst_dup += dst_stride; 4253 4254 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, 4255 dst0, dst1, dst2, dst3); 4256 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, 4257 dst4, dst5, dst6, dst7); 4258 AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, 4259 dst8, dst9, dst10, dst11); 4260 AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, 4261 dst12, dst13, dst14, dst15); 4262 4263 ST_UB4(dst0, dst1, dst2, dst3, dst, 16); 4264 dst += dst_stride; 4265 ST_UB4(dst4, dst5, dst6, dst7, dst, 16); 4266 dst += dst_stride; 4267 ST_UB4(dst8, dst9, dst10, dst11, dst, 16); 4268 dst += dst_stride; 4269 ST_UB4(dst12, dst13, dst14, dst15, dst, 16); 4270 dst += dst_stride; 4271 } 4272} 4273 4274static const int8_t vp9_subpel_filters_msa[3][15][8] = { 4275 [FILTER_8TAP_REGULAR] = { 4276 {0, 1, -5, 126, 8, -3, 1, 0}, 4277 {-1, 3, -10, 122, 18, -6, 2, 0}, 4278 {-1, 4, -13, 118, 27, -9, 3, -1}, 4279 {-1, 4, -16, 112, 37, -11, 4, 
-1}, 4280 {-1, 5, -18, 105, 48, -14, 4, -1}, 4281 {-1, 5, -19, 97, 58, -16, 5, -1}, 4282 {-1, 6, -19, 88, 68, -18, 5, -1}, 4283 {-1, 6, -19, 78, 78, -19, 6, -1}, 4284 {-1, 5, -18, 68, 88, -19, 6, -1}, 4285 {-1, 5, -16, 58, 97, -19, 5, -1}, 4286 {-1, 4, -14, 48, 105, -18, 5, -1}, 4287 {-1, 4, -11, 37, 112, -16, 4, -1}, 4288 {-1, 3, -9, 27, 118, -13, 4, -1}, 4289 {0, 2, -6, 18, 122, -10, 3, -1}, 4290 {0, 1, -3, 8, 126, -5, 1, 0}, 4291 }, [FILTER_8TAP_SHARP] = { 4292 {-1, 3, -7, 127, 8, -3, 1, 0}, 4293 {-2, 5, -13, 125, 17, -6, 3, -1}, 4294 {-3, 7, -17, 121, 27, -10, 5, -2}, 4295 {-4, 9, -20, 115, 37, -13, 6, -2}, 4296 {-4, 10, -23, 108, 48, -16, 8, -3}, 4297 {-4, 10, -24, 100, 59, -19, 9, -3}, 4298 {-4, 11, -24, 90, 70, -21, 10, -4}, 4299 {-4, 11, -23, 80, 80, -23, 11, -4}, 4300 {-4, 10, -21, 70, 90, -24, 11, -4}, 4301 {-3, 9, -19, 59, 100, -24, 10, -4}, 4302 {-3, 8, -16, 48, 108, -23, 10, -4}, 4303 {-2, 6, -13, 37, 115, -20, 9, -4}, 4304 {-2, 5, -10, 27, 121, -17, 7, -3}, 4305 {-1, 3, -6, 17, 125, -13, 5, -2}, 4306 {0, 1, -3, 8, 127, -7, 3, -1}, 4307 }, [FILTER_8TAP_SMOOTH] = { 4308 {-3, -1, 32, 64, 38, 1, -3, 0}, 4309 {-2, -2, 29, 63, 41, 2, -3, 0}, 4310 {-2, -2, 26, 63, 43, 4, -4, 0}, 4311 {-2, -3, 24, 62, 46, 5, -4, 0}, 4312 {-2, -3, 21, 60, 49, 7, -4, 0}, 4313 {-1, -4, 18, 59, 51, 9, -4, 0}, 4314 {-1, -4, 16, 57, 53, 12, -4, -1}, 4315 {-1, -4, 14, 55, 55, 14, -4, -1}, 4316 {-1, -4, 12, 53, 57, 16, -4, -1}, 4317 {0, -4, 9, 51, 59, 18, -4, -1}, 4318 {0, -4, 7, 49, 60, 21, -3, -2}, 4319 {0, -4, 5, 46, 62, 24, -3, -2}, 4320 {0, -4, 4, 43, 63, 26, -2, -2}, 4321 {0, -3, 2, 41, 63, 29, -2, -2}, 4322 {0, -3, 1, 38, 64, 32, -1, -3}, 4323 } 4324}; 4325 4326#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx) \ 4327void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ 4328 const uint8_t *src, \ 4329 ptrdiff_t srcstride, \ 4330 int h, int mx, int my) \ 4331{ \ 4332 const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \ 4333 \ 4334 
common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \ 4335} \ 4336 \ 4337void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ 4338 const uint8_t *src, \ 4339 ptrdiff_t srcstride, \ 4340 int h, int mx, int my) \ 4341{ \ 4342 const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \ 4343 \ 4344 common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \ 4345} \ 4346 \ 4347void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ 4348 const uint8_t *src, \ 4349 ptrdiff_t srcstride, \ 4350 int h, int mx, int my) \ 4351{ \ 4352 const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \ 4353 const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \ 4354 \ 4355 common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter, \ 4356 vfilter, h); \ 4357} \ 4358 \ 4359void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \ 4360 const uint8_t *src, \ 4361 ptrdiff_t srcstride, \ 4362 int h, int mx, int my) \ 4363{ \ 4364 const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \ 4365 \ 4366 common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \ 4367 dststride, filter, h); \ 4368} \ 4369 \ 4370void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \ 4371 const uint8_t *src, \ 4372 ptrdiff_t srcstride, \ 4373 int h, int mx, int my) \ 4374{ \ 4375 const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \ 4376 \ 4377 common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride, \ 4378 filter, h); \ 4379} \ 4380 \ 4381void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \ 4382 const uint8_t *src, \ 4383 ptrdiff_t srcstride, \ 4384 int h, int mx, int my) \ 4385{ \ 4386 const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \ 4387 const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \ 4388 \ 4389 common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \ 
4390 dststride, hfilter, \ 4391 vfilter, h); \ 4392} 4393 4394#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE) \ 4395void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ 4396 const uint8_t *src, ptrdiff_t srcstride, \ 4397 int h, int mx, int my) \ 4398{ \ 4399 \ 4400 copy_width##SIZE##_msa(src, srcstride, dst, dststride, h); \ 4401} \ 4402 \ 4403void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ 4404 const uint8_t *src, ptrdiff_t srcstride, \ 4405 int h, int mx, int my) \ 4406{ \ 4407 \ 4408 avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \ 4409} 4410 4411#define VP9_AVG_MIPS_MSA_FUNC(SIZE) \ 4412void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \ 4413 const uint8_t *src, ptrdiff_t srcstride, \ 4414 int h, int mx, int my) \ 4415{ \ 4416 \ 4417 avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \ 4418} 4419 4420VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR); 4421VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR); 4422VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR); 4423VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR); 4424VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR); 4425 4426VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP); 4427VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP); 4428VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP); 4429VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP); 4430VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP); 4431 4432VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH); 4433VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH); 4434VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH); 4435VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH); 4436VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH); 4437 4438VP9_COPY_AVG_MIPS_MSA_FUNC(64); 4439VP9_COPY_AVG_MIPS_MSA_FUNC(32); 4440VP9_COPY_AVG_MIPS_MSA_FUNC(16); 4441VP9_COPY_AVG_MIPS_MSA_FUNC(8); 4442VP9_AVG_MIPS_MSA_FUNC(4); 4443 4444#undef VP9_8TAP_MIPS_MSA_FUNC 4445#undef 
VP9_COPY_AVG_MIPS_MSA_FUNC 4446#undef VP9_AVG_MIPS_MSA_FUNC 4447