1/* 2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include "libavcodec/vp8dsp.h" 22#include "libavutil/mips/generic_macros_msa.h" 23#include "vp8dsp_mips.h" 24 25static const uint8_t mc_filt_mask_arr[16 * 3] = { 26 /* 8 width cases */ 27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28 /* 4 width cases */ 29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 30 /* 4 width cases */ 31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 32}; 33 34static const int8_t subpel_filters_msa[7][8] = { 35 {-6, 123, 12, -1, 0, 0, 0, 0}, 36 {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ 37 {-9, 93, 50, -6, 0, 0, 0, 0}, 38 {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ 39 {-6, 50, 93, -9, 0, 0, 0, 0}, 40 {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ 41 {-1, 12, 123, -6, 0, 0, 0, 0}, 42}; 43 44static const int8_t bilinear_filters_msa[7][2] = { 45 {112, 16}, 46 {96, 32}, 47 {80, 48}, 48 {64, 64}, 49 {48, 80}, 50 {32, 96}, 51 {16, 112} 52}; 53 54#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \ 55 filt_h0, filt_h1, filt_h2) \ 56( { \ 57 v16i8 vec0_m, vec1_m, vec2_m; \ 58 v8i16 hz_out_m; \ 59 \ 60 VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ 61 vec0_m, vec1_m, vec2_m); \ 62 hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, \ 63 filt_h0, filt_h1, filt_h2); \ 64 \ 65 hz_out_m = __msa_srari_h(hz_out_m, 7); \ 66 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 67 \ 68 hz_out_m; \ 69} ) 70 71#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 72 mask0, mask1, mask2, \ 73 filt0, filt1, filt2, \ 74 out0, out1) \ 75{ \ 76 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ 77 \ 78 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 79 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 80 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 81 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 82 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 83 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ 84} 85 86#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 87 mask0, mask1, mask2, \ 88 filt0, filt1, filt2, \ 89 out0, out1, out2, out3) \ 90{ \ 91 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 92 \ 93 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 94 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 95 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 96 out0, out1, out2, out3); \ 97 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 98 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 99 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ 100 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ 101 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 102 out0, out1, out2, out3); \ 103 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ 104 out0, out1, out2, out3); \ 105} 106 107#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ 108( { \ 109 v8i16 tmp0; \ 110 \ 111 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ 112 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ 113 \ 114 tmp0; \ 115} ) 116 117#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ 118( { \ 119 v16i8 vec0_m, vec1_m; \ 120 v8i16 hz_out_m; \ 121 \ 122 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ 123 hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ 124 \ 125 hz_out_m = __msa_srari_h(hz_out_m, 7); \ 126 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 127 \ 128 hz_out_m; \ 129} ) 130 131#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 132 mask0, mask1, filt0, filt1, \ 133 out0, out1) \ 134{ \ 135 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 136 \ 137 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 138 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 139 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 140 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 141} 142 143#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 144 mask0, mask1, filt0, filt1, \ 145 out0, out1, out2, out3) \ 146{ \ 147 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 148 \ 149 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 150 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 151 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 152 out0, out1, out2, out3); \ 153 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 154 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 155 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 156 out0, out1, out2, out3); \ 157} 158 159static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, 160 uint8_t *dst, int32_t dst_stride, 161 const int8_t *filter) 162{ 163 v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 164 v16u8 mask0, mask1, mask2, out; 165 v8i16 filt, out0, out1; 166 167 mask0 = LD_UB(&mc_filt_mask_arr[16]); 168 src -= 2; 169 170 /* rearranging filter */ 171 filt = LD_SH(filter); 172 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 173 174 mask1 = mask0 + 2; 175 mask2 = mask0 + 4; 176 177 LD_SB4(src, src_stride, src0, src1, src2, src3); 178 XORI_B4_128_SB(src0, src1, src2, src3); 179 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 180 filt0, filt1, filt2, out0, out1); 181 SRARI_H2_SH(out0, out1, 7); 182 SAT_SH2_SH(out0, out1, 7); 183 out = PCKEV_XORI128_UB(out0, out1); 184 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 185} 186 187static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, 188 uint8_t *dst, int32_t dst_stride, 189 const int8_t *filter) 190{ 191 v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 192 v16u8 mask0, mask1, mask2, out; 193 v8i16 filt, out0, out1, out2, out3; 194 195 mask0 = LD_UB(&mc_filt_mask_arr[16]); 196 src -= 2; 197 198 /* rearranging filter */ 199 filt = LD_SH(filter); 200 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 201 202 mask1 = mask0 + 2; 203 mask2 = mask0 + 4; 204 205 LD_SB4(src, src_stride, src0, src1, src2, src3); 206 XORI_B4_128_SB(src0, src1, src2, src3); 207 src += (4 * src_stride); 208 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 209 filt0, filt1, filt2, out0, out1); 210 LD_SB4(src, src_stride, src0, src1, src2, src3); 211 XORI_B4_128_SB(src0, src1, src2, src3); 212 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 213 filt0, filt1, filt2, out2, out3); 214 SRARI_H4_SH(out0, out1, out2, out3, 7); 215 SAT_SH4_SH(out0, out1, out2, out3, 7); 216 out = PCKEV_XORI128_UB(out0, out1); 217 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 218 out = PCKEV_XORI128_UB(out2, out3); 219 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 220} 221 222void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, 223 uint8_t *src, ptrdiff_t src_stride, 224 int height, int mx, int my) 225{ 226 const int8_t *filter = subpel_filters_msa[mx - 1]; 227 228 if (4 == height) { 229 common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter); 230 } else if (8 == height) { 231 common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter); 232 } 233} 234 235void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, 236 uint8_t *src, ptrdiff_t src_stride, 237 int height, int mx, int my) 238{ 239 uint32_t loop_cnt; 240 const int8_t *filter = subpel_filters_msa[mx - 1]; 241 v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 242 v16u8 mask0, mask1, mask2, tmp0, tmp1; 243 v8i16 filt, out0, out1, out2, out3; 244 245 mask0 = LD_UB(&mc_filt_mask_arr[0]); 246 247 src -= 2; 248 249 /* rearranging filter */ 250 filt = LD_SH(filter); 251 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 252 253 mask1 = mask0 + 2; 254 mask2 = mask0 + 4; 255 256 LD_SB4(src, src_stride, src0, src1, src2, src3); 257 XORI_B4_128_SB(src0, src1, src2, src3); 258 src += (4 * src_stride); 259 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 260 filt0, filt1, filt2, out0, out1, out2, out3); 261 SRARI_H4_SH(out0, out1, out2, out3, 7); 262 SAT_SH4_SH(out0, out1, out2, out3, 7); 263 tmp0 = PCKEV_XORI128_UB(out0, out1); 264 tmp1 = PCKEV_XORI128_UB(out2, out3); 265 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 266 dst += (4 * dst_stride); 267 268 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { 269 LD_SB4(src, src_stride, src0, src1, src2, src3); 270 XORI_B4_128_SB(src0, src1, src2, src3); 271 src += (4 * src_stride); 272 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 273 filt0, filt1, filt2, out0, out1, out2, out3); 274 SRARI_H4_SH(out0, out1, out2, out3, 7); 275 SAT_SH4_SH(out0, out1, out2, out3, 7); 276 tmp0 = PCKEV_XORI128_UB(out0, out1); 277 tmp1 = PCKEV_XORI128_UB(out2, out3); 278 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 279 dst += (4 * dst_stride); 280 } 281} 282 283void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, 284 uint8_t *src, ptrdiff_t src_stride, 285 int height, int mx, int my) 286{ 287 uint32_t loop_cnt; 288 const int8_t *filter = subpel_filters_msa[mx - 1]; 289 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; 290 v16u8 mask0, mask1, mask2, out; 291 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 292 293 mask0 = LD_UB(&mc_filt_mask_arr[0]); 294 src -= 2; 295 296 /* rearranging filter */ 297 filt = LD_SH(filter); 298 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 299 300 mask1 = mask0 + 2; 301 mask2 = mask0 + 4; 302 303 for (loop_cnt = (height >> 2); loop_cnt--;) { 304 LD_SB4(src, src_stride, src0, src2, src4, src6); 305 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 306 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 307 src += (4 * src_stride); 308 309 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 310 filt0, filt1, filt2, out0, out1, out2, out3); 311 HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, 312 filt0, filt1, filt2, out4, out5, out6, out7); 313 SRARI_H4_SH(out0, out1, out2, out3, 7); 314 SRARI_H4_SH(out4, out5, out6, out7, 7); 315 SAT_SH4_SH(out0, out1, out2, out3, 7); 316 SAT_SH4_SH(out4, out5, out6, out7, 7); 317 out = PCKEV_XORI128_UB(out0, out1); 318 ST_UB(out, dst); 319 dst += dst_stride; 320 out = PCKEV_XORI128_UB(out2, out3); 321 ST_UB(out, dst); 322 dst += dst_stride; 323 out = PCKEV_XORI128_UB(out4, out5); 324 ST_UB(out, dst); 325 dst += dst_stride; 326 out = PCKEV_XORI128_UB(out6, out7); 327 ST_UB(out, dst); 328 dst += dst_stride; 329 } 330} 331 332void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 333 uint8_t *src, ptrdiff_t src_stride, 334 int height, int mx, int my) 335{ 336 uint32_t loop_cnt; 337 const int8_t *filter = subpel_filters_msa[my - 1]; 338 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 339 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 340 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 341 v16u8 out; 342 v8i16 filt, out10, out32; 343 344 src -= (2 * src_stride); 345 346 filt = LD_SH(filter); 347 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 348 349 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 350 src += (5 * src_stride); 351 352 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 353 src32_r, src43_r); 354 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 355 XORI_B2_128_SB(src2110, src4332); 356 357 for (loop_cnt = (height >> 2); loop_cnt--;) { 358 LD_SB4(src, src_stride, src5, src6, src7, src8); 359 src += (4 * src_stride); 360 361 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 362 src65_r, src76_r, src87_r); 363 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 364 XORI_B2_128_SB(src6554, src8776); 365 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 366 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 367 SRARI_H2_SH(out10, out32, 7); 368 SAT_SH2_SH(out10, out32, 7); 369 out = PCKEV_XORI128_UB(out10, out32); 370 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 371 dst += (4 * dst_stride); 372 373 src2110 = src6554; 374 src4332 = src8776; 375 src4 = src8; 376 } 377} 378 379void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 380 uint8_t *src, ptrdiff_t src_stride, 381 int height, int mx, int my) 382{ 383 uint32_t loop_cnt; 384 const int8_t *filter = subpel_filters_msa[my - 1]; 385 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; 386 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 387 v16i8 src109_r, filt0, filt1, filt2; 388 v16u8 tmp0, tmp1; 389 v8i16 filt, out0_r, out1_r, out2_r, out3_r; 390 391 src -= (2 * src_stride); 392 393 filt = LD_SH(filter); 394 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 395 396 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 397 src += (5 * src_stride); 398 399 XORI_B5_128_SB(src0, src1, src2, src3, src4); 400 ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, 401 src10_r, src32_r, src21_r, src43_r); 402 403 for (loop_cnt = (height >> 2); loop_cnt--;) { 404 LD_SB4(src, src_stride, src7, src8, src9, src10); 405 XORI_B4_128_SB(src7, src8, src9, src10); 406 src += (4 * src_stride); 407 408 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 409 src87_r, src98_r, src109_r); 410 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 411 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 412 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 413 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 414 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 415 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 416 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 417 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 418 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 419 dst += (4 * dst_stride); 420 421 src10_r = src76_r; 422 src32_r = src98_r; 423 src21_r = src87_r; 424 src43_r = src109_r; 425 src4 = src10; 426 } 427} 428 429void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 430 uint8_t *src, ptrdiff_t src_stride, 431 int height, int mx, int my) 432{ 433 uint32_t loop_cnt; 434 const int8_t *filter = subpel_filters_msa[my - 1]; 435 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 436 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 437 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 438 v16i8 src65_l, src87_l, filt0, filt1, filt2; 439 v16u8 tmp0, tmp1, tmp2, tmp3; 440 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; 441 442 src -= (2 * src_stride); 443 444 filt = LD_SH(filter); 445 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 446 447 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 448 src += (5 * src_stride); 449 450 XORI_B5_128_SB(src0, src1, src2, src3, src4); 451 ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, 452 src32_r, src43_r, src21_r); 453 ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, 454 src32_l, src43_l, src21_l); 455 456 for (loop_cnt = (height >> 2); loop_cnt--;) { 457 LD_SB4(src, src_stride, src5, src6, src7, src8); 458 src += (4 * src_stride); 459 460 XORI_B4_128_SB(src5, src6, src7, src8); 461 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 462 src65_r, src76_r, src87_r); 463 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 464 src65_l, src76_l, src87_l); 465 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, 466 filt2); 467 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, 468 filt2); 469 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, 470 filt2); 471 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, 472 filt2); 473 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, 474 filt2); 475 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, 476 filt2); 477 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, 478 filt2); 479 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, 480 filt2); 481 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 482 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); 483 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 484 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 485 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 486 out3_r, tmp0, tmp1, tmp2, tmp3); 487 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 488 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 489 dst += (4 * dst_stride); 490 491 src10_r = src54_r; 492 src32_r = src76_r; 493 src21_r = src65_r; 494 src43_r = src87_r; 495 src10_l = src54_l; 496 src32_l = src76_l; 497 src21_l = src65_l; 498 src43_l = src87_l; 499 src4 = src8; 500 } 501} 502 503void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 504 uint8_t *src, ptrdiff_t src_stride, 505 int height, int mx, int my) 506{ 507 uint32_t loop_cnt; 508 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 509 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 510 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 511 v16i8 filt_hz0, filt_hz1, filt_hz2; 512 v16u8 mask0, mask1, mask2, out; 513 v8i16 tmp0, tmp1; 514 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 515 v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; 516 517 mask0 = LD_UB(&mc_filt_mask_arr[16]); 518 src -= (2 + 2 * src_stride); 519 520 /* rearranging filter */ 521 filt = LD_SH(filter_horiz); 522 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 523 524 filt = LD_SH(filter_vert); 525 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 526 527 mask1 = mask0 + 2; 528 mask2 = mask0 + 4; 529 530 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 531 src += (5 * src_stride); 532 533 XORI_B5_128_SB(src0, src1, src2, src3, src4); 534 hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, 535 filt_hz1, filt_hz2); 536 hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, 537 filt_hz1, filt_hz2); 538 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); 539 hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, 540 filt_hz1, filt_hz2); 541 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 542 543 for (loop_cnt = (height >> 2); loop_cnt--;) { 544 LD_SB2(src, src_stride, src5, src6); 545 src += (2 * src_stride); 546 547 XORI_B2_128_SB(src5, src6); 548 hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, 549 filt_hz1, filt_hz2); 550 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 551 552 LD_SB2(src, src_stride, src7, src8); 553 src += (2 * src_stride); 554 555 XORI_B2_128_SB(src7, src8); 556 hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, 557 filt_hz1, filt_hz2); 558 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); 559 560 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 561 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 562 563 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 564 tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); 565 566 SRARI_H2_SH(tmp0, tmp1, 7); 567 SAT_SH2_SH(tmp0, tmp1, 7); 568 out = PCKEV_XORI128_UB(tmp0, tmp1); 569 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 570 dst += (4 * dst_stride); 571 572 hz_out3 = hz_out7; 573 out0 = out2; 574 out1 = out3; 575 } 576} 577 578void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 579 uint8_t *src, ptrdiff_t src_stride, 580 int height, int mx, int my) 581{ 582 uint32_t loop_cnt; 583 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 584 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 585 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 586 v16i8 filt_hz0, filt_hz1, filt_hz2; 587 v16u8 mask0, mask1, mask2, vec0, vec1; 588 v8i16 filt, filt_vt0, filt_vt1, filt_vt2; 589 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 590 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; 591 v8i16 tmp0, tmp1, tmp2, tmp3; 592 593 mask0 = LD_UB(&mc_filt_mask_arr[0]); 594 src -= (2 + 2 * src_stride); 595 596 /* rearranging filter */ 597 filt = LD_SH(filter_horiz); 598 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 599 600 mask1 = mask0 + 2; 601 mask2 = mask0 + 4; 602 603 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 604 src += (5 * src_stride); 605 606 XORI_B5_128_SB(src0, src1, src2, src3, src4); 607 hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, 608 filt_hz1, filt_hz2); 609 hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, 610 filt_hz1, filt_hz2); 611 hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, 612 filt_hz1, filt_hz2); 613 hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, 614 filt_hz1, filt_hz2); 615 hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, 616 filt_hz1, filt_hz2); 617 618 filt = LD_SH(filter_vert); 619 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 620 621 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 622 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); 623 624 for (loop_cnt = (height >> 2); loop_cnt--;) { 625 LD_SB4(src, src_stride, src5, src6, src7, src8); 626 src += (4 * src_stride); 627 628 XORI_B4_128_SB(src5, src6, src7, src8); 629 hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 630 filt_hz1, filt_hz2); 631 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 632 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 633 634 hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 635 filt_hz1, filt_hz2); 636 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5); 637 tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 638 639 hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, 640 filt_hz1, filt_hz2); 641 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 642 tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); 643 644 hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, 645 filt_hz1, filt_hz2); 646 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); 647 tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); 648 649 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 650 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 651 vec0 = PCKEV_XORI128_UB(tmp0, tmp1); 652 vec1 = PCKEV_XORI128_UB(tmp2, tmp3); 653 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride); 654 dst += (4 * dst_stride); 655 656 hz_out4 = hz_out8; 657 out0 = out2; 658 out1 = out7; 659 out3 = out5; 660 out4 = out6; 661 } 662} 663 664 665void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 666 uint8_t *src, ptrdiff_t src_stride, 667 int height, int mx, int my) 668{ 669 int32_t multiple8_cnt; 670 671 for (multiple8_cnt = 2; multiple8_cnt--;) { 672 ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height, 673 mx, my); 674 675 src += 8; 676 dst += 8; 677 } 678} 679 680static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, 681 uint8_t *dst, int32_t dst_stride, 682 const int8_t *filter) 683{ 684 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 685 v8i16 filt, out0, out1; 686 v16u8 out; 687 688 mask0 = LD_SB(&mc_filt_mask_arr[16]); 689 src -= 1; 690 691 /* rearranging filter */ 692 filt = LD_SH(filter); 693 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 694 695 mask1 = mask0 + 2; 696 697 LD_SB4(src, src_stride, src0, src1, src2, src3); 698 XORI_B4_128_SB(src0, src1, src2, src3); 699 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 700 filt0, filt1, out0, out1); 701 SRARI_H2_SH(out0, out1, 7); 702 SAT_SH2_SH(out0, out1, 7); 703 out = PCKEV_XORI128_UB(out0, out1); 704 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 705} 706 707static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, 708 uint8_t *dst, int32_t dst_stride, 709 const int8_t *filter) 710{ 711 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 712 v16u8 out; 713 v8i16 filt, out0, out1, out2, out3; 714 715 mask0 = LD_SB(&mc_filt_mask_arr[16]); 716 src -= 1; 717 718 /* rearranging filter */ 719 filt = LD_SH(filter); 720 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 721 722 mask1 = mask0 + 2; 723 724 LD_SB4(src, src_stride, src0, src1, src2, src3); 725 src += (4 * src_stride); 726 727 XORI_B4_128_SB(src0, src1, src2, src3); 728 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 729 filt0, filt1, out0, out1); 730 LD_SB4(src, src_stride, src0, src1, src2, src3); 731 XORI_B4_128_SB(src0, src1, src2, src3); 732 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 733 filt0, filt1, out2, out3); 734 SRARI_H4_SH(out0, out1, out2, out3, 7); 735 SAT_SH4_SH(out0, out1, out2, out3, 7); 736 out = PCKEV_XORI128_UB(out0, out1); 737 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 738 out = PCKEV_XORI128_UB(out2, out3); 739 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 740} 741 742static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, 743 uint8_t *dst, int32_t dst_stride, 744 const int8_t *filter) 745{ 746 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 747 v16i8 filt0, filt1, mask0, mask1; 748 v16u8 out; 749 v8i16 filt, out0, out1, out2, out3; 750 751 mask0 = LD_SB(&mc_filt_mask_arr[16]); 752 src -= 1; 753 754 /* rearranging filter */ 755 filt = LD_SH(filter); 756 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 757 758 mask1 = mask0 + 2; 759 760 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 761 src += (8 * src_stride); 762 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 763 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 764 filt0, filt1, out0, out1); 765 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, 766 filt0, filt1, out2, out3); 767 SRARI_H4_SH(out0, out1, out2, out3, 7); 768 SAT_SH4_SH(out0, out1, out2, out3, 7); 769 out = PCKEV_XORI128_UB(out0, out1); 770 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 771 dst += (4 * dst_stride); 772 out = PCKEV_XORI128_UB(out2, out3); 773 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 774 dst += (4 * dst_stride); 775 776 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 777 src += (8 * src_stride); 778 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 779 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 780 filt0, filt1, out0, out1); 781 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, 782 filt0, filt1, out2, out3); 783 SRARI_H4_SH(out0, out1, out2, out3, 7); 784 SAT_SH4_SH(out0, out1, out2, out3, 7); 785 out = PCKEV_XORI128_UB(out0, out1); 786 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 787 dst += (4 * dst_stride); 788 out = PCKEV_XORI128_UB(out2, out3); 789 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 790} 791 792void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, 793 uint8_t *src, ptrdiff_t src_stride, 794 int height, int mx, int my) 795{ 796 const int8_t *filter = subpel_filters_msa[mx - 1]; 797 798 if (4 == height) { 799 common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); 800 } else if (8 == height) { 801 common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter); 802 } else if (16 == height) { 803 common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter); 804 } 805} 806 807void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, 808 uint8_t *src, ptrdiff_t src_stride, 809 int height, int mx, int my) 810{ 811 uint32_t loop_cnt; 812 const int8_t *filter = subpel_filters_msa[mx - 1]; 813 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 814 v16u8 tmp0, tmp1; 815 v8i16 filt, out0, out1, out2, out3; 816 817 mask0 = LD_SB(&mc_filt_mask_arr[0]); 818 src -= 1; 819 820 /* rearranging filter */ 821 filt = LD_SH(filter); 822 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 823 824 mask1 = mask0 + 2; 825 826 for (loop_cnt = (height >> 2); loop_cnt--;) { 827 LD_SB4(src, src_stride, src0, src1, src2, src3); 828 src += (4 * src_stride); 829 830 XORI_B4_128_SB(src0, src1, src2, src3); 831 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 832 filt1, out0, out1, out2, out3); 833 SRARI_H4_SH(out0, out1, out2, out3, 7); 834 SAT_SH4_SH(out0, out1, out2, out3, 7); 835 tmp0 = PCKEV_XORI128_UB(out0, out1); 836 tmp1 = PCKEV_XORI128_UB(out2, out3); 837 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 838 dst += (4 * dst_stride); 839 } 840} 841 842void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, 843 uint8_t *src, ptrdiff_t src_stride, 844 int height, int mx, int my) 845{ 846 uint32_t loop_cnt; 847 const int8_t *filter = subpel_filters_msa[mx - 1]; 848 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 849 v16i8 filt0, filt1, mask0, mask1; 850 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 851 v16u8 out; 852 853 mask0 = LD_SB(&mc_filt_mask_arr[0]); 854 src -= 1; 855 856 /* rearranging filter */ 857 filt = LD_SH(filter); 858 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 859 860 mask1 = mask0 + 2; 861 862 for (loop_cnt = (height >> 2); loop_cnt--;) { 863 LD_SB4(src, src_stride, src0, src2, src4, src6); 864 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 865 src += (4 * src_stride); 866 867 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 868 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 869 filt1, out0, out1, out2, out3); 870 HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, 871 filt1, out4, out5, out6, out7); 872 SRARI_H4_SH(out0, out1, out2, out3, 7); 873 SRARI_H4_SH(out4, out5, out6, out7, 7); 874 SAT_SH4_SH(out0, out1, out2, out3, 7); 875 SAT_SH4_SH(out4, out5, out6, out7, 7); 876 out = PCKEV_XORI128_UB(out0, out1); 877 ST_UB(out, dst); 878 dst += dst_stride; 879 out = PCKEV_XORI128_UB(out2, out3); 880 ST_UB(out, dst); 881 dst += dst_stride; 882 out = PCKEV_XORI128_UB(out4, out5); 883 ST_UB(out, dst); 884 dst += dst_stride; 885 out = PCKEV_XORI128_UB(out6, out7); 886 ST_UB(out, dst); 887 dst += dst_stride; 888 } 889} 890 891void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 892 uint8_t *src, ptrdiff_t src_stride, 893 int height, int mx, int my) 894{ 895 uint32_t loop_cnt; 896 const int8_t *filter = subpel_filters_msa[my - 1]; 897 v16i8 src0, src1, src2, src3, src4, src5; 898 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 899 v16i8 src2110, src4332, filt0, filt1; 900 v8i16 filt, out10, out32; 901 v16u8 out; 902 903 src -= src_stride; 904 905 filt = LD_SH(filter); 906 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 907 908 LD_SB3(src, src_stride, src0, src1, src2); 909 src += (3 * src_stride); 910 911 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 912 913 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 914 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 915 916 for (loop_cnt = (height >> 2); loop_cnt--;) { 917 LD_SB3(src, src_stride, src3, src4, src5); 918 src += (3 * src_stride); 919 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 920 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); 921 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); 922 out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); 923 924 src2 = LD_SB(src); 925 src += (src_stride); 926 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r); 927 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); 928 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 929 out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); 930 SRARI_H2_SH(out10, out32, 7); 931 SAT_SH2_SH(out10, out32, 7); 932 out = PCKEV_XORI128_UB(out10, out32); 933 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 934 dst += (4 * dst_stride); 935 } 936} 937 938void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 939 uint8_t *src, ptrdiff_t src_stride, 940 int height, int mx, int my) 941{ 942 uint32_t loop_cnt; 943 const int8_t *filter = subpel_filters_msa[my - 1]; 944 v16i8 src0, src1, src2, src7, src8, src9, src10; 945 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; 946 v16u8 tmp0, tmp1; 947 v8i16 filt, out0_r, out1_r, out2_r, out3_r; 948 949 src -= src_stride; 950 951 filt = LD_SH(filter); 952 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 953 954 LD_SB3(src, src_stride, src0, src1, src2); 955 src += (3 * src_stride); 956 957 XORI_B3_128_SB(src0, src1, src2); 958 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 959 960 for (loop_cnt = (height >> 2); loop_cnt--;) { 961 LD_SB4(src, src_stride, src7, src8, src9, src10); 962 src += (4 * src_stride); 963 964 XORI_B4_128_SB(src7, src8, src9, src10); 965 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, 966 src72_r, src87_r, src98_r, src109_r); 967 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1); 968 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1); 969 out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1); 970 out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); 971 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 972 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 973 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 974 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 975 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 976 dst += (4 * dst_stride); 977 978 src10_r = src98_r; 979 src21_r = src109_r; 980 src2 = src10; 981 } 982} 983 984void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 985 uint8_t *src, ptrdiff_t src_stride, 986 int height, int mx, int my) 987{ 988 uint32_t loop_cnt; 989 const int8_t *filter = subpel_filters_msa[my - 1]; 990 v16i8 src0, src1, src2, src3, src4, src5, src6; 991 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; 992 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; 993 v16u8 tmp0, tmp1, tmp2, tmp3; 994 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 995 996 src -= src_stride; 997 998 filt = LD_SH(filter); 999 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 1000 1001 LD_SB3(src, src_stride, src0, src1, src2); 1002 src += (3 * src_stride); 1003 1004 XORI_B3_128_SB(src0, src1, src2); 1005 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 1006 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 1007 1008 for (loop_cnt = (height >> 2); loop_cnt--;) { 1009 LD_SB4(src, src_stride, src3, src4, src5, src6); 1010 src += (4 * src_stride); 1011 1012 XORI_B4_128_SB(src3, src4, src5, src6); 1013 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 1014 src32_r, src43_r, src54_r, src65_r); 1015 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 1016 src32_l, src43_l, src54_l, src65_l); 1017 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); 1018 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); 1019 out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1); 1020 out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1); 1021 out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); 1022 out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); 1023 out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); 1024 out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); 1025 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1026 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1027 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1028 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1029 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1030 out3_r, tmp0, tmp1, tmp2, tmp3); 1031 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 1032 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 1033 dst += (4 * dst_stride); 1034 1035 src10_r = src54_r; 1036 src21_r = src65_r; 1037 src10_l = src54_l; 1038 src21_l = src65_l; 1039 src2 = src6; 1040 } 1041} 1042 1043void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1044 uint8_t *src, ptrdiff_t src_stride, 1045 int height, int mx, int my) 1046{ 1047 uint32_t loop_cnt; 1048 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1049 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1050 v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; 1051 v16u8 mask0, mask1, out; 1052 v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; 1053 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; 1054 1055 mask0 = LD_UB(&mc_filt_mask_arr[16]); 1056 src -= (1 + 1 * src_stride); 1057 1058 /* rearranging filter */ 1059 filt = LD_SH(filter_horiz); 1060 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1061 1062 mask1 = mask0 + 2; 1063 1064 LD_SB3(src, src_stride, src0, src1, src2); 1065 src += (3 * src_stride); 1066 1067 XORI_B3_128_SB(src0, src1, src2); 1068 hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); 1069 hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); 1070 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1071 1072 filt = LD_SH(filter_vert); 1073 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1074 1075 for (loop_cnt = (height >> 2); loop_cnt--;) { 1076 LD_SB4(src, src_stride, src3, src4, src5, src6); 1077 src += (4 * src_stride); 1078 1079 XORI_B2_128_SB(src3, src4); 1080 hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); 1081 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); 1082 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1083 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1084 1085 XORI_B2_128_SB(src5, src6); 1086 hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); 1087 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 1088 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1089 tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1090 1091 SRARI_H2_SH(tmp0, tmp1, 7); 1092 SAT_SH2_SH(tmp0, tmp1, 7); 1093 out = PCKEV_XORI128_UB(tmp0, tmp1); 1094 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1095 dst += (4 * dst_stride); 1096 1097 hz_out1 = hz_out5; 1098 vec0 = vec2; 1099 } 1100} 1101 1102void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1103 uint8_t *src, ptrdiff_t src_stride, 1104 int height, int mx, int my) 1105{ 1106 uint32_t loop_cnt; 1107 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1108 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1109 v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; 1110 v16u8 mask0, mask1, out0, out1; 1111 v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; 1112 v8i16 hz_out0, hz_out1, hz_out2, hz_out3; 1113 v8i16 vec0, vec1, vec2, vec3, vec4; 1114 1115 mask0 = LD_UB(&mc_filt_mask_arr[0]); 1116 src -= (1 + 1 * src_stride); 1117 1118 /* rearranging filter */ 1119 filt = LD_SH(filter_horiz); 1120 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1121 1122 mask1 = mask0 + 2; 1123 1124 LD_SB3(src, src_stride, src0, src1, src2); 1125 src += (3 * src_stride); 1126 1127 XORI_B3_128_SB(src0, src1, src2); 1128 hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); 1129 hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); 1130 hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); 1131 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); 1132 1133 filt = LD_SH(filter_vert); 1134 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1135 1136 for (loop_cnt = (height >> 2); loop_cnt--;) { 1137 LD_SB4(src, src_stride, src3, src4, src5, src6); 1138 src += (4 * src_stride); 1139 1140 XORI_B4_128_SB(src3, src4, src5, src6); 1141 hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); 1142 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1143 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1144 1145 hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); 1146 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3); 1147 tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); 1148 1149 hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); 1150 vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1151 tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1); 1152 1153 hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); 1154 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); 1155 tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1156 1157 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1158 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1159 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 1160 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 1161 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1162 dst += (4 * dst_stride); 1163 1164 vec0 = vec4; 1165 vec2 = vec1; 1166 } 1167} 1168 1169void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1170 uint8_t *src, ptrdiff_t src_stride, 1171 int height, int mx, int my) 1172{ 1173 int32_t multiple8_cnt; 1174 1175 for (multiple8_cnt = 2; multiple8_cnt--;) { 1176 ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height, 1177 mx, my); 1178 1179 src += 8; 1180 dst += 8; 1181 } 1182} 1183 1184void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1185 uint8_t *src, ptrdiff_t src_stride, 1186 int height, int mx, int my) 1187{ 1188 uint32_t loop_cnt; 1189 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1190 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1191 v16i8 src0, src1, src2, src3, src4, src5, src6; 1192 v16i8 filt_hz0, filt_hz1, filt_hz2; 1193 v16u8 res0, res1, mask0, mask1, mask2; 1194 v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; 1195 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; 1196 1197 mask0 = LD_UB(&mc_filt_mask_arr[16]); 1198 src -= (2 + 1 * src_stride); 1199 1200 /* rearranging filter */ 1201 filt = LD_SH(filter_horiz); 1202 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 1203 1204 mask1 = mask0 + 2; 1205 mask2 = mask0 + 4; 1206 1207 LD_SB3(src, src_stride, src0, src1, src2); 1208 src += (3 * src_stride); 1209 1210 XORI_B3_128_SB(src0, src1, src2); 1211 hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, 1212 filt_hz1, filt_hz2); 1213 hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, 1214 filt_hz1, filt_hz2); 1215 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1216 1217 filt = LD_SH(filter_vert); 1218 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1219 1220 for (loop_cnt = (height >> 2); loop_cnt--;) { 1221 LD_SB4(src, src_stride, src3, src4, src5, src6); 1222 src += (4 * src_stride); 1223 1224 XORI_B4_128_SB(src3, src4, src5, src6); 1225 hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, 1226 filt_hz1, filt_hz2); 1227 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); 1228 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1229 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1230 1231 hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, 1232 filt_hz1, filt_hz2); 1233 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 1234 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1235 tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1236 1237 SRARI_H2_SH(tmp0, tmp1, 7); 1238 SAT_SH2_SH(tmp0, tmp1, 7); 1239 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); 1240 XORI_B2_128_UB(res0, res1); 1241 ST_W2(res0, 0, 1, dst, dst_stride); 1242 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1243 dst += (4 * dst_stride); 1244 1245 hz_out1 = hz_out5; 1246 vec0 = vec2; 1247 } 1248} 1249 1250void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1251 uint8_t *src, ptrdiff_t src_stride, 1252 int height, int mx, int my) 1253{ 1254 uint32_t loop_cnt; 1255 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1256 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1257 v16i8 src0, src1, src2, src3, src4, src5, src6; 1258 v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; 1259 v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; 1260 v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; 1261 v16u8 out0, out1; 1262 1263 mask0 = LD_SB(&mc_filt_mask_arr[0]); 1264 src -= (2 + src_stride); 1265 1266 /* rearranging filter */ 1267 filt = LD_SH(filter_horiz); 1268 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 1269 1270 mask1 = mask0 + 2; 1271 mask2 = mask0 + 4; 1272 1273 LD_SB3(src, src_stride, src0, src1, src2); 1274 src += (3 * src_stride); 1275 1276 XORI_B3_128_SB(src0, src1, src2); 1277 hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, 1278 filt_hz1, filt_hz2); 1279 hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, 1280 filt_hz1, filt_hz2); 1281 hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, 1282 filt_hz1, filt_hz2); 1283 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); 1284 1285 filt = LD_SH(filter_vert); 1286 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1287 1288 for (loop_cnt = (height >> 2); loop_cnt--;) { 1289 LD_SB4(src, src_stride, src3, src4, src5, src6); 1290 src += (4 * src_stride); 1291 1292 XORI_B4_128_SB(src3, src4, src5, src6); 1293 1294 hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, 1295 filt_hz1, filt_hz2); 1296 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1297 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1298 1299 hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, 1300 filt_hz1, filt_hz2); 1301 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3); 1302 tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); 1303 1304 hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 1305 filt_hz1, filt_hz2); 1306 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1307 tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1); 1308 1309 hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 1310 filt_hz1, filt_hz2); 1311 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); 1312 tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1313 1314 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1315 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1316 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 1317 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 1318 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1319 dst += (4 * dst_stride); 1320 } 1321} 1322 1323void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1324 uint8_t *src, ptrdiff_t src_stride, 1325 int height, int mx, int my) 1326{ 1327 int32_t multiple8_cnt; 1328 1329 for (multiple8_cnt = 2; multiple8_cnt--;) { 1330 ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height, 1331 mx, my); 1332 1333 src += 8; 1334 dst += 8; 1335 } 1336} 1337 1338void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 1339 uint8_t *src, ptrdiff_t src_stride, 1340 int height, int mx, int my) 1341{ 1342 uint32_t loop_cnt; 1343 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1344 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1345 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1346 v16i8 filt_hz0, filt_hz1, mask0, mask1; 1347 v16u8 out; 1348 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1349 v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3; 1350 v8i16 filt, filt_vt0, filt_vt1, filt_vt2; 1351 1352 mask0 = LD_SB(&mc_filt_mask_arr[16]); 1353 1354 src -= (1 + 2 * src_stride); 1355 1356 /* rearranging filter */ 1357 filt = LD_SH(filter_horiz); 1358 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1359 1360 mask1 = mask0 + 2; 1361 1362 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1363 src += (5 * src_stride); 1364 1365 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1366 hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); 1367 hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); 1368 hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); 1369 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); 1370 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 1371 1372 filt = LD_SH(filter_vert); 1373 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 1374 1375 for (loop_cnt = (height >> 2); loop_cnt--;) { 1376 LD_SB4(src, src_stride, src5, src6, src7, src8); 1377 XORI_B4_128_SB(src5, src6, src7, src8); 1378 src += (4 * src_stride); 1379 1380 hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); 1381 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 1382 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1383 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 1384 1385 hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); 1386 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); 1387 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 1388 tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); 1389 1390 SRARI_H2_SH(tmp0, tmp1, 7); 1391 SAT_SH2_SH(tmp0, tmp1, 7); 1392 out = PCKEV_XORI128_UB(tmp0, tmp1); 1393 ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1394 dst += (4 * dst_stride); 1395 1396 hz_out3 = hz_out7; 1397 out0 = out2; 1398 out1 = out3; 1399 } 1400} 1401 1402void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 1403 uint8_t *src, ptrdiff_t src_stride, 1404 int height, int mx, int my) 1405{ 1406 uint32_t loop_cnt; 1407 const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1408 const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1409 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1410 v16i8 filt_hz0, filt_hz1, mask0, mask1; 1411 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; 1412 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1413 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; 1414 v16u8 vec0, vec1; 1415 1416 mask0 = LD_SB(&mc_filt_mask_arr[0]); 1417 src -= (1 + 2 * src_stride); 1418 1419 /* rearranging filter */ 1420 filt = LD_SH(filter_horiz); 1421 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1422 1423 mask1 = mask0 + 2; 1424 1425 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1426 src += (5 * src_stride); 1427 1428 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1429 hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); 1430 hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); 1431 hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); 1432 hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); 1433 hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); 1434 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 1435 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); 1436 1437 filt = LD_SH(filter_vert); 1438 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 1439 1440 for (loop_cnt = (height >> 2); loop_cnt--;) { 1441 LD_SB4(src, src_stride, src5, src6, src7, src8); 1442 src += (4 * src_stride); 1443 1444 XORI_B4_128_SB(src5, src6, src7, src8); 1445 1446 hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); 1447 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1448 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 1449 1450 hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); 1451 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5); 1452 tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 1453 1454 hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); 1455 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 1456 tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); 1457 1458 hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); 1459 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); 1460 tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); 1461 1462 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1463 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1464 vec0 = PCKEV_XORI128_UB(tmp0, tmp1); 1465 vec1 = PCKEV_XORI128_UB(tmp2, tmp3); 1466 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride); 1467 dst += (4 * dst_stride); 1468 1469 hz_out4 = hz_out8; 1470 out0 = out2; 1471 out1 = out6; 1472 out3 = out5; 1473 out4 = out7; 1474 } 1475} 1476 1477void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 1478 uint8_t *src, ptrdiff_t src_stride, 1479 int height, int mx, int my) 1480{ 1481 int32_t multiple8_cnt; 1482 1483 for (multiple8_cnt = 2; multiple8_cnt--;) { 1484 ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height, 1485 mx, my); 1486 1487 src += 8; 1488 dst += 8; 1489 } 1490} 1491 1492static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, 1493 uint8_t *dst, int32_t dst_stride, 1494 const int8_t *filter) 1495{ 1496 v16i8 src0, src1, src2, src3, mask; 1497 v16u8 filt0, vec0, vec1, res0, res1; 1498 v8u16 vec2, vec3, filt; 1499 1500 mask = LD_SB(&mc_filt_mask_arr[16]); 1501 1502 /* rearranging filter */ 1503 filt = LD_UH(filter); 1504 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1505 1506 LD_SB4(src, src_stride, src0, src1, src2, src3); 1507 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); 1508 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); 1509 SRARI_H2_UH(vec2, vec3, 7); 1510 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); 1511 ST_W2(res0, 0, 1, dst, dst_stride); 1512 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1513} 1514 1515static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, 1516 uint8_t *dst, int32_t dst_stride, 1517 const int8_t *filter) 1518{ 1519 v16u8 vec0, vec1, vec2, vec3, filt0; 1520 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 1521 v16i8 res0, res1, res2, res3; 1522 v8u16 vec4, vec5, vec6, vec7, filt; 1523 1524 mask = LD_SB(&mc_filt_mask_arr[16]); 1525 1526 /* rearranging filter */ 1527 filt = LD_UH(filter); 1528 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1529 1530 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1531 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); 1532 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); 1533 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1534 vec4, vec5, vec6, vec7); 1535 SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); 1536 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, 1537 res0, res1, res2, res3); 1538 ST_W2(res0, 0, 1, dst, dst_stride); 1539 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1540 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride); 1541 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride); 1542} 1543 1544void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1545 uint8_t *src, ptrdiff_t src_stride, 1546 int height, int mx, int my) 1547{ 1548 const int8_t *filter = bilinear_filters_msa[mx - 1]; 1549 1550 if (4 == height) { 1551 common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); 1552 } else if (8 == height) { 1553 common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); 1554 } 1555} 1556 1557static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, 1558 uint8_t *dst, int32_t dst_stride, 1559 const int8_t *filter) 1560{ 1561 v16u8 filt0; 1562 v16i8 src0, src1, src2, src3, mask; 1563 v8u16 vec0, vec1, vec2, vec3, filt; 1564 1565 mask = LD_SB(&mc_filt_mask_arr[0]); 1566 1567 /* rearranging filter */ 1568 filt = LD_UH(filter); 1569 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1570 1571 LD_SB4(src, src_stride, src0, src1, src2, src3); 1572 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1573 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1574 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1575 vec0, vec1, vec2, vec3); 1576 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1577 PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); 1578 ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride); 1579} 1580 1581static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, 1582 uint8_t *dst, int32_t dst_stride, 1583 const int8_t *filter, int32_t height) 1584{ 1585 v16u8 filt0; 1586 v16i8 src0, src1, src2, src3, mask, out0, out1; 1587 v8u16 vec0, vec1, vec2, vec3, filt; 1588 1589 mask = LD_SB(&mc_filt_mask_arr[0]); 1590 1591 /* rearranging filter */ 1592 filt = LD_UH(filter); 1593 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1594 1595 LD_SB4(src, src_stride, src0, src1, src2, src3); 1596 src += (4 * src_stride); 1597 1598 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1599 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1600 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1601 vec0, vec1, vec2, vec3); 1602 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1603 1604 LD_SB4(src, src_stride, src0, src1, src2, src3); 1605 src += (4 * src_stride); 1606 1607 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1608 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1609 1610 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1611 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1612 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1613 vec0, vec1, vec2, vec3); 1614 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1615 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1616 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1617 dst += (8 * dst_stride); 1618 1619 if (16 == height) { 1620 LD_SB4(src, src_stride, src0, src1, src2, src3); 1621 src += (4 * src_stride); 1622 1623 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1624 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1625 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1626 vec0, vec1, vec2, vec3); 1627 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1628 LD_SB4(src, src_stride, src0, src1, src2, src3); 1629 src += (4 * src_stride); 1630 1631 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1632 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1633 1634 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1635 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1636 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1637 vec0, vec1, vec2, vec3); 1638 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1639 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1640 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1641 } 1642} 1643 1644void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1645 uint8_t *src, ptrdiff_t src_stride, 1646 int height, int mx, int my) 1647{ 1648 const int8_t *filter = bilinear_filters_msa[mx - 1]; 1649 1650 if (4 == height) { 1651 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); 1652 } else { 1653 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, 1654 height); 1655 } 1656} 1657 1658void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1659 uint8_t *src, ptrdiff_t src_stride, 1660 int height, int mx, int my) 1661{ 1662 uint32_t loop_cnt; 1663 const int8_t *filter = bilinear_filters_msa[mx - 1]; 1664 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 1665 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1666 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; 1667 1668 mask = LD_SB(&mc_filt_mask_arr[0]); 1669 1670 loop_cnt = (height >> 2) - 1; 1671 1672 /* rearranging filter */ 1673 filt = LD_UH(filter); 1674 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1675 1676 LD_SB4(src, src_stride, src0, src2, src4, src6); 1677 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 1678 src += (4 * src_stride); 1679 1680 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 1681 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 1682 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 1683 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 1684 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1685 out0, out1, out2, out3); 1686 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 1687 out4, out5, out6, out7); 1688 SRARI_H4_UH(out0, out1, out2, out3, 7); 1689 SRARI_H4_UH(out4, out5, out6, out7, 7); 1690 PCKEV_ST_SB(out0, out1, dst); 1691 dst += dst_stride; 1692 PCKEV_ST_SB(out2, out3, dst); 1693 dst += dst_stride; 1694 PCKEV_ST_SB(out4, out5, dst); 1695 dst += dst_stride; 1696 PCKEV_ST_SB(out6, out7, dst); 1697 dst += dst_stride; 1698 1699 for (; loop_cnt--;) { 1700 LD_SB4(src, src_stride, src0, src2, src4, src6); 1701 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 1702 src += (4 * src_stride); 1703 1704 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 1705 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 1706 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 1707 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 1708 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1709 out0, out1, out2, out3); 1710 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 1711 out4, out5, out6, out7); 1712 SRARI_H4_UH(out0, out1, out2, out3, 7); 1713 SRARI_H4_UH(out4, out5, out6, out7, 7); 1714 PCKEV_ST_SB(out0, out1, dst); 1715 dst += dst_stride; 1716 PCKEV_ST_SB(out2, out3, dst); 1717 dst += dst_stride; 1718 PCKEV_ST_SB(out4, out5, dst); 1719 dst += dst_stride; 1720 PCKEV_ST_SB(out6, out7, dst); 1721 dst += dst_stride; 1722 } 1723} 1724 1725static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, 1726 uint8_t *dst, int32_t dst_stride, 1727 const int8_t *filter) 1728{ 1729 v16i8 src0, src1, src2, src3, src4; 1730 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; 1731 v16u8 filt0; 1732 v8i16 filt; 1733 v8u16 tmp0, tmp1; 1734 1735 filt = LD_SH(filter); 1736 filt0 = (v16u8) __msa_splati_h(filt, 0); 1737 1738 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1739 src += (5 * src_stride); 1740 1741 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, 1742 src10_r, src21_r, src32_r, src43_r); 1743 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 1744 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); 1745 SRARI_H2_UH(tmp0, tmp1, 7); 1746 SAT_UH2_UH(tmp0, tmp1, 7); 1747 src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 1748 ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride); 1749} 1750 1751static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, 1752 uint8_t *dst, int32_t dst_stride, 1753 const int8_t *filter) 1754{ 1755 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1756 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; 1757 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; 1758 v8u16 tmp0, tmp1, tmp2, tmp3; 1759 v16u8 filt0; 1760 v8i16 filt; 1761 1762 filt = LD_SH(filter); 1763 filt0 = (v16u8) __msa_splati_h(filt, 0); 1764 1765 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1766 src += (8 * src_stride); 1767 1768 src8 = LD_SB(src); 1769 src += src_stride; 1770 1771 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1772 src32_r, src43_r); 1773 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1774 src76_r, src87_r); 1775 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 1776 src87_r, src76_r, src2110, src4332, src6554, src8776); 1777 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, 1778 tmp0, tmp1, tmp2, tmp3); 1779 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1780 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1781 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); 1782 ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1783} 1784 1785void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, 1786 uint8_t *src, ptrdiff_t src_stride, 1787 int height, int mx, int my) 1788{ 1789 const int8_t *filter = bilinear_filters_msa[my - 1]; 1790 1791 if (4 == height) { 1792 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); 1793 } else if (8 == height) { 1794 common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); 1795 } 1796} 1797 1798static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, 1799 uint8_t *dst, int32_t dst_stride, 1800 const int8_t *filter) 1801{ 1802 v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; 1803 v16i8 out0, out1; 1804 v8u16 tmp0, tmp1, tmp2, tmp3; 1805 v8i16 filt; 1806 1807 /* rearranging filter_y */ 1808 filt = LD_SH(filter); 1809 filt0 = (v16u8) __msa_splati_h(filt, 0); 1810 1811 LD_UB5(src, src_stride, src0, src1, src2, src3, src4); 1812 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); 1813 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); 1814 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1815 tmp0, tmp1, tmp2, tmp3); 1816 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1817 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1818 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 1819 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1820} 1821 1822static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, 1823 uint8_t *dst, int32_t dst_stride, 1824 const int8_t *filter, int32_t height) 1825{ 1826 uint32_t loop_cnt; 1827 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1828 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; 1829 v16i8 out0, out1; 1830 v8u16 tmp0, tmp1, tmp2, tmp3; 1831 v8i16 filt; 1832 1833 /* rearranging filter_y */ 1834 filt = LD_SH(filter); 1835 filt0 = (v16u8) __msa_splati_h(filt, 0); 1836 1837 src0 = LD_UB(src); 1838 src += src_stride; 1839 1840 for (loop_cnt = (height >> 3); loop_cnt--;) { 1841 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); 1842 src += (8 * src_stride); 1843 1844 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, 1845 vec0, vec1, vec2, vec3); 1846 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, 1847 vec4, vec5, vec6, vec7); 1848 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1849 tmp0, tmp1, tmp2, tmp3); 1850 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1851 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1852 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 1853 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1854 1855 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 1856 tmp0, tmp1, tmp2, tmp3); 1857 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1858 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1859 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 1860 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1861 dst += (8 * dst_stride); 1862 1863 src0 = src8; 1864 } 1865} 1866 1867void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, 1868 uint8_t *src, ptrdiff_t src_stride, 1869 int height, int mx, int my) 1870{ 1871 const int8_t *filter = bilinear_filters_msa[my - 1]; 1872 1873 if (4 == height) { 1874 common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); 1875 } else { 1876 common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, 1877 height); 1878 } 1879} 1880 1881void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, 1882 uint8_t *src, ptrdiff_t src_stride, 1883 int height, int mx, int my) 1884{ 1885 uint32_t loop_cnt; 1886 const int8_t *filter = bilinear_filters_msa[my - 1]; 1887 v16u8 src0, src1, src2, src3, src4; 1888 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; 1889 v8u16 tmp0, tmp1, tmp2, tmp3; 1890 v8i16 filt; 1891 1892 /* rearranging filter_y */ 1893 filt = LD_SH(filter); 1894 filt0 = (v16u8) __msa_splati_h(filt, 0); 1895 1896 src0 = LD_UB(src); 1897 src += src_stride; 1898 1899 for (loop_cnt = (height >> 2); loop_cnt--;) { 1900 LD_UB4(src, src_stride, src1, src2, src3, src4); 1901 src += (4 * src_stride); 1902 1903 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); 1904 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); 1905 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 1906 SRARI_H2_UH(tmp0, tmp1, 7); 1907 SAT_UH2_UH(tmp0, tmp1, 7); 1908 PCKEV_ST_SB(tmp0, tmp1, dst); 1909 dst += dst_stride; 1910 1911 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); 1912 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); 1913 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 1914 SRARI_H2_UH(tmp2, tmp3, 7); 1915 SAT_UH2_UH(tmp2, tmp3, 7); 1916 PCKEV_ST_SB(tmp2, tmp3, dst); 1917 dst += dst_stride; 1918 1919 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); 1920 SRARI_H2_UH(tmp0, tmp1, 7); 1921 SAT_UH2_UH(tmp0, tmp1, 7); 1922 PCKEV_ST_SB(tmp0, tmp1, dst); 1923 dst += dst_stride; 1924 1925 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); 1926 SRARI_H2_UH(tmp2, tmp3, 7); 1927 SAT_UH2_UH(tmp2, tmp3, 7); 1928 PCKEV_ST_SB(tmp2, tmp3, dst); 1929 dst += dst_stride; 1930 1931 src0 = src4; 1932 } 1933} 1934 1935static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, 1936 uint8_t *dst, int32_t dst_stride, 1937 const int8_t *filter_horiz, 1938 const int8_t *filter_vert) 1939{ 1940 v16i8 src0, src1, src2, src3, src4, mask; 1941 v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; 1942 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; 1943 1944 mask = LD_SB(&mc_filt_mask_arr[16]); 1945 1946 /* rearranging filter */ 1947 filt = LD_UH(filter_horiz); 1948 filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); 1949 1950 filt = LD_UH(filter_vert); 1951 filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); 1952 1953 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1954 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); 1955 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); 1956 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 1957 hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); 1958 hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2); 1959 1960 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 1961 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 1962 SRARI_H2_UH(tmp0, tmp1, 7); 1963 SAT_UH2_UH(tmp0, tmp1, 7); 1964 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); 1965 ST_W2(res0, 0, 1, dst, dst_stride); 1966 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1967} 1968 1969static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, 1970 uint8_t *dst, int32_t dst_stride, 1971 const int8_t *filter_horiz, 1972 const int8_t *filter_vert) 1973{ 1974 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; 1975 v16i8 res0, res1, res2, res3; 1976 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; 1977 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1978 v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; 1979 1980 mask = LD_SB(&mc_filt_mask_arr[16]); 1981 1982 /* rearranging filter */ 1983 filt = LD_UH(filter_horiz); 1984 filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); 1985 1986 filt = LD_UH(filter_vert); 1987 filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); 1988 1989 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1990 src += (8 * src_stride); 1991 src8 = LD_SB(src); 1992 1993 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); 1994 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); 1995 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); 1996 hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); 1997 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); 1998 SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1, 1999 hz_out3, hz_out5); 2000 hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); 2001 2002 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 2003 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); 2004 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, 2005 vec4, vec5, vec6, vec7); 2006 SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); 2007 SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); 2008 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, 2009 res0, res1, res2, res3); 2010 ST_W2(res0, 0, 1, dst, dst_stride); 2011 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 2012 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride); 2013 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride); 2014} 2015 2016void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 2017 uint8_t *src, ptrdiff_t src_stride, 2018 int height, int mx, int my) 2019{ 2020 const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; 2021 const int8_t *filter_vert = bilinear_filters_msa[my - 1]; 2022 2023 if (4 == height) { 2024 common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, 2025 filter_horiz, filter_vert); 2026 } else if (8 == height) { 2027 common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, 2028 filter_horiz, filter_vert); 2029 } 2030} 2031 2032static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, 2033 uint8_t *dst, int32_t dst_stride, 2034 const int8_t *filter_horiz, 2035 const int8_t *filter_vert) 2036{ 2037 v16i8 src0, src1, src2, src3, src4, mask, out0, out1; 2038 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; 2039 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 2040 v8i16 filt; 2041 2042 mask = LD_SB(&mc_filt_mask_arr[0]); 2043 2044 /* rearranging filter */ 2045 filt = LD_SH(filter_horiz); 2046 filt_hz = (v16u8) __msa_splati_h(filt, 0); 2047 2048 filt = LD_SH(filter_vert); 2049 filt_vt = (v16u8) __msa_splati_h(filt, 0); 2050 2051 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 2052 2053 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2054 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2055 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2056 tmp0 = __msa_dotp_u_h(vec0, filt_vt); 2057 2058 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2059 vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2060 tmp1 = __msa_dotp_u_h(vec1, filt_vt); 2061 2062 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2063 vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2064 tmp2 = __msa_dotp_u_h(vec2, filt_vt); 2065 2066 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2067 vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2068 tmp3 = __msa_dotp_u_h(vec3, filt_vt); 2069 2070 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 2071 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 2072 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 2073 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2074} 2075 2076static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, 2077 uint8_t *dst, int32_t dst_stride, 2078 const int8_t *filter_horiz, 2079 const int8_t *filter_vert, 2080 int32_t height) 2081{ 2082 uint32_t loop_cnt; 2083 v16i8 src0, src1, src2, src3, src4, mask, out0, out1; 2084 v16u8 filt_hz, filt_vt, vec0; 2085 v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; 2086 v8i16 filt; 2087 2088 mask = LD_SB(&mc_filt_mask_arr[0]); 2089 2090 /* rearranging filter */ 2091 filt = LD_SH(filter_horiz); 2092 filt_hz = (v16u8) __msa_splati_h(filt, 0); 2093 2094 filt = LD_SH(filter_vert); 2095 filt_vt = (v16u8) __msa_splati_h(filt, 0); 2096 2097 src0 = LD_SB(src); 2098 src += src_stride; 2099 2100 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2101 2102 for (loop_cnt = (height >> 3); loop_cnt--;) { 2103 LD_SB4(src, src_stride, src1, src2, src3, src4); 2104 src += (4 * src_stride); 2105 2106 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2107 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2108 tmp1 = __msa_dotp_u_h(vec0, filt_vt); 2109 2110 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2111 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2112 tmp2 = __msa_dotp_u_h(vec0, filt_vt); 2113 2114 SRARI_H2_UH(tmp1, tmp2, 7); 2115 SAT_UH2_UH(tmp1, tmp2, 7); 2116 2117 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2118 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2119 tmp3 = __msa_dotp_u_h(vec0, filt_vt); 2120 2121 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2122 LD_SB4(src, src_stride, src1, src2, src3, src4); 2123 src += (4 * src_stride); 2124 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2125 tmp4 = __msa_dotp_u_h(vec0, filt_vt); 2126 2127 SRARI_H2_UH(tmp3, tmp4, 7); 2128 SAT_UH2_UH(tmp3, tmp4, 7); 2129 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); 2130 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2131 2132 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2133 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2134 tmp5 = __msa_dotp_u_h(vec0, filt_vt); 2135 2136 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2137 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2138 tmp6 = __msa_dotp_u_h(vec0, filt_vt); 2139 2140 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2141 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2142 tmp7 = __msa_dotp_u_h(vec0, filt_vt); 2143 2144 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2145 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2146 tmp8 = __msa_dotp_u_h(vec0, filt_vt); 2147 2148 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7); 2149 SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); 2150 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); 2151 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 2152 dst += (8 * dst_stride); 2153 } 2154} 2155 2156void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 2157 uint8_t *src, ptrdiff_t src_stride, 2158 int height, int mx, int my) 2159{ 2160 const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; 2161 const int8_t *filter_vert = bilinear_filters_msa[my - 1]; 2162 2163 if (4 == height) { 2164 common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, 2165 filter_horiz, filter_vert); 2166 } else { 2167 common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, 2168 filter_horiz, filter_vert, height); 2169 } 2170} 2171 2172void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 2173 uint8_t *src, ptrdiff_t src_stride, 2174 int height, int mx, int my) 2175{ 2176 uint32_t loop_cnt; 2177 const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; 2178 const int8_t *filter_vert = bilinear_filters_msa[my - 1]; 2179 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 2180 v16u8 filt_hz, filt_vt, vec0, vec1; 2181 v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; 2182 v8i16 filt; 2183 2184 mask = LD_SB(&mc_filt_mask_arr[0]); 2185 2186 /* rearranging filter */ 2187 filt = LD_SH(filter_horiz); 2188 filt_hz = (v16u8) __msa_splati_h(filt, 0); 2189 2190 filt = LD_SH(filter_vert); 2191 filt_vt = (v16u8) __msa_splati_h(filt, 0); 2192 2193 LD_SB2(src, 8, src0, src1); 2194 src += src_stride; 2195 2196 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2197 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2198 2199 2200 for (loop_cnt = (height >> 2); loop_cnt--;) { 2201 LD_SB4(src, src_stride, src0, src2, src4, src6); 2202 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 2203 src += (4 * src_stride); 2204 2205 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2206 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2207 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 2208 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2209 SRARI_H2_UH(tmp1, tmp2, 7); 2210 SAT_UH2_UH(tmp1, tmp2, 7); 2211 PCKEV_ST_SB(tmp1, tmp2, dst); 2212 dst += dst_stride; 2213 2214 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2215 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2216 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 2217 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2218 SRARI_H2_UH(tmp1, tmp2, 7); 2219 SAT_UH2_UH(tmp1, tmp2, 7); 2220 PCKEV_ST_SB(tmp1, tmp2, dst); 2221 dst += dst_stride; 2222 2223 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2224 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7); 2225 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 2226 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2227 SRARI_H2_UH(tmp1, tmp2, 7); 2228 SAT_UH2_UH(tmp1, tmp2, 7); 2229 PCKEV_ST_SB(tmp1, tmp2, dst); 2230 dst += dst_stride; 2231 2232 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7); 2233 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7); 2234 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 2235 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2236 SRARI_H2_UH(tmp1, tmp2, 7); 2237 SAT_UH2_UH(tmp1, tmp2, 7); 2238 PCKEV_ST_SB(tmp1, tmp2, dst); 2239 dst += dst_stride; 2240 } 2241} 2242 2243void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, 2244 uint8_t *src, ptrdiff_t src_stride, 2245 int height, int mx, int my) 2246{ 2247 int32_t cnt; 2248 uint64_t out0, out1, out2, out3, out4, out5, out6, out7; 2249 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2250 2251 if (0 == height % 8) { 2252 for (cnt = height >> 3; cnt--;) { 2253 LD_UB8(src, src_stride, 2254 src0, src1, src2, src3, src4, src5, src6, src7); 2255 src += (8 * src_stride); 2256 2257 out0 = __msa_copy_u_d((v2i64) src0, 0); 2258 out1 = __msa_copy_u_d((v2i64) src1, 0); 2259 out2 = __msa_copy_u_d((v2i64) src2, 0); 2260 out3 = __msa_copy_u_d((v2i64) src3, 0); 2261 out4 = __msa_copy_u_d((v2i64) src4, 0); 2262 out5 = __msa_copy_u_d((v2i64) src5, 0); 2263 out6 = __msa_copy_u_d((v2i64) src6, 0); 2264 out7 = __msa_copy_u_d((v2i64) src7, 0); 2265 2266 SD4(out0, out1, out2, out3, dst, dst_stride); 2267 dst += (4 * dst_stride); 2268 SD4(out4, out5, out6, out7, dst, dst_stride); 2269 dst += (4 * dst_stride); 2270 } 2271 } else if (0 == height % 4) { 2272 for (cnt = (height / 4); cnt--;) { 2273 LD_UB4(src, src_stride, src0, src1, src2, src3); 2274 src += (4 * src_stride); 2275 out0 = __msa_copy_u_d((v2i64) src0, 0); 2276 out1 = __msa_copy_u_d((v2i64) src1, 0); 2277 out2 = __msa_copy_u_d((v2i64) src2, 0); 2278 out3 = __msa_copy_u_d((v2i64) src3, 0); 2279 2280 SD4(out0, out1, out2, out3, dst, dst_stride); 2281 dst += (4 * dst_stride); 2282 } 2283 } 2284} 2285 2286static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, 2287 uint8_t *dst, int32_t dst_stride, 2288 int32_t height, int32_t width) 2289{ 2290 int32_t cnt, loop_cnt; 2291 uint8_t *src_tmp, *dst_tmp; 2292 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2293 2294 for (cnt = (width >> 4); cnt--;) { 2295 src_tmp = src; 2296 dst_tmp = dst; 2297 2298 for (loop_cnt = (height >> 3); loop_cnt--;) { 2299 LD_UB8(src_tmp, src_stride, 2300 src0, src1, src2, src3, src4, src5, src6, src7); 2301 src_tmp += (8 * src_stride); 2302 2303 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, 2304 dst_tmp, dst_stride); 2305 dst_tmp += (8 * dst_stride); 2306 } 2307 2308 src += 16; 2309 dst += 16; 2310 } 2311} 2312 2313void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, 2314 uint8_t *src, ptrdiff_t src_stride, 2315 int height, int mx, int my) 2316{ 2317 int32_t cnt; 2318 v16u8 src0, src1, src2, src3; 2319 2320 if (0 == height % 8) { 2321 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); 2322 } else if (0 == height % 4) { 2323 for (cnt = (height >> 2); cnt--;) { 2324 LD_UB4(src, src_stride, src0, src1, src2, src3); 2325 src += (4 * src_stride); 2326 2327 ST_UB4(src0, src1, src2, src3, dst, dst_stride); 2328 dst += (4 * dst_stride); 2329 } 2330 } 2331} 2332