1/* 2 * Copyright (c) 2015 -2017 Parag Salasakar (Parag.Salasakar@imgtec.com) 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include "libavutil/mips/generic_macros_msa.h" 22#include "h264dsp_mips.h" 23 24static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = { 25 /* 8 width cases */ 26 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 27 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 28 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 29 30 /* 4 width cases */ 31 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24, 32 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23, 33 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22, 34}; 35 36#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \ 37 out1, out2) \ 38{ \ 39 v16i8 tmp0_m, tmp1_m; \ 40 v16i8 minus5b_m = __msa_ldi_b(-5); \ 41 v16i8 plus20b_m = __msa_ldi_b(20); \ 42 \ 43 ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \ 44 HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \ 45 ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \ 46 DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \ 47 ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \ 48 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \ 49} 50 51#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \ 52( { \ 53 v8i16 out0_m; \ 54 v16i8 tmp0_m; \ 55 v16i8 minus5b = __msa_ldi_b(-5); \ 56 v16i8 plus20b = __msa_ldi_b(20); \ 57 \ 58 tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \ 59 out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \ 60 \ 61 tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \ 62 out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \ 63 \ 64 tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \ 65 out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \ 66 \ 67 out0_m; \ 68} ) 69 70#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ 71( { \ 72 v8i16 out0_m; \ 73 \ 74 out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \ 75 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \ 76 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \ 77 \ 78 out0_m; \ 79} ) 80 81#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \ 82( { \ 83 v4i32 out0_m; \ 84 \ 85 out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \ 86 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \ 87 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \ 88 out0_m = __msa_srari_w(out0_m, 10); \ 89 out0_m = __msa_sat_s_w(out0_m, 7); \ 90 out0_m; \ 91} ) 92 93static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y, 94 uint8_t *dst, int32_t stride) 95{ 96 const int16_t filt_const0 = 0xfb01; 97 const int16_t filt_const1 = 0x1414; 98 const int16_t filt_const2 = 0x1fb; 99 v16u8 out; 100 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8; 101 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 102 v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r; 103 v16i8 mask0, mask1, mask2, filt0, filt1, filt2; 104 v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1; 105 106 filt0 = (v16i8) __msa_fill_h(filt_const0); 107 filt1 = (v16i8) __msa_fill_h(filt_const1); 108 filt2 = (v16i8) __msa_fill_h(filt_const2); 109 110 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 111 112 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 113 src_y += (5 * stride); 114 115 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); 116 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); 117 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3); 118 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4); 119 120 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); 121 122 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 123 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 124 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2); 125 hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2); 126 127 SRARI_H2_SH(hz_out0, hz_out1, 5); 128 SAT_SH2_SH(hz_out0, hz_out1, 7); 129 130 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 131 132 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); 133 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); 134 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); 135 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); 136 137 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); 138 ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r); 139 ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r); 140 vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 141 filt2); 142 vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 143 filt2); 144 SRARI_H2_SH(vt_out0, vt_out1, 5); 145 SAT_SH2_SH(vt_out0, vt_out1, 7); 146 147 out0 = __msa_srari_h((hz_out0 + vt_out0), 1); 148 out1 = __msa_srari_h((hz_out1 + vt_out1), 1); 149 150 SAT_SH2_SH(out0, out1, 7); 151 out = PCKEV_XORI128_UB(out0, out1); 152 ST_W4(out, 0, 1, 2, 3, dst, stride); 153} 154 155static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y, 156 uint8_t *dst, int32_t stride) 157{ 158 const int16_t filt_const0 = 0xfb01; 159 const int16_t filt_const1 = 0x1414; 160 const int16_t filt_const2 = 0x1fb; 161 v16u8 out0, out1; 162 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 163 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 164 v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12; 165 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r; 166 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r; 167 v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2; 168 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 169 v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3; 170 171 filt0 = (v16i8) __msa_fill_h(filt_const0); 172 filt1 = (v16i8) __msa_fill_h(filt_const1); 173 filt2 = (v16i8) __msa_fill_h(filt_const2); 174 175 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 176 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 177 src_y += (5 * stride); 178 179 XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 180 181 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 182 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 183 src_x += (4 * stride); 184 185 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 186 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 187 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 188 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 189 190 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 191 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 192 193 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 194 src_y += (4 * stride); 195 XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8); 196 197 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4, 198 src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r); 199 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8, 200 src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r); 201 vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 202 filt2); 203 vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1, 204 filt2); 205 vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 206 filt2); 207 vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1, 208 filt2); 209 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 210 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 211 212 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 213 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 214 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 215 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 216 217 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 218 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 219 220 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 221 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 222 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 223 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 224 dst += (4 * stride); 225 226 LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12); 227 XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12); 228 229 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 230 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 231 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 232 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 233 234 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 235 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 236 237 ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10, 238 src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r, 239 src_vt1211_r); 240 vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1, 241 filt2); 242 vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1, 243 filt2); 244 vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1, 245 filt2); 246 vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0, 247 filt1, filt2); 248 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 249 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 250 251 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 252 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 253 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 254 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 255 256 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 257 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 258 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 259 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 260} 261 262static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x, 263 const uint8_t *src_y, uint8_t *dst, 264 int32_t stride) 265{ 266 const int16_t filt_const0 = 0xfb01; 267 const int16_t filt_const1 = 0x1414; 268 const int16_t filt_const2 = 0x1fb; 269 const uint8_t *src_x_tmp = src_x; 270 const uint8_t *src_y_tmp = src_y; 271 uint8_t *dst_tmp = dst; 272 uint32_t multiple8_cnt, loop_cnt; 273 v16u8 tmp0, tmp1; 274 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 275 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 276 v16i8 src_vt7, src_vt8; 277 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r; 278 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2; 279 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 280 v8i16 vt_out3, out0, out1, out2, out3; 281 282 filt0 = (v16i8) __msa_fill_h(filt_const0); 283 filt1 = (v16i8) __msa_fill_h(filt_const1); 284 filt2 = (v16i8) __msa_fill_h(filt_const2); 285 286 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 287 288 for (multiple8_cnt = 2; multiple8_cnt--;) { 289 src_x = src_x_tmp; 290 src_y = src_y_tmp; 291 dst = dst_tmp; 292 293 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 294 src_y += (5 * stride); 295 296 XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 297 298 for (loop_cnt = 4; loop_cnt--;) { 299 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 300 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 301 src_x += (4 * stride); 302 303 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 304 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 305 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 306 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 307 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 308 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 309 310 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 311 src_y += (4 * stride); 312 313 XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8); 314 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, 315 src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, 316 src_vt43_r); 317 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, 318 src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, 319 src_vt87_r); 320 vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, 321 filt1, filt2); 322 vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, 323 filt1, filt2); 324 vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, 325 filt1, filt2); 326 vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, 327 filt1, filt2); 328 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 329 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 330 331 out0 = __msa_srari_h((hz_out0 + vt_out0), 1); 332 out1 = __msa_srari_h((hz_out1 + vt_out1), 1); 333 out2 = __msa_srari_h((hz_out2 + vt_out2), 1); 334 out3 = __msa_srari_h((hz_out3 + vt_out3), 1); 335 336 SAT_SH4_SH(out0, out1, out2, out3, 7); 337 tmp0 = PCKEV_XORI128_UB(out0, out1); 338 tmp1 = PCKEV_XORI128_UB(out2, out3); 339 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride); 340 dst += (4 * stride); 341 342 src_vt0 = src_vt4; 343 src_vt1 = src_vt5; 344 src_vt2 = src_vt6; 345 src_vt3 = src_vt7; 346 src_vt4 = src_vt8; 347 } 348 349 src_x_tmp += 8; 350 src_y_tmp += 8; 351 dst_tmp += 8; 352 } 353} 354 355static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x, 356 const uint8_t *src_y, 357 uint8_t *dst, 358 int32_t stride) 359{ 360 uint32_t tp0, tp1, tp2, tp3; 361 const int16_t filt_const0 = 0xfb01; 362 const int16_t filt_const1 = 0x1414; 363 const int16_t filt_const2 = 0x1fb; 364 v16u8 res, dst0 = { 0 }; 365 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8; 366 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 367 v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r; 368 v16i8 mask0, mask1, mask2, filt0, filt1, filt2; 369 v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1; 370 371 filt0 = (v16i8) __msa_fill_h(filt_const0); 372 filt1 = (v16i8) __msa_fill_h(filt_const1); 373 filt2 = (v16i8) __msa_fill_h(filt_const2); 374 375 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 376 377 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 378 src_y += (5 * stride); 379 380 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); 381 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); 382 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3); 383 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4); 384 385 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); 386 387 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 388 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 389 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2); 390 hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2); 391 392 SRARI_H2_SH(hz_out0, hz_out1, 5); 393 SAT_SH2_SH(hz_out0, hz_out1, 7); 394 395 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 396 397 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); 398 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); 399 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); 400 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); 401 402 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); 403 ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r); 404 ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r); 405 vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 406 filt2); 407 vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 408 filt2); 409 SRARI_H2_SH(vt_out0, vt_out1, 5); 410 SAT_SH2_SH(vt_out0, vt_out1, 7); 411 LW4(dst, stride, tp0, tp1, tp2, tp3); 412 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 413 414 res1 = __msa_srari_h((hz_out1 + vt_out1), 1); 415 res0 = __msa_srari_h((hz_out0 + vt_out0), 1); 416 417 SAT_SH2_SH(res0, res1, 7); 418 res = PCKEV_XORI128_UB(res0, res1); 419 dst0 = __msa_aver_u_b(res, dst0); 420 421 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 422} 423 424static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x, 425 const uint8_t *src_y, 426 uint8_t *dst, 427 int32_t stride) 428{ 429 const int16_t filt_const0 = 0xfb01; 430 const int16_t filt_const1 = 0x1414; 431 const int16_t filt_const2 = 0x1fb; 432 uint64_t tp0, tp1, tp2, tp3; 433 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 434 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2; 435 v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8; 436 v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2; 437 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r; 438 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r; 439 v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2; 440 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 441 v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3; 442 443 filt0 = (v16i8) __msa_fill_h(filt_const0); 444 filt1 = (v16i8) __msa_fill_h(filt_const1); 445 filt2 = (v16i8) __msa_fill_h(filt_const2); 446 447 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 448 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 449 src_y += (5 * stride); 450 451 XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 452 453 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 454 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 455 src_x += (4 * stride); 456 457 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 458 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 459 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 460 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 461 462 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 463 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 464 465 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 466 src_y += (4 * stride); 467 XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8); 468 469 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4, 470 src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r); 471 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8, 472 src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r); 473 vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 474 filt2); 475 vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1, 476 filt2); 477 vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 478 filt2); 479 vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1, 480 filt2); 481 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 482 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 483 484 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 485 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 486 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 487 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 488 489 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 490 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 491 492 LD4(dst, stride, tp0, tp1, tp2, tp3); 493 INSERT_D2_UB(tp0, tp1, dst0); 494 INSERT_D2_UB(tp2, tp3, dst1); 495 496 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 497 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 498 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 499 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 500 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 501 dst += (4 * stride); 502 503 LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12); 504 XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12); 505 506 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 507 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 508 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 509 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 510 511 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 512 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 513 514 ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10, 515 src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r, 516 src_vt1211_r); 517 vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1, 518 filt2); 519 vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1, 520 filt2); 521 vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1, 522 filt2); 523 vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0, 524 filt1, filt2); 525 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 526 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 527 528 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 529 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 530 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 531 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 532 533 LD4(dst, stride, tp0, tp1, tp2, tp3); 534 INSERT_D2_UB(tp0, tp1, dst0); 535 INSERT_D2_UB(tp2, tp3, dst1); 536 537 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 538 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 539 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 540 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 541 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 542} 543 544static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x, 545 const uint8_t *src_y, 546 uint8_t *dst, 547 int32_t stride) 548{ 549 const int16_t filt_const0 = 0xfb01; 550 const int16_t filt_const1 = 0x1414; 551 const int16_t filt_const2 = 0x1fb; 552 const uint8_t *src_x_tmp = src_x; 553 const uint8_t *src_y_tmp = src_y; 554 uint8_t *dst_tmp = dst; 555 uint32_t multiple8_cnt, loop_cnt; 556 uint64_t tp0, tp1, tp2, tp3; 557 v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 }; 558 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 559 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 560 v16i8 src_vt7, src_vt8; 561 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r; 562 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2; 563 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 564 v8i16 vt_out3, out0, out1, out2, out3; 565 566 filt0 = (v16i8) __msa_fill_h(filt_const0); 567 filt1 = (v16i8) __msa_fill_h(filt_const1); 568 filt2 = (v16i8) __msa_fill_h(filt_const2); 569 570 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 571 572 for (multiple8_cnt = 2; multiple8_cnt--;) { 573 src_x = src_x_tmp; 574 src_y = src_y_tmp; 575 dst = dst_tmp; 576 577 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 578 src_y += (5 * stride); 579 580 XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 581 582 for (loop_cnt = 4; loop_cnt--;) { 583 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 584 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 585 src_x += (4 * stride); 586 587 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 588 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 589 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 590 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 591 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 592 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 593 594 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 595 src_y += (4 * stride); 596 597 XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8); 598 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, 599 src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, 600 src_vt43_r); 601 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, 602 src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, 603 src_vt87_r); 604 vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, 605 filt1, filt2); 606 vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, 607 filt1, filt2); 608 vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, 609 filt1, filt2); 610 vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, 611 filt1, filt2); 612 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 613 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 614 615 out0 = __msa_srari_h((hz_out0 + vt_out0), 1); 616 out1 = __msa_srari_h((hz_out1 + vt_out1), 1); 617 out2 = __msa_srari_h((hz_out2 + vt_out2), 1); 618 out3 = __msa_srari_h((hz_out3 + vt_out3), 1); 619 620 LD4(dst, stride, tp0, tp1, tp2, tp3); 621 INSERT_D2_UB(tp0, tp1, dst0); 622 INSERT_D2_UB(tp2, tp3, dst1); 623 624 SAT_SH4_SH(out0, out1, out2, out3, 7); 625 tmp0 = PCKEV_XORI128_UB(out0, out1); 626 tmp1 = PCKEV_XORI128_UB(out2, out3); 627 AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1); 628 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 629 dst += (4 * stride); 630 631 src_vt0 = src_vt4; 632 src_vt1 = src_vt5; 633 src_vt2 = src_vt6; 634 src_vt3 = src_vt7; 635 src_vt4 = src_vt8; 636 } 637 638 src_x_tmp += 8; 639 src_y_tmp += 8; 640 dst_tmp += 8; 641 } 642} 643 644void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, 645 ptrdiff_t stride) 646{ 647 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 648 v16u8 src8, src9, src10, src11, src12, src13, src14, src15; 649 650 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 651 src += (8 * stride); 652 LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15); 653 654 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride); 655 dst += (8 * stride); 656 ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride); 657} 658 659void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, 660 ptrdiff_t stride) 661{ 662 uint64_t src0, src1, src2, src3, src4, src5, src6, src7; 663 664 LD4(src, stride, src0, src1, src2, src3); 665 src += 4 * stride; 666 LD4(src, stride, src4, src5, src6, src7); 667 SD4(src0, src1, src2, src3, dst, stride); 668 dst += 4 * stride; 669 SD4(src4, src5, src6, src7, dst, stride); 670} 671 672void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, 673 ptrdiff_t stride) 674{ 675 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 676 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 677 678 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 679 src += (8 * stride); 680 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 681 682 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, 683 dst2, dst3); 684 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, 685 dst6, dst7); 686 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride); 687 dst += (8 * stride); 688 689 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 690 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 691 692 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, 693 dst2, dst3); 694 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, 695 dst6, dst7); 696 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride); 697} 698 699void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, 700 ptrdiff_t stride) 701{ 702 uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; 703 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 704 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 705 706 LD4(src, stride, tp0, tp1, tp2, tp3); 707 src += 4 * stride; 708 LD4(src, stride, tp4, tp5, tp6, tp7); 709 INSERT_D2_UB(tp0, tp1, src0); 710 INSERT_D2_UB(tp2, tp3, src1); 711 INSERT_D2_UB(tp4, tp5, src2); 712 INSERT_D2_UB(tp6, tp7, src3); 713 714 LD4(dst, stride, tp0, tp1, tp2, tp3); 715 LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7); 716 INSERT_D2_UB(tp0, tp1, dst0); 717 INSERT_D2_UB(tp2, tp3, dst1); 718 INSERT_D2_UB(tp4, tp5, dst2); 719 INSERT_D2_UB(tp6, tp7, dst3); 720 721 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, 722 dst2, dst3); 723 724 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 725} 726 727void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src, 728 ptrdiff_t stride) 729{ 730 uint32_t tp0, tp1, tp2, tp3; 731 v16u8 src0 = { 0 }, dst0 = { 0 }; 732 733 LW4(src, stride, tp0, tp1, tp2, tp3); 734 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 735 LW4(dst, stride, tp0, tp1, tp2, tp3); 736 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 737 738 dst0 = __msa_aver_u_b(src0, dst0); 739 740 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 741} 742 743void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, 744 ptrdiff_t stride) 745{ 746 uint32_t loop_cnt; 747 v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6; 748 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11; 749 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 750 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 751 v16i8 minus5b = __msa_ldi_b(-5); 752 v16i8 plus20b = __msa_ldi_b(20); 753 754 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 755 mask3 = mask0 + 8; 756 mask4 = mask1 + 8; 757 mask5 = mask2 + 8; 758 src -= 2; 759 760 for (loop_cnt = 4; loop_cnt--;) { 761 LD_SB2(src, 16, src0, src1); 762 src += stride; 763 LD_SB2(src, 16, src2, src3); 764 src += stride; 765 LD_SB2(src, 16, src4, src5); 766 src += stride; 767 LD_SB2(src, 16, src6, src7); 768 src += stride; 769 770 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 771 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3); 772 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9); 773 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4); 774 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10); 775 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5); 776 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11); 777 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 778 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 779 minus5b, res0, res1, res2, res3); 780 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 781 plus20b, res0, res1, res2, res3); 782 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3); 783 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9); 784 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4); 785 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10); 786 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5); 787 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11); 788 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 789 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 790 minus5b, res4, res5, res6, res7); 791 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 792 plus20b, res4, res5, res6, res7); 793 SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2, 794 src0, src2, src4, src6); 795 SRARI_H4_SH(res0, res1, res2, res3, 5); 796 SRARI_H4_SH(res4, res5, res6, res7, 5); 797 SAT_SH4_SH(res0, res1, res2, res3, 7); 798 SAT_SH4_SH(res4, res5, res6, res7, 7); 799 PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1); 800 PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3); 801 dst0 = __msa_aver_s_b(dst0, src0); 802 dst1 = __msa_aver_s_b(dst1, src2); 803 dst2 = __msa_aver_s_b(dst2, src4); 804 dst3 = __msa_aver_s_b(dst3, src6); 805 XORI_B4_128_SB(dst0, dst1, dst2, dst3); 806 ST_SB4(dst0, dst1, dst2, dst3, dst, stride); 807 dst += (4 * stride); 808 } 809} 810 811void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, 812 ptrdiff_t stride) 813{ 814 uint32_t loop_cnt; 815 v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6; 816 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11; 817 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 818 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 819 v16i8 minus5b = __msa_ldi_b(-5); 820 v16i8 plus20b = __msa_ldi_b(20); 821 822 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 823 mask3 = mask0 + 8; 824 mask4 = mask1 + 8; 825 mask5 = mask2 + 8; 826 src -= 2; 827 828 for (loop_cnt = 4; loop_cnt--;) { 829 LD_SB2(src, 16, src0, src1); 830 src += stride; 831 LD_SB2(src, 16, src2, src3); 832 src += stride; 833 LD_SB2(src, 16, src4, src5); 834 src += stride; 835 LD_SB2(src, 16, src6, src7); 836 src += stride; 837 838 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 839 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3); 840 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9); 841 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4); 842 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10); 843 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5); 844 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11); 845 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 846 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 847 minus5b, res0, res1, res2, res3); 848 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 849 plus20b, res0, res1, res2, res3); 850 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3); 851 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9); 852 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4); 853 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10); 854 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5); 855 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11); 856 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 857 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 858 minus5b, res4, res5, res6, res7); 859 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 860 plus20b, res4, res5, res6, res7); 861 SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3, 862 src0, src2, src4, src6); 863 SRARI_H4_SH(res0, res1, res2, res3, 5); 864 SRARI_H4_SH(res4, res5, res6, res7, 5); 865 SAT_SH4_SH(res0, res1, res2, res3, 7); 866 SAT_SH4_SH(res4, res5, res6, res7, 7); 867 PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1); 868 PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3); 869 dst0 = __msa_aver_s_b(dst0, src0); 870 dst1 = __msa_aver_s_b(dst1, src2); 871 dst2 = __msa_aver_s_b(dst2, src4); 872 dst3 = __msa_aver_s_b(dst3, src6); 873 XORI_B4_128_SB(dst0, dst1, dst2, dst3); 874 ST_SB4(dst0, dst1, dst2, dst3, dst, stride); 875 dst += (4 * stride); 876 } 877} 878 879void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, 880 ptrdiff_t stride) 881{ 882 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 883 v16i8 tmp0, tmp1, tmp2, tmp3, vec11; 884 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 885 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 886 v16i8 minus5b = __msa_ldi_b(-5); 887 v16i8 plus20b = __msa_ldi_b(20); 888 889 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 890 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 891 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 892 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 893 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 894 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 895 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 896 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 897 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 898 res0, res1, res2, res3); 899 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 900 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 901 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 902 res0, res1, res2, res3); 903 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 904 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 905 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 906 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 907 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 908 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 909 res4, res5, res6, res7); 910 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 911 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 912 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 913 res4, res5, res6, res7); 914 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2, 915 src0, src1, src2, src3); 916 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2, 917 src4, src5, src6, src7); 918 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); 919 PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); 920 SRARI_H4_SH(res0, res1, res2, res3, 5); 921 SRARI_H4_SH(res4, res5, res6, res7, 5); 922 SAT_SH4_SH(res0, res1, res2, res3, 7); 923 SAT_SH4_SH(res4, res5, res6, res7, 7); 924 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1); 925 PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3); 926 tmp0 = __msa_aver_s_b(tmp0, src0); 927 tmp1 = __msa_aver_s_b(tmp1, src1); 928 tmp2 = __msa_aver_s_b(tmp2, src4); 929 tmp3 = __msa_aver_s_b(tmp3, src5); 930 XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); 931 ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 932} 933 934void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, 935 ptrdiff_t stride) 936{ 937 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 938 v16i8 tmp0, tmp1, tmp2, tmp3, vec11; 939 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 940 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 941 v16i8 minus5b = __msa_ldi_b(-5); 942 v16i8 plus20b = __msa_ldi_b(20); 943 944 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 945 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 946 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 947 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 948 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 949 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 950 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 951 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 952 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 953 res0, res1, res2, res3); 954 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 955 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 956 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 957 res0, res1, res2, res3); 958 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 959 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 960 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 961 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 962 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 963 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 964 res4, res5, res6, res7); 965 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 966 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 967 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 968 res4, res5, res6, res7); 969 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3, 970 src0, src1, src2, src3); 971 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3, 972 src4, src5, src6, src7); 973 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); 974 PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); 975 SRARI_H4_SH(res0, res1, res2, res3, 5); 976 SRARI_H4_SH(res4, res5, res6, res7, 5); 977 SAT_SH4_SH(res0, res1, res2, res3, 7); 978 SAT_SH4_SH(res4, res5, res6, res7, 7); 979 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1); 980 PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3); 981 tmp0 = __msa_aver_s_b(tmp0, src0); 982 tmp1 = __msa_aver_s_b(tmp1, src1); 983 tmp2 = __msa_aver_s_b(tmp2, src4); 984 tmp3 = __msa_aver_s_b(tmp3, src5); 985 XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); 986 ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 987} 988 989void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, 990 ptrdiff_t stride) 991{ 992 v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2; 993 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 994 v8i16 res0, res1; 995 v16i8 minus5b = __msa_ldi_b(-5); 996 v16i8 plus20b = __msa_ldi_b(20); 997 998 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 999 LD_SB4(src - 2, stride, src0, src1, src2, src3); 1000 XORI_B4_128_SB(src0, src1, src2, src3); 1001 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 1002 HADD_SB2_SH(vec0, vec1, res0, res1); 1003 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 1004 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); 1005 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 1006 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); 1007 SRARI_H2_SH(res0, res1, 5); 1008 SAT_SH2_SH(res0, res1, 7); 1009 res = __msa_pckev_b((v16i8) res1, (v16i8) res0); 1010 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2, 1011 src0, src1, src2, src3); 1012 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); 1013 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 1014 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); 1015 res = __msa_aver_s_b(res, src0); 1016 res = (v16i8) __msa_xori_b((v16u8) res, 128); 1017 ST_W4(res, 0, 1, 2, 3, dst, stride); 1018} 1019 1020void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, 1021 ptrdiff_t stride) 1022{ 1023 v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2; 1024 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 1025 v8i16 res0, res1; 1026 v16i8 minus5b = __msa_ldi_b(-5); 1027 v16i8 plus20b = __msa_ldi_b(20); 1028 1029 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 1030 LD_SB4(src - 2, stride, src0, src1, src2, src3); 1031 XORI_B4_128_SB(src0, src1, src2, src3); 1032 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 1033 HADD_SB2_SH(vec0, vec1, res0, res1); 1034 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 1035 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); 1036 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 1037 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); 1038 SRARI_H2_SH(res0, res1, 5); 1039 SAT_SH2_SH(res0, res1, 7); 1040 res = __msa_pckev_b((v16i8) res1, (v16i8) res0); 1041 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3, 1042 src0, src1, src2, src3); 1043 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); 1044 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 1045 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); 1046 res = __msa_aver_s_b(res, src0); 1047 res = (v16i8) __msa_xori_b((v16u8) res, 128); 1048 ST_W4(res, 0, 1, 2, 3, dst, stride); 1049} 1050 1051void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, 1052 ptrdiff_t stride) 1053{ 1054 uint32_t loop_cnt; 1055 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 1056 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 1057 v16i8 vec11; 1058 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 1059 v16i8 minus5b = __msa_ldi_b(-5); 1060 v16i8 plus20b = __msa_ldi_b(20); 1061 1062 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1063 src -= 2; 1064 1065 for (loop_cnt = 4; loop_cnt--;) { 1066 LD_SB2(src, 8, src0, src1); 1067 src += stride; 1068 LD_SB2(src, 8, src2, src3); 1069 src += stride; 1070 LD_SB2(src, 8, src4, src5); 1071 src += stride; 1072 LD_SB2(src, 8, src6, src7); 1073 src += stride; 1074 1075 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 1076 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3); 1077 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9); 1078 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4); 1079 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10); 1080 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5); 1081 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11); 1082 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 1083 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 1084 minus5b, res0, res1, res2, res3); 1085 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 1086 plus20b, res0, res1, res2, res3); 1087 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3); 1088 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9); 1089 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4); 1090 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10); 1091 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5); 1092 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11); 1093 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 1094 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 1095 minus5b, res4, res5, res6, res7); 1096 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 1097 plus20b, res4, res5, res6, res7); 1098 SRARI_H4_SH(res0, res1, res2, res3, 5); 1099 SRARI_H4_SH(res4, res5, res6, res7, 5); 1100 SAT_SH4_SH(res0, res1, res2, res3, 7); 1101 SAT_SH4_SH(res4, res5, res6, res7, 7); 1102 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1, 1103 vec2, vec3); 1104 XORI_B4_128_SB(vec0, vec1, vec2, vec3); 1105 ST_SB4(vec0, vec1, vec2, vec3, dst, stride); 1106 dst += (4 * stride); 1107 } 1108} 1109 1110void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, 1111 ptrdiff_t stride) 1112{ 1113 v16u8 out0, out1, out2, out3; 1114 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 1115 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 1116 v16i8 vec11; 1117 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 1118 v16i8 minus5b = __msa_ldi_b(-5); 1119 v16i8 plus20b = __msa_ldi_b(20); 1120 1121 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1122 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 1123 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 1124 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 1125 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 1126 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 1127 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 1128 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 1129 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 1130 res0, res1, res2, res3); 1131 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 1132 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 1133 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, 1134 plus20b, res0, res1, res2, res3); 1135 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 1136 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 1137 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 1138 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 1139 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 1140 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 1141 res4, res5, res6, res7); 1142 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 1143 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 1144 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, 1145 plus20b, res4, res5, res6, res7); 1146 SRARI_H4_SH(res0, res1, res2, res3, 5); 1147 SRARI_H4_SH(res4, res5, res6, res7, 5); 1148 SAT_SH4_SH(res0, res1, res2, res3, 7); 1149 SAT_SH4_SH(res4, res5, res6, res7, 7); 1150 out0 = PCKEV_XORI128_UB(res0, res1); 1151 out1 = PCKEV_XORI128_UB(res2, res3); 1152 out2 = PCKEV_XORI128_UB(res4, res5); 1153 out3 = PCKEV_XORI128_UB(res6, res7); 1154 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1155} 1156 1157void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, 1158 ptrdiff_t stride) 1159{ 1160 v16u8 out; 1161 v16i8 src0, src1, src2, src3, mask0, mask1, mask2; 1162 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 1163 v8i16 res0, res1; 1164 v16i8 minus5b = __msa_ldi_b(-5); 1165 v16i8 plus20b = __msa_ldi_b(20); 1166 1167 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 1168 LD_SB4(src - 2, stride, src0, src1, src2, src3); 1169 XORI_B4_128_SB(src0, src1, src2, src3); 1170 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 1171 HADD_SB2_SH(vec0, vec1, res0, res1); 1172 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 1173 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); 1174 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 1175 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); 1176 SRARI_H2_SH(res0, res1, 5); 1177 SAT_SH2_SH(res0, res1, 7); 1178 out = PCKEV_XORI128_UB(res0, res1); 1179 ST_W4(out, 0, 1, 2, 3, dst, stride); 1180} 1181 1182void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, 1183 ptrdiff_t stride) 1184{ 1185 int32_t loop_cnt; 1186 int16_t filt_const0 = 0xfb01; 1187 int16_t filt_const1 = 0x1414; 1188 int16_t filt_const2 = 0x1fb; 1189 v16u8 res0, res1, res2, res3; 1190 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1191 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1192 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 1193 v16i8 src65_l, src87_l, filt0, filt1, filt2; 1194 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 1195 1196 filt0 = (v16i8) __msa_fill_h(filt_const0); 1197 filt1 = (v16i8) __msa_fill_h(filt_const1); 1198 filt2 = (v16i8) __msa_fill_h(filt_const2); 1199 1200 src -= (stride * 2); 1201 1202 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1203 src += (5 * stride); 1204 1205 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1206 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1207 src32_r, src43_r); 1208 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 1209 src32_l, src43_l); 1210 1211 for (loop_cnt = 4; loop_cnt--;) { 1212 LD_SB4(src, stride, src5, src6, src7, src8); 1213 src += (4 * stride); 1214 1215 XORI_B4_128_SB(src5, src6, src7, src8); 1216 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 1217 src65_r, src76_r, src87_r); 1218 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 1219 src65_l, src76_l, src87_l); 1220 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1221 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1222 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1223 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1224 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 1225 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 1226 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 1227 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 1228 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1229 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1230 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 1231 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1232 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1233 out3_r, res0, res1, res2, res3); 1234 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2); 1235 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3); 1236 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4); 1237 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5); 1238 XORI_B4_128_UB(res0, res1, res2, res3); 1239 ST_UB4(res0, res1, res2, res3, dst, stride); 1240 dst += (4 * stride); 1241 1242 src10_r = src54_r; 1243 src32_r = src76_r; 1244 src21_r = src65_r; 1245 src43_r = src87_r; 1246 src10_l = src54_l; 1247 src32_l = src76_l; 1248 src21_l = src65_l; 1249 src43_l = src87_l; 1250 src2 = src6; 1251 src3 = src7; 1252 src4 = src8; 1253 } 1254} 1255 1256void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, 1257 ptrdiff_t stride) 1258{ 1259 int32_t loop_cnt; 1260 int16_t filt_const0 = 0xfb01; 1261 int16_t filt_const1 = 0x1414; 1262 int16_t filt_const2 = 0x1fb; 1263 v16u8 res0, res1, res2, res3; 1264 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1265 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1266 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 1267 v16i8 src65_l, src87_l, filt0, filt1, filt2; 1268 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 1269 1270 filt0 = (v16i8) __msa_fill_h(filt_const0); 1271 filt1 = (v16i8) __msa_fill_h(filt_const1); 1272 filt2 = (v16i8) __msa_fill_h(filt_const2); 1273 1274 src -= (stride * 2); 1275 1276 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1277 src += (5 * stride); 1278 1279 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1280 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1281 src32_r, src43_r); 1282 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 1283 src32_l, src43_l); 1284 1285 for (loop_cnt = 4; loop_cnt--;) { 1286 LD_SB4(src, stride, src5, src6, src7, src8); 1287 src += (4 * stride); 1288 1289 XORI_B4_128_SB(src5, src6, src7, src8); 1290 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 1291 src65_r, src76_r, src87_r); 1292 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 1293 src65_l, src76_l, src87_l); 1294 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1295 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1296 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1297 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1298 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 1299 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 1300 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 1301 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 1302 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1303 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1304 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 1305 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1306 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1307 out3_r, res0, res1, res2, res3); 1308 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3); 1309 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4); 1310 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5); 1311 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6); 1312 XORI_B4_128_UB(res0, res1, res2, res3); 1313 ST_UB4(res0, res1, res2, res3, dst, stride); 1314 dst += (4 * stride); 1315 1316 src10_r = src54_r; 1317 src32_r = src76_r; 1318 src21_r = src65_r; 1319 src43_r = src87_r; 1320 src10_l = src54_l; 1321 src32_l = src76_l; 1322 src21_l = src65_l; 1323 src43_l = src87_l; 1324 src3 = src7; 1325 src4 = src8; 1326 } 1327} 1328 1329void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, 1330 ptrdiff_t stride) 1331{ 1332 const int16_t filt_const0 = 0xfb01; 1333 const int16_t filt_const1 = 0x1414; 1334 const int16_t filt_const2 = 0x1fb; 1335 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1336 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r; 1337 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r; 1338 v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3; 1339 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 1340 1341 filt0 = (v16i8) __msa_fill_h(filt_const0); 1342 filt1 = (v16i8) __msa_fill_h(filt_const1); 1343 filt2 = (v16i8) __msa_fill_h(filt_const2); 1344 1345 src -= (stride * 2); 1346 1347 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1348 src += (5 * stride); 1349 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12); 1350 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12); 1351 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1352 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1353 src32_r, src43_r); 1354 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1355 src76_r, src87_r); 1356 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r, 1357 src109_r, src1110_r, src1211_r); 1358 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1359 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1360 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1361 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1362 out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2); 1363 out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2); 1364 out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2); 1365 out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2); 1366 PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1); 1367 PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3); 1368 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1369 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 1370 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1371 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 1372 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1); 1373 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 1374 out0 = __msa_aver_s_b(out0, tmp0); 1375 out1 = __msa_aver_s_b(out1, tmp1); 1376 out2 = __msa_aver_s_b(out2, tmp2); 1377 out3 = __msa_aver_s_b(out3, tmp3); 1378 XORI_B4_128_SB(out0, out1, out2, out3); 1379 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1380} 1381 1382void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, 1383 ptrdiff_t stride) 1384{ 1385 const int16_t filt_const0 = 0xfb01; 1386 const int16_t filt_const1 = 0x1414; 1387 const int16_t filt_const2 = 0x1fb; 1388 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1389 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r; 1390 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r; 1391 v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; 1392 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 1393 1394 filt0 = (v16i8) __msa_fill_h(filt_const0); 1395 filt1 = (v16i8) __msa_fill_h(filt_const1); 1396 filt2 = (v16i8) __msa_fill_h(filt_const2); 1397 1398 src -= (stride * 2); 1399 1400 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1401 src += (5 * stride); 1402 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12); 1403 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1404 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12); 1405 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1406 src32_r, src43_r); 1407 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1408 src76_r, src87_r); 1409 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r, 1410 src109_r, src1110_r, src1211_r); 1411 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1412 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1413 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1414 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1415 out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2); 1416 out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2); 1417 out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2); 1418 out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2); 1419 PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1); 1420 PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3); 1421 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1422 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 1423 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1424 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 1425 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1); 1426 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 1427 out0 = __msa_aver_s_b(out0, tmp0); 1428 out1 = __msa_aver_s_b(out1, tmp1); 1429 out2 = __msa_aver_s_b(out2, tmp2); 1430 out3 = __msa_aver_s_b(out3, tmp3); 1431 XORI_B4_128_SB(out0, out1, out2, out3); 1432 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1433} 1434 1435void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, 1436 ptrdiff_t stride) 1437{ 1438 int16_t filt_const0 = 0xfb01; 1439 int16_t filt_const1 = 0x1414; 1440 int16_t filt_const2 = 0x1fb; 1441 v16u8 out; 1442 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1443 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1444 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 1445 v8i16 out10, out32; 1446 1447 filt0 = (v16i8) __msa_fill_h(filt_const0); 1448 filt1 = (v16i8) __msa_fill_h(filt_const1); 1449 filt2 = (v16i8) __msa_fill_h(filt_const2); 1450 1451 src -= (stride * 2); 1452 1453 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1454 src += (5 * stride); 1455 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1456 src32_r, src43_r); 1457 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 1458 XORI_B2_128_SB(src2110, src4332); 1459 LD_SB4(src, stride, src5, src6, src7, src8); 1460 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1461 src76_r, src87_r); 1462 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 1463 XORI_B2_128_SB(src6554, src8776); 1464 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 1465 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 1466 SRARI_H2_SH(out10, out32, 5); 1467 SAT_SH2_SH(out10, out32, 7); 1468 out = PCKEV_XORI128_UB(out10, out32); 1469 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 1470 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5); 1471 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 1472 out = __msa_aver_u_b(out, (v16u8) src32_r); 1473 ST_W4(out, 0, 1, 2, 3, dst, stride); 1474} 1475 1476void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, 1477 ptrdiff_t stride) 1478{ 1479 int16_t filt_const0 = 0xfb01; 1480 int16_t filt_const1 = 0x1414; 1481 int16_t filt_const2 = 0x1fb; 1482 v16u8 out; 1483 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1484 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1485 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 1486 v8i16 out10, out32; 1487 1488 filt0 = (v16i8) __msa_fill_h(filt_const0); 1489 filt1 = (v16i8) __msa_fill_h(filt_const1); 1490 filt2 = (v16i8) __msa_fill_h(filt_const2); 1491 1492 src -= (stride * 2); 1493 1494 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1495 src += (5 * stride); 1496 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1497 src32_r, src43_r); 1498 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 1499 XORI_B2_128_SB(src2110, src4332); 1500 LD_SB4(src, stride, src5, src6, src7, src8); 1501 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1502 src76_r, src87_r); 1503 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 1504 XORI_B2_128_SB(src6554, src8776); 1505 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 1506 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 1507 SRARI_H2_SH(out10, out32, 5); 1508 SAT_SH2_SH(out10, out32, 7); 1509 out = PCKEV_XORI128_UB(out10, out32); 1510 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4); 1511 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6); 1512 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 1513 out = __msa_aver_u_b(out, (v16u8) src32_r); 1514 ST_W4(out, 0, 1, 2, 3, dst, stride); 1515} 1516 1517void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, 1518 ptrdiff_t stride) 1519{ 1520 avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride); 1521} 1522 1523void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, 1524 ptrdiff_t stride) 1525{ 1526 avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride); 1527} 1528 1529void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, 1530 ptrdiff_t stride) 1531{ 1532 avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst, 1533 stride); 1534} 1535 1536void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, 1537 ptrdiff_t stride) 1538{ 1539 avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst, 1540 stride); 1541} 1542 1543void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, 1544 ptrdiff_t stride) 1545{ 1546 avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride); 1547} 1548 1549void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, 1550 ptrdiff_t stride) 1551{ 1552 avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2) + 1, dst, stride); 1553} 1554 1555void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, 1556 ptrdiff_t stride) 1557{ 1558 avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride); 1559} 1560 1561void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, 1562 ptrdiff_t stride) 1563{ 1564 avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst, 1565 stride); 1566} 1567 1568 1569void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, 1570 ptrdiff_t stride) 1571{ 1572 avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride); 1573} 1574 1575void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, 1576 ptrdiff_t stride) 1577{ 1578 avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride); 1579} 1580 1581void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, 1582 ptrdiff_t stride) 1583{ 1584 avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride); 1585} 1586 1587void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, 1588 ptrdiff_t stride) 1589{ 1590 avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst, 1591 stride); 1592} 1593 1594void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, 1595 ptrdiff_t stride) 1596{ 1597 uint8_t *dst_tmp = dst; 1598 const uint8_t *src_tmp = src - (2 * stride) - 2; 1599 uint32_t multiple8_cnt, loop_cnt; 1600 const int32_t filt_const0 = 0xfffb0001; 1601 const int32_t filt_const1 = 0x140014; 1602 const int32_t filt_const2 = 0x1fffb; 1603 v16u8 out0, out1; 1604 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 1605 v16i8 mask2; 1606 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1607 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1608 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1609 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 1610 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 1611 v8i16 hz_out87_l, filt0, filt1, filt2; 1612 v4i32 tmp0, tmp1; 1613 1614 filt0 = (v8i16) __msa_fill_w(filt_const0); 1615 filt1 = (v8i16) __msa_fill_w(filt_const1); 1616 filt2 = (v8i16) __msa_fill_w(filt_const2); 1617 1618 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1619 1620 for (multiple8_cnt = 2; multiple8_cnt--;) { 1621 dst = dst_tmp; 1622 src = src_tmp; 1623 1624 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1625 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1626 src += (5 * stride); 1627 1628 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1629 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1630 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1631 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1632 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 1633 1634 for (loop_cnt = 4; loop_cnt--;) { 1635 LD_SB4(src, stride, src5, src6, src7, src8); 1636 src += (4 * stride); 1637 1638 XORI_B4_128_SB(src5, src6, src7, src8); 1639 1640 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1641 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1642 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1643 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 1644 1645 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1646 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 1647 hz_out43_r); 1648 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1649 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 1650 hz_out43_l); 1651 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 1652 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 1653 hz_out87_r); 1654 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 1655 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 1656 hz_out87_l); 1657 1658 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 1659 filt1, filt2); 1660 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 1661 filt1, filt2); 1662 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1663 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 1664 filt1, filt2); 1665 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 1666 filt1, filt2); 1667 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1668 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 1669 filt1, filt2); 1670 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 1671 filt1, filt2); 1672 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1673 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 1674 filt1, filt2); 1675 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 1676 filt1, filt2); 1677 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1678 1679 dst1 = __msa_srari_h(hz_out2, 5); 1680 dst3 = __msa_srari_h(hz_out3, 5); 1681 dst5 = __msa_srari_h(hz_out4, 5); 1682 dst7 = __msa_srari_h(hz_out5, 5); 1683 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7); 1684 1685 dst0 = __msa_aver_s_h(dst0, dst1); 1686 dst1 = __msa_aver_s_h(dst2, dst3); 1687 dst2 = __msa_aver_s_h(dst4, dst5); 1688 dst3 = __msa_aver_s_h(dst6, dst7); 1689 1690 out0 = PCKEV_XORI128_UB(dst0, dst1); 1691 out1 = PCKEV_XORI128_UB(dst2, dst3); 1692 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1693 dst += (4 * stride); 1694 1695 hz_out0 = hz_out4; 1696 hz_out1 = hz_out5; 1697 hz_out2 = hz_out6; 1698 hz_out3 = hz_out7; 1699 hz_out4 = hz_out8; 1700 } 1701 1702 src_tmp += 8; 1703 dst_tmp += 8; 1704 } 1705} 1706 1707void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, 1708 ptrdiff_t stride) 1709{ 1710 uint8_t *dst_tmp = dst; 1711 const uint8_t *src_tmp = src - (2 * stride) - 2; 1712 uint32_t multiple8_cnt, loop_cnt; 1713 const int32_t filt_const0 = 0xfffb0001; 1714 const int32_t filt_const1 = 0x140014; 1715 const int32_t filt_const2 = 0x1fffb; 1716 v16u8 out0, out1; 1717 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 1718 v16i8 mask2; 1719 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1720 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1721 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1722 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 1723 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 1724 v8i16 hz_out87_l, filt0, filt1, filt2; 1725 v4i32 tmp0, tmp1; 1726 1727 filt0 = (v8i16) __msa_fill_w(filt_const0); 1728 filt1 = (v8i16) __msa_fill_w(filt_const1); 1729 filt2 = (v8i16) __msa_fill_w(filt_const2); 1730 1731 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1732 1733 for (multiple8_cnt = 2; multiple8_cnt--;) { 1734 dst = dst_tmp; 1735 src = src_tmp; 1736 1737 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1738 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1739 src += (5 * stride); 1740 1741 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1742 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1743 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1744 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1745 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 1746 1747 for (loop_cnt = 4; loop_cnt--;) { 1748 LD_SB4(src, stride, src5, src6, src7, src8); 1749 src += (4 * stride); 1750 1751 XORI_B4_128_SB(src5, src6, src7, src8); 1752 1753 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1754 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1755 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1756 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 1757 1758 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1759 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 1760 hz_out43_r); 1761 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1762 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 1763 hz_out43_l); 1764 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 1765 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 1766 hz_out87_r); 1767 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 1768 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 1769 hz_out87_l); 1770 1771 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 1772 filt1, filt2); 1773 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 1774 filt1, filt2); 1775 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1776 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 1777 filt1, filt2); 1778 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 1779 filt1, filt2); 1780 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1781 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 1782 filt1, filt2); 1783 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 1784 filt1, filt2); 1785 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1786 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 1787 filt1, filt2); 1788 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 1789 filt1, filt2); 1790 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1791 1792 dst1 = __msa_srari_h(hz_out3, 5); 1793 dst3 = __msa_srari_h(hz_out4, 5); 1794 dst5 = __msa_srari_h(hz_out5, 5); 1795 dst7 = __msa_srari_h(hz_out6, 5); 1796 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7); 1797 1798 dst0 = __msa_aver_s_h(dst0, dst1); 1799 dst1 = __msa_aver_s_h(dst2, dst3); 1800 dst2 = __msa_aver_s_h(dst4, dst5); 1801 dst3 = __msa_aver_s_h(dst6, dst7); 1802 1803 out0 = PCKEV_XORI128_UB(dst0, dst1); 1804 out1 = PCKEV_XORI128_UB(dst2, dst3); 1805 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1806 dst += (4 * stride); 1807 1808 hz_out0 = hz_out4; 1809 hz_out1 = hz_out5; 1810 hz_out2 = hz_out6; 1811 hz_out3 = hz_out7; 1812 hz_out4 = hz_out8; 1813 } 1814 1815 src_tmp += 8; 1816 dst_tmp += 8; 1817 } 1818} 1819 1820void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, 1821 ptrdiff_t stride) 1822{ 1823 const int32_t filt_const0 = 0xfffb0001; 1824 const int32_t filt_const1 = 0x140014; 1825 const int32_t filt_const2 = 0x1fffb; 1826 v16u8 out0, out1; 1827 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1828 v16i8 src11, src12, mask0, mask1, mask2; 1829 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1830 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 1831 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1832 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 1833 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3; 1834 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 1835 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 1836 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 1837 v4i32 tmp0, tmp1; 1838 1839 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1840 1841 filt0 = (v8i16) __msa_fill_w(filt_const0); 1842 filt1 = (v8i16) __msa_fill_w(filt_const1); 1843 filt2 = (v8i16) __msa_fill_w(filt_const2); 1844 1845 src -= ((2 * stride) + 2); 1846 1847 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1848 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1849 src += (5 * stride); 1850 1851 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1852 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1853 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1854 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1855 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 1856 1857 LD_SB4(src, stride, src5, src6, src7, src8); 1858 src += (4 * stride); 1859 XORI_B4_128_SB(src5, src6, src7, src8); 1860 1861 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1862 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1863 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1864 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 1865 1866 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 1867 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 1868 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 1869 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 1870 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 1871 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 1872 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 1873 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 1874 1875 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 1876 filt2); 1877 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 1878 filt2); 1879 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1880 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 1881 filt2); 1882 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 1883 filt2); 1884 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1885 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 1886 filt2); 1887 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 1888 filt2); 1889 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1890 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 1891 filt2); 1892 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 1893 filt2); 1894 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1895 1896 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5); 1897 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7); 1898 1899 dst0 = __msa_aver_s_h(dst0, hz_out2); 1900 dst1 = __msa_aver_s_h(dst1, hz_out3); 1901 dst2 = __msa_aver_s_h(dst2, hz_out4); 1902 dst3 = __msa_aver_s_h(dst3, hz_out5); 1903 1904 out0 = PCKEV_XORI128_UB(dst0, dst1); 1905 out1 = PCKEV_XORI128_UB(dst2, dst3); 1906 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1907 dst += (4 * stride); 1908 1909 LD_SB4(src, stride, src9, src10, src11, src12); 1910 XORI_B4_128_SB(src9, src10, src11, src12); 1911 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 1912 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 1913 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 1914 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 1915 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 1916 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 1917 hz_out1211_r); 1918 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 1919 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 1920 hz_out1211_l); 1921 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 1922 filt2); 1923 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 1924 filt2); 1925 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1926 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 1927 filt2); 1928 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 1929 filt2); 1930 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1931 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 1932 filt2); 1933 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 1934 filt2); 1935 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1936 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 1937 filt2); 1938 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 1939 filt2); 1940 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1941 1942 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5); 1943 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7); 1944 1945 dst0 = __msa_aver_s_h(dst0, hz_out6); 1946 dst1 = __msa_aver_s_h(dst1, hz_out7); 1947 dst2 = __msa_aver_s_h(dst2, hz_out8); 1948 dst3 = __msa_aver_s_h(dst3, hz_out9); 1949 1950 out0 = PCKEV_XORI128_UB(dst0, dst1); 1951 out1 = PCKEV_XORI128_UB(dst2, dst3); 1952 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1953} 1954 1955void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, 1956 ptrdiff_t stride) 1957{ 1958 const int32_t filt_const0 = 0xfffb0001; 1959 const int32_t filt_const1 = 0x140014; 1960 const int32_t filt_const2 = 0x1fffb; 1961 v16u8 out0, out1; 1962 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1963 v16i8 src11, src12, mask0, mask1, mask2; 1964 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1965 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 1966 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1967 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 1968 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3; 1969 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 1970 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 1971 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 1972 v4i32 tmp0, tmp1; 1973 1974 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1975 1976 filt0 = (v8i16) __msa_fill_w(filt_const0); 1977 filt1 = (v8i16) __msa_fill_w(filt_const1); 1978 filt2 = (v8i16) __msa_fill_w(filt_const2); 1979 1980 src -= ((2 * stride) + 2); 1981 1982 LD_SB5(src, stride, src0, src1, src2, src3, src4); 1983 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1984 src += (5 * stride); 1985 1986 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1987 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1988 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1989 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1990 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 1991 1992 LD_SB4(src, stride, src5, src6, src7, src8); 1993 src += (4 * stride); 1994 XORI_B4_128_SB(src5, src6, src7, src8); 1995 1996 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1997 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1998 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1999 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 2000 2001 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2002 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 2003 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2004 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 2005 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2006 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 2007 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2008 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 2009 2010 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 2011 filt2); 2012 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 2013 filt2); 2014 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2015 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 2016 filt2); 2017 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 2018 filt2); 2019 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2020 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 2021 filt2); 2022 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 2023 filt2); 2024 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2025 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 2026 filt2); 2027 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 2028 filt2); 2029 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2030 2031 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5); 2032 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7); 2033 2034 dst0 = __msa_aver_s_h(dst0, hz_out3); 2035 dst1 = __msa_aver_s_h(dst1, hz_out4); 2036 dst2 = __msa_aver_s_h(dst2, hz_out5); 2037 dst3 = __msa_aver_s_h(dst3, hz_out6); 2038 2039 out0 = PCKEV_XORI128_UB(dst0, dst1); 2040 out1 = PCKEV_XORI128_UB(dst2, dst3); 2041 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 2042 dst += (4 * stride); 2043 2044 LD_SB4(src, stride, src9, src10, src11, src12); 2045 XORI_B4_128_SB(src9, src10, src11, src12); 2046 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 2047 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 2048 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 2049 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 2050 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 2051 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 2052 hz_out1211_r); 2053 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 2054 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 2055 hz_out1211_l); 2056 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 2057 filt2); 2058 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 2059 filt2); 2060 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2061 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 2062 filt2); 2063 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 2064 filt2); 2065 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2066 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 2067 filt2); 2068 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 2069 filt2); 2070 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2071 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 2072 filt2); 2073 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 2074 filt2); 2075 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2076 2077 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5); 2078 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7); 2079 2080 dst0 = __msa_aver_s_h(dst0, hz_out7); 2081 dst1 = __msa_aver_s_h(dst1, hz_out8); 2082 dst2 = __msa_aver_s_h(dst2, hz_out9); 2083 dst3 = __msa_aver_s_h(dst3, hz_out10); 2084 2085 out0 = PCKEV_XORI128_UB(dst0, dst1); 2086 out1 = PCKEV_XORI128_UB(dst2, dst3); 2087 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 2088} 2089 2090void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, 2091 ptrdiff_t stride) 2092{ 2093 const int32_t filt_const0 = 0xfffb0001; 2094 const int32_t filt_const1 = 0x140014; 2095 const int32_t filt_const2 = 0x1fffb; 2096 v16u8 res; 2097 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2098 v16i8 mask0, mask1, mask2; 2099 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2100 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 2101 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2102 v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 2103 v4i32 tmp0, tmp1; 2104 2105 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 2106 2107 filt0 = (v8i16) __msa_fill_w(filt_const0); 2108 filt1 = (v8i16) __msa_fill_w(filt_const1); 2109 filt2 = (v8i16) __msa_fill_w(filt_const2); 2110 2111 src -= ((2 * stride) + 2); 2112 2113 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2114 src += (5 * stride); 2115 LD_SB4(src, stride, src5, src6, src7, src8); 2116 2117 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2118 XORI_B4_128_SB(src5, src6, src7, src8); 2119 2120 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 2121 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 2122 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 2123 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 2124 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 2125 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 2126 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 2127 2128 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2129 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 2130 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2131 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 2132 2133 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 2134 filt2); 2135 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 2136 filt2); 2137 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2138 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 2139 filt2); 2140 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 2141 filt2); 2142 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2143 2144 SRARI_H2_SH(hz_out2, hz_out4, 5); 2145 SAT_SH2_SH(hz_out2, hz_out4, 7); 2146 2147 dst0 = __msa_aver_s_h(dst0, hz_out2); 2148 dst1 = __msa_aver_s_h(dst1, hz_out4); 2149 2150 res = PCKEV_XORI128_UB(dst0, dst1); 2151 ST_W4(res, 0, 1, 2, 3, dst, stride); 2152} 2153 2154void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, 2155 ptrdiff_t stride) 2156{ 2157 const int32_t filt_const0 = 0xfffb0001; 2158 const int32_t filt_const1 = 0x140014; 2159 const int32_t filt_const2 = 0x1fffb; 2160 v16u8 res; 2161 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2162 v16i8 mask0, mask1, mask2; 2163 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2164 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 2165 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2166 v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 2167 v4i32 tmp0, tmp1; 2168 2169 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 2170 2171 filt0 = (v8i16) __msa_fill_w(filt_const0); 2172 filt1 = (v8i16) __msa_fill_w(filt_const1); 2173 filt2 = (v8i16) __msa_fill_w(filt_const2); 2174 2175 src -= ((2 * stride) + 2); 2176 2177 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2178 src += (5 * stride); 2179 LD_SB4(src, stride, src5, src6, src7, src8); 2180 2181 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2182 XORI_B4_128_SB(src5, src6, src7, src8); 2183 2184 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 2185 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 2186 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 2187 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 2188 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 2189 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 2190 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 2191 2192 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2193 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 2194 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2195 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 2196 2197 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 2198 filt2); 2199 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 2200 filt2); 2201 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2202 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 2203 filt2); 2204 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 2205 filt2); 2206 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2207 2208 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1); 2209 SRARI_H2_SH(hz_out0, hz_out1, 5); 2210 SAT_SH2_SH(hz_out0, hz_out1, 7); 2211 2212 dst0 = __msa_aver_s_h(dst0, hz_out0); 2213 dst1 = __msa_aver_s_h(dst1, hz_out1); 2214 2215 res = PCKEV_XORI128_UB(dst0, dst1); 2216 ST_W4(res, 0, 1, 2, 3, dst, stride); 2217} 2218 2219void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, 2220 ptrdiff_t stride) 2221{ 2222 int32_t loop_cnt; 2223 int16_t filt_const0 = 0xfb01; 2224 int16_t filt_const1 = 0x1414; 2225 int16_t filt_const2 = 0x1fb; 2226 v16u8 res0, res1, res2, res3; 2227 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2228 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 2229 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 2230 v16i8 src65_l, src87_l, filt0, filt1, filt2; 2231 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 2232 2233 filt0 = (v16i8) __msa_fill_h(filt_const0); 2234 filt1 = (v16i8) __msa_fill_h(filt_const1); 2235 filt2 = (v16i8) __msa_fill_h(filt_const2); 2236 src -= (stride * 2); 2237 2238 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2239 src += (5 * stride); 2240 2241 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2242 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2243 src32_r, src43_r); 2244 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 2245 src32_l, src43_l); 2246 2247 for (loop_cnt = 4; loop_cnt--;) { 2248 LD_SB4(src, stride, src5, src6, src7, src8); 2249 src += (4 * stride); 2250 2251 XORI_B4_128_SB(src5, src6, src7, src8); 2252 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 2253 src65_r, src76_r, src87_r); 2254 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 2255 src65_l, src76_l, src87_l); 2256 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 2257 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 2258 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 2259 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 2260 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 2261 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 2262 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 2263 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 2264 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 2265 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2266 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 2267 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 2268 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 2269 out3_r, res0, res1, res2, res3); 2270 XORI_B4_128_UB(res0, res1, res2, res3); 2271 ST_UB4(res0, res1, res2, res3, dst, stride); 2272 dst += (4 * stride); 2273 2274 src10_r = src54_r; 2275 src32_r = src76_r; 2276 src21_r = src65_r; 2277 src43_r = src87_r; 2278 src10_l = src54_l; 2279 src32_l = src76_l; 2280 src21_l = src65_l; 2281 src43_l = src87_l; 2282 src4 = src8; 2283 } 2284} 2285 2286void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, 2287 ptrdiff_t stride) 2288{ 2289 const int16_t filt_const0 = 0xfb01; 2290 const int16_t filt_const1 = 0x1414; 2291 const int16_t filt_const2 = 0x1fb; 2292 v16u8 out0, out1, out2, out3; 2293 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2294 v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r; 2295 v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r; 2296 v16i8 filt0, filt1, filt2; 2297 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 2298 2299 filt0 = (v16i8) __msa_fill_h(filt_const0); 2300 filt1 = (v16i8) __msa_fill_h(filt_const1); 2301 filt2 = (v16i8) __msa_fill_h(filt_const2); 2302 2303 src -= (stride * 2); 2304 2305 LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 2306 src += (8 * stride); 2307 LD_SB5(src, stride, src8, src9, src10, src11, src12); 2308 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2309 src32_r, src43_r); 2310 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r, 2311 src98_r, src109_r); 2312 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r, 2313 src910_r, src1110_r, src1211_r); 2314 XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r); 2315 XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r); 2316 XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r); 2317 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 2318 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 2319 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 2320 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 2321 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2); 2322 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2); 2323 out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2); 2324 out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2); 2325 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 2326 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 2327 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2328 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 2329 out0 = PCKEV_XORI128_UB(out0_r, out1_r); 2330 out1 = PCKEV_XORI128_UB(out2_r, out3_r); 2331 out2 = PCKEV_XORI128_UB(out4_r, out5_r); 2332 out3 = PCKEV_XORI128_UB(out6_r, out7_r); 2333 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 2334} 2335 2336void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, 2337 ptrdiff_t stride) 2338{ 2339 const int16_t filt_const0 = 0xfb01; 2340 const int16_t filt_const1 = 0x1414; 2341 const int16_t filt_const2 = 0x1fb; 2342 v16u8 out; 2343 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2344 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 2345 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 2346 v8i16 out10, out32; 2347 2348 filt0 = (v16i8) __msa_fill_h(filt_const0); 2349 filt1 = (v16i8) __msa_fill_h(filt_const1); 2350 filt2 = (v16i8) __msa_fill_h(filt_const2); 2351 2352 src -= (stride * 2); 2353 2354 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2355 src += (5 * stride); 2356 LD_SB4(src, stride, src5, src6, src7, src8); 2357 2358 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2359 src32_r, src43_r); 2360 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 2361 src76_r, src87_r); 2362 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, 2363 src76_r, src2110, src4332, src6554, src8776); 2364 XORI_B4_128_SB(src2110, src4332, src6554, src8776); 2365 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 2366 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 2367 SRARI_H2_SH(out10, out32, 5); 2368 SAT_SH2_SH(out10, out32, 7); 2369 out = PCKEV_XORI128_UB(out10, out32); 2370 ST_W4(out, 0, 1, 2, 3, dst, stride); 2371} 2372 2373void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, 2374 ptrdiff_t stride) 2375{ 2376 uint32_t row; 2377 v16u8 out; 2378 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2379 v16i8 src11; 2380 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3; 2381 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2382 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 2383 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2384 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2385 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2386 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2387 v8i16 minus5h = __msa_ldi_h(-5); 2388 v8i16 plus20h = __msa_ldi_h(20); 2389 2390 mask3 = mask0 + 4; 2391 mask4 = mask1 + 4; 2392 mask5 = mask2 + 4; 2393 2394 src -= ((2 * stride) + 2); 2395 2396 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2397 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 2398 src += (5 * stride); 2399 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2400 XORI_B5_128_SB(src7, src8, src9, src10, src11); 2401 2402 for (row = 16; row--;) { 2403 LD_SB2(src, 8, src5, src6); 2404 src += stride; 2405 XORI_B2_128_SB(src5, src6); 2406 2407 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2408 vt_res0, vt_res1); 2409 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 2410 vt_res2, vt_res3); 2411 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2412 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2413 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2414 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2415 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2416 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2417 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2418 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2419 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2420 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2421 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2422 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 2423 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2424 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2425 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2426 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2427 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2428 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2429 dst0 = __msa_srari_h(shf_vec2, 5); 2430 dst1 = __msa_srari_h(shf_vec5, 5); 2431 dst2 = __msa_srari_h(shf_vec8, 5); 2432 dst3 = __msa_srari_h(shf_vec11, 5); 2433 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2434 PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1); 2435 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2436 dst0 = __msa_aver_s_h(dst2, dst0); 2437 dst1 = __msa_aver_s_h(dst3, dst1); 2438 out = PCKEV_XORI128_UB(dst0, dst1); 2439 ST_UB(out, dst); 2440 dst += stride; 2441 2442 src0 = src1; 2443 src1 = src2; 2444 src2 = src3; 2445 src3 = src4; 2446 src4 = src5; 2447 src7 = src8; 2448 src8 = src9; 2449 src9 = src10; 2450 src10 = src11; 2451 src11 = src6; 2452 } 2453} 2454 2455void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, 2456 ptrdiff_t stride) 2457{ 2458 uint32_t row; 2459 v16u8 out; 2460 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2461 v16i8 src11; 2462 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3; 2463 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2464 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 2465 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2466 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2467 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2468 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2469 v8i16 minus5h = __msa_ldi_h(-5); 2470 v8i16 plus20h = __msa_ldi_h(20); 2471 2472 mask3 = mask0 + 4; 2473 mask4 = mask1 + 4; 2474 mask5 = mask2 + 4; 2475 2476 src -= ((2 * stride) + 2); 2477 2478 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2479 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 2480 src += (5 * stride); 2481 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2482 XORI_B5_128_SB(src7, src8, src9, src10, src11); 2483 2484 for (row = 16; row--;) { 2485 LD_SB2(src, 8, src5, src6); 2486 src += stride; 2487 XORI_B2_128_SB(src5, src6); 2488 2489 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2490 vt_res0, vt_res1); 2491 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 2492 vt_res2, vt_res3); 2493 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2494 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2495 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2496 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2497 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2498 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2499 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2500 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2501 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2502 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2503 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2504 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 2505 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2506 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2507 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2508 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2509 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2510 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2511 dst0 = __msa_srari_h(shf_vec2, 5); 2512 dst1 = __msa_srari_h(shf_vec5, 5); 2513 dst2 = __msa_srari_h(shf_vec8, 5); 2514 dst3 = __msa_srari_h(shf_vec11, 5); 2515 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2516 dst0 = __msa_pckod_h(dst2, dst0); 2517 dst1 = __msa_pckod_h(dst3, dst1); 2518 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2519 dst0 = __msa_aver_s_h(dst2, dst0); 2520 dst1 = __msa_aver_s_h(dst3, dst1); 2521 out = PCKEV_XORI128_UB(dst0, dst1); 2522 ST_UB(out, dst); 2523 dst += stride; 2524 2525 src0 = src1; 2526 src1 = src2; 2527 src2 = src3; 2528 src3 = src4; 2529 src4 = src5; 2530 src7 = src8; 2531 src8 = src9; 2532 src9 = src10; 2533 src10 = src11; 2534 src11 = src6; 2535 } 2536} 2537 2538void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, 2539 ptrdiff_t stride) 2540{ 2541 uint32_t row; 2542 v16u8 out; 2543 v16i8 src0, src1, src2, src3, src4, src5, src6; 2544 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3; 2545 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2546 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 2547 v8i16 mask3, mask4, mask5; 2548 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2549 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2550 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2551 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2552 v8i16 minus5h = __msa_ldi_h(-5); 2553 v8i16 plus20h = __msa_ldi_h(20); 2554 2555 mask3 = mask0 + 4; 2556 mask4 = mask1 + 4; 2557 mask5 = mask2 + 4; 2558 2559 src -= ((2 * stride) + 2); 2560 2561 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2562 src += (5 * stride); 2563 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2564 2565 for (row = 4; row--;) { 2566 LD_SB2(src, stride, src5, src6); 2567 src += (2 * stride); 2568 XORI_B2_128_SB(src5, src6); 2569 2570 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2571 vt_res0, vt_res1); 2572 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 2573 vt_res2, vt_res3); 2574 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2575 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2576 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2577 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2578 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2579 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2580 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2581 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2582 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2583 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2584 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2585 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 2586 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2587 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2588 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2589 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2590 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2591 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2592 dst0 = __msa_srari_h(shf_vec2, 5); 2593 dst1 = __msa_srari_h(shf_vec5, 5); 2594 dst2 = __msa_srari_h(shf_vec8, 5); 2595 dst3 = __msa_srari_h(shf_vec11, 5); 2596 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2597 PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1); 2598 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2599 dst0 = __msa_aver_s_h(dst2, dst0); 2600 dst1 = __msa_aver_s_h(dst3, dst1); 2601 out = PCKEV_XORI128_UB(dst0, dst1); 2602 ST_D2(out, 0, 1, dst, stride); 2603 dst += (2 * stride); 2604 2605 src0 = src2; 2606 src1 = src3; 2607 src2 = src4; 2608 src3 = src5; 2609 src4 = src6; 2610 } 2611} 2612 2613void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, 2614 ptrdiff_t stride) 2615{ 2616 uint32_t row; 2617 v16u8 out; 2618 v16i8 src0, src1, src2, src3, src4, src5, src6; 2619 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3; 2620 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2621 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 2622 v8i16 mask3, mask4, mask5; 2623 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2624 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2625 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2626 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2627 v8i16 minus5h = __msa_ldi_h(-5); 2628 v8i16 plus20h = __msa_ldi_h(20); 2629 2630 mask3 = mask0 + 4; 2631 mask4 = mask1 + 4; 2632 mask5 = mask2 + 4; 2633 2634 src -= ((2 * stride) + 2); 2635 2636 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2637 src += (5 * stride); 2638 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2639 2640 for (row = 4; row--;) { 2641 LD_SB2(src, stride, src5, src6); 2642 src += (2 * stride); 2643 XORI_B2_128_SB(src5, src6); 2644 2645 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2646 vt_res0, vt_res1); 2647 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 2648 vt_res2, vt_res3); 2649 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2650 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2651 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2652 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2653 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2654 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2655 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2656 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2657 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2658 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2659 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2660 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 2661 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2662 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2663 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2664 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2665 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2666 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2667 dst0 = __msa_srari_h(shf_vec2, 5); 2668 dst1 = __msa_srari_h(shf_vec5, 5); 2669 dst2 = __msa_srari_h(shf_vec8, 5); 2670 dst3 = __msa_srari_h(shf_vec11, 5); 2671 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2672 dst0 = __msa_pckod_h(dst2, dst0); 2673 dst1 = __msa_pckod_h(dst3, dst1); 2674 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2675 dst0 = __msa_aver_s_h(dst2, dst0); 2676 dst1 = __msa_aver_s_h(dst3, dst1); 2677 out = PCKEV_XORI128_UB(dst0, dst1); 2678 ST_D2(out, 0, 1, dst, stride); 2679 dst += (2 * stride); 2680 2681 src0 = src2; 2682 src1 = src3; 2683 src2 = src4; 2684 src3 = src5; 2685 src4 = src6; 2686 } 2687} 2688 2689void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, 2690 ptrdiff_t stride) 2691{ 2692 const int16_t filt_const0 = 0xfb01; 2693 const int16_t filt_const1 = 0x1414; 2694 const int16_t filt_const2 = 0x1fb; 2695 v16u8 out; 2696 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2697 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 2698 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l; 2699 v16i8 src76_l, src87_l, filt0, filt1, filt2; 2700 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 2701 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2702 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2703 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2704 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2705 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2706 v8i16 minus5h = __msa_ldi_h(-5); 2707 v8i16 plus20h = __msa_ldi_h(20); 2708 v8i16 zeros = { 0 }; 2709 2710 filt0 = (v16i8) __msa_fill_h(filt_const0); 2711 filt1 = (v16i8) __msa_fill_h(filt_const1); 2712 filt2 = (v16i8) __msa_fill_h(filt_const2); 2713 2714 src -= ((2 * stride) + 2); 2715 2716 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2717 src += (5 * stride); 2718 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2719 LD_SB4(src, stride, src5, src6, src7, src8); 2720 XORI_B4_128_SB(src5, src6, src7, src8); 2721 2722 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2723 src32_r, src43_r); 2724 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 2725 src76_r, src87_r); 2726 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 2727 src32_l, src43_l); 2728 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 2729 src76_l, src87_l); 2730 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 2731 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 2732 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 2733 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 2734 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2735 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2736 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2737 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2738 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2739 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2740 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2741 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2742 2743 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 2744 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 2745 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 2746 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 2747 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2748 mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 2749 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2750 mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 2751 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2752 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 2753 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2754 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 2755 2756 SRARI_W2_SW(hz_res0, hz_res1, 10); 2757 SAT_SW2_SW(hz_res0, hz_res1, 7); 2758 SRARI_W2_SW(hz_res2, hz_res3, 10); 2759 SAT_SW2_SW(hz_res2, hz_res3, 7); 2760 2761 dst0 = __msa_srari_h(shf_vec2, 5); 2762 dst1 = __msa_srari_h(shf_vec5, 5); 2763 dst2 = __msa_srari_h(shf_vec6, 5); 2764 dst3 = __msa_srari_h(shf_vec7, 5); 2765 2766 SAT_SH2_SH(dst0, dst1, 7); 2767 SAT_SH2_SH(dst2, dst3, 7); 2768 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1); 2769 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3); 2770 2771 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 2772 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 2773 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 2774 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 2775 2776 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 2777 out = PCKEV_XORI128_UB(dst0, dst2); 2778 ST_W4(out, 0, 1, 2, 3, dst, stride); 2779} 2780 2781void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, 2782 ptrdiff_t stride) 2783{ 2784 const int16_t filt_const0 = 0xfb01; 2785 const int16_t filt_const1 = 0x1414; 2786 const int16_t filt_const2 = 0x1fb; 2787 v16u8 out; 2788 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2789 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 2790 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l; 2791 v16i8 src76_l, src87_l, filt0, filt1, filt2; 2792 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 2793 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2794 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2795 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2796 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2797 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2798 v8i16 minus5h = __msa_ldi_h(-5); 2799 v8i16 plus20h = __msa_ldi_h(20); 2800 v8i16 zeros = { 0 }; 2801 2802 filt0 = (v16i8) __msa_fill_h(filt_const0); 2803 filt1 = (v16i8) __msa_fill_h(filt_const1); 2804 filt2 = (v16i8) __msa_fill_h(filt_const2); 2805 2806 src -= ((2 * stride) + 2); 2807 2808 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2809 src += (5 * stride); 2810 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2811 LD_SB4(src, stride, src5, src6, src7, src8); 2812 XORI_B4_128_SB(src5, src6, src7, src8); 2813 2814 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2815 src32_r, src43_r); 2816 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 2817 src76_r, src87_r); 2818 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 2819 src32_l, src43_l); 2820 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 2821 src76_l, src87_l); 2822 2823 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 2824 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 2825 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 2826 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 2827 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2828 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2829 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2830 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2831 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2832 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2833 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2834 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2835 2836 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 2837 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 2838 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 2839 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 2840 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2841 mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 2842 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2843 mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 2844 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2845 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 2846 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2847 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 2848 2849 SRARI_W2_SW(hz_res0, hz_res1, 10); 2850 SAT_SW2_SW(hz_res0, hz_res1, 7); 2851 SRARI_W2_SW(hz_res2, hz_res3, 10); 2852 SAT_SW2_SW(hz_res2, hz_res3, 7); 2853 2854 dst0 = __msa_srari_h(shf_vec2, 5); 2855 dst1 = __msa_srari_h(shf_vec5, 5); 2856 dst2 = __msa_srari_h(shf_vec6, 5); 2857 dst3 = __msa_srari_h(shf_vec7, 5); 2858 2859 SAT_SH2_SH(dst0, dst1, 7); 2860 SAT_SH2_SH(dst2, dst3, 7); 2861 2862 dst0 = __msa_ilvod_h(zeros, dst0); 2863 dst1 = __msa_ilvod_h(zeros, dst1); 2864 dst2 = __msa_ilvod_h(zeros, dst2); 2865 dst3 = __msa_ilvod_h(zeros, dst3); 2866 2867 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 2868 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 2869 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 2870 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 2871 2872 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 2873 out = PCKEV_XORI128_UB(dst0, dst2); 2874 ST_W4(out, 0, 1, 2, 3, dst, stride); 2875} 2876 2877void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, 2878 ptrdiff_t stride) 2879{ 2880 const int32_t filt_const0 = 0xfffb0001; 2881 const int32_t filt_const1 = 0x140014; 2882 const int32_t filt_const2 = 0x1fffb; 2883 const uint8_t *src_tmp = src - (2 * stride) - 2; 2884 uint8_t *dst_tmp = dst; 2885 uint32_t multiple8_cnt, loop_cnt; 2886 v16u8 out0, out1; 2887 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2; 2888 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2889 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3; 2890 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2891 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 2892 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 2893 v8i16 hz_out87_l, filt0, filt1, filt2; 2894 v4i32 tmp0, tmp1; 2895 2896 filt0 = (v8i16) __msa_fill_w(filt_const0); 2897 filt1 = (v8i16) __msa_fill_w(filt_const1); 2898 filt2 = (v8i16) __msa_fill_w(filt_const2); 2899 2900 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 2901 2902 for (multiple8_cnt = 2; multiple8_cnt--;) { 2903 src = src_tmp; 2904 dst = dst_tmp; 2905 2906 LD_SB5(src, stride, src0, src1, src2, src3, src4); 2907 XORI_B5_128_SB(src0, src1, src2, src3, src4); 2908 src += (5 * stride); 2909 2910 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 2911 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 2912 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 2913 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 2914 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 2915 2916 for (loop_cnt = 4; loop_cnt--;) { 2917 LD_SB4(src, stride, src0, src1, src2, src3); 2918 XORI_B4_128_SB(src0, src1, src2, src3); 2919 src += (4 * stride); 2920 2921 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 2922 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 2923 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 2924 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 2925 2926 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 2927 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 2928 hz_out43_r); 2929 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 2930 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 2931 hz_out43_l); 2932 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 2933 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 2934 hz_out87_r); 2935 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 2936 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 2937 hz_out87_l); 2938 2939 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 2940 filt1, filt2); 2941 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 2942 filt1, filt2); 2943 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2944 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 2945 filt1, filt2); 2946 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 2947 filt1, filt2); 2948 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2949 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 2950 filt1, filt2); 2951 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 2952 filt1, filt2); 2953 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2954 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 2955 filt1, filt2); 2956 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 2957 filt1, filt2); 2958 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2959 2960 out0 = PCKEV_XORI128_UB(dst0, dst1); 2961 out1 = PCKEV_XORI128_UB(dst2, dst3); 2962 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 2963 dst += (4 * stride); 2964 2965 hz_out0 = hz_out4; 2966 hz_out1 = hz_out5; 2967 hz_out2 = hz_out6; 2968 hz_out3 = hz_out7; 2969 hz_out4 = hz_out8; 2970 } 2971 2972 src_tmp += 8; 2973 dst_tmp += 8; 2974 } 2975} 2976 2977void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, 2978 ptrdiff_t stride) 2979{ 2980 const int32_t filt_const0 = 0xfffb0001; 2981 const int32_t filt_const1 = 0x140014; 2982 const int32_t filt_const2 = 0x1fffb; 2983 v16u8 out0, out1; 2984 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2; 2985 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2986 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 2987 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2988 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 2989 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3; 2990 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 2991 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 2992 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 2993 v4i32 tmp0, tmp1; 2994 2995 filt0 = (v8i16) __msa_fill_w(filt_const0); 2996 filt1 = (v8i16) __msa_fill_w(filt_const1); 2997 filt2 = (v8i16) __msa_fill_w(filt_const2); 2998 2999 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3000 3001 src -= ((2 * stride) + 2); 3002 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3003 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3004 src += (5 * stride); 3005 3006 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 3007 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 3008 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 3009 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 3010 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 3011 3012 LD_SB4(src, stride, src0, src1, src2, src3); 3013 XORI_B4_128_SB(src0, src1, src2, src3); 3014 src += (4 * stride); 3015 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 3016 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 3017 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 3018 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 3019 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 3020 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 3021 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 3022 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 3023 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 3024 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 3025 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 3026 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 3027 3028 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 3029 filt2); 3030 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 3031 filt2); 3032 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3033 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 3034 filt2); 3035 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 3036 filt2); 3037 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3038 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 3039 filt2); 3040 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 3041 filt2); 3042 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3043 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 3044 filt2); 3045 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 3046 filt2); 3047 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3048 out0 = PCKEV_XORI128_UB(dst0, dst1); 3049 out1 = PCKEV_XORI128_UB(dst2, dst3); 3050 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 3051 dst += (4 * stride); 3052 3053 LD_SB4(src, stride, src0, src1, src2, src3); 3054 XORI_B4_128_SB(src0, src1, src2, src3); 3055 hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 3056 hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 3057 hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 3058 hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 3059 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 3060 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 3061 hz_out1211_r); 3062 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 3063 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 3064 hz_out1211_l); 3065 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 3066 filt2); 3067 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 3068 filt2); 3069 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3070 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 3071 filt2); 3072 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 3073 filt2); 3074 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3075 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 3076 filt2); 3077 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 3078 filt2); 3079 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3080 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 3081 filt2); 3082 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 3083 filt2); 3084 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3085 out0 = PCKEV_XORI128_UB(dst0, dst1); 3086 out1 = PCKEV_XORI128_UB(dst2, dst3); 3087 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 3088} 3089 3090void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, 3091 ptrdiff_t stride) 3092{ 3093 const int32_t filt_const0 = 0xfffb0001; 3094 const int32_t filt_const1 = 0x140014; 3095 const int32_t filt_const2 = 0x1fffb; 3096 v16u8 res; 3097 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3098 v16i8 mask0, mask1, mask2; 3099 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 3100 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 3101 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 3102 v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 3103 v4i32 tmp0, tmp1; 3104 3105 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 3106 3107 filt0 = (v8i16) __msa_fill_w(filt_const0); 3108 filt1 = (v8i16) __msa_fill_w(filt_const1); 3109 filt2 = (v8i16) __msa_fill_w(filt_const2); 3110 3111 src -= ((2 * stride) + 2); 3112 3113 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3114 src += (5 * stride); 3115 LD_SB4(src, stride, src5, src6, src7, src8); 3116 3117 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3118 XORI_B4_128_SB(src5, src6, src7, src8); 3119 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 3120 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 3121 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 3122 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 3123 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 3124 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 3125 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 3126 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 3127 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 3128 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 3129 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 3130 3131 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 3132 filt2); 3133 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 3134 filt2); 3135 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3136 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 3137 filt2); 3138 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 3139 filt2); 3140 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3141 res = PCKEV_XORI128_UB(dst0, dst1); 3142 ST_W4(res, 0, 1, 2, 3, dst, stride); 3143} 3144 3145void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, 3146 ptrdiff_t stride) 3147{ 3148 uint32_t loop_cnt; 3149 v16u8 dst0, dst1, dst2, dst3; 3150 v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6; 3151 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11; 3152 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 3153 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 3154 v16i8 minus5b = __msa_ldi_b(-5); 3155 v16i8 plus20b = __msa_ldi_b(20); 3156 3157 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3158 mask3 = mask0 + 8; 3159 mask4 = mask1 + 8; 3160 mask5 = mask2 + 8; 3161 src -= 2; 3162 3163 for (loop_cnt = 4; loop_cnt--;) { 3164 LD_SB2(src, 16, src0, src1); 3165 src += stride; 3166 LD_SB2(src, 16, src2, src3); 3167 src += stride; 3168 LD_SB2(src, 16, src4, src5); 3169 src += stride; 3170 LD_SB2(src, 16, src6, src7); 3171 src += stride; 3172 3173 LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 3174 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3175 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3); 3176 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9); 3177 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4); 3178 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10); 3179 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5); 3180 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11); 3181 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 3182 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 3183 minus5b, res0, res1, res2, res3); 3184 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 3185 plus20b, res0, res1, res2, res3); 3186 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3); 3187 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9); 3188 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4); 3189 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10); 3190 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5); 3191 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11); 3192 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 3193 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 3194 minus5b, res4, res5, res6, res7); 3195 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 3196 plus20b, res4, res5, res6, res7); 3197 SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2, 3198 src0, src2, src4, src6); 3199 SRARI_H4_SH(res0, res1, res2, res3, 5); 3200 SRARI_H4_SH(res4, res5, res6, res7, 5); 3201 SAT_SH4_SH(res0, res1, res2, res3, 7); 3202 SAT_SH4_SH(res4, res5, res6, res7, 7); 3203 PCKEV_B2_SB(res1, res0, res3, res2, out0, out1); 3204 PCKEV_B2_SB(res5, res4, res7, res6, out2, out3); 3205 out0 = __msa_aver_s_b(out0, src0); 3206 out1 = __msa_aver_s_b(out1, src2); 3207 out2 = __msa_aver_s_b(out2, src4); 3208 out3 = __msa_aver_s_b(out3, src6); 3209 XORI_B4_128_SB(out0, out1, out2, out3); 3210 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 3211 AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3); 3212 ST_UB4(dst0, dst1, dst2, dst3, dst, stride); 3213 dst += (4 * stride); 3214 } 3215} 3216 3217void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, 3218 ptrdiff_t stride) 3219{ 3220 uint32_t loop_cnt; 3221 v16u8 dst0, dst1, dst2, dst3; 3222 v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6; 3223 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11; 3224 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 3225 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 3226 v16i8 minus5b = __msa_ldi_b(-5); 3227 v16i8 plus20b = __msa_ldi_b(20); 3228 3229 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3230 mask3 = mask0 + 8; 3231 mask4 = mask1 + 8; 3232 mask5 = mask2 + 8; 3233 src -= 2; 3234 3235 for (loop_cnt = 4; loop_cnt--;) { 3236 LD_SB2(src, 16, src0, src1); 3237 src += stride; 3238 LD_SB2(src, 16, src2, src3); 3239 src += stride; 3240 LD_SB2(src, 16, src4, src5); 3241 src += stride; 3242 LD_SB2(src, 16, src6, src7); 3243 src += stride; 3244 3245 LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 3246 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3247 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3); 3248 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9); 3249 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4); 3250 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10); 3251 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5); 3252 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11); 3253 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 3254 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 3255 minus5b, res0, res1, res2, res3); 3256 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 3257 plus20b, res0, res1, res2, res3); 3258 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3); 3259 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9); 3260 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4); 3261 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10); 3262 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5); 3263 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11); 3264 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 3265 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 3266 minus5b, res4, res5, res6, res7); 3267 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 3268 plus20b, res4, res5, res6, res7); 3269 SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3, 3270 src0, src2, src4, src6); 3271 SRARI_H4_SH(res0, res1, res2, res3, 5); 3272 SRARI_H4_SH(res4, res5, res6, res7, 5); 3273 SAT_SH4_SH(res0, res1, res2, res3, 7); 3274 SAT_SH4_SH(res4, res5, res6, res7, 7); 3275 PCKEV_B2_SB(res1, res0, res3, res2, out0, out1); 3276 PCKEV_B2_SB(res5, res4, res7, res6, out2, out3); 3277 out0 = __msa_aver_s_b(out0, src0); 3278 out1 = __msa_aver_s_b(out1, src2); 3279 out2 = __msa_aver_s_b(out2, src4); 3280 out3 = __msa_aver_s_b(out3, src6); 3281 XORI_B4_128_SB(out0, out1, out2, out3); 3282 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 3283 AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3); 3284 ST_UB4(dst0, dst1, dst2, dst3, dst, stride); 3285 dst += (4 * stride); 3286 } 3287} 3288 3289void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, 3290 ptrdiff_t stride) 3291{ 3292 uint64_t tp0, tp1, tp2, tp3; 3293 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 3294 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 3295 v16i8 tmp0, tmp1, tmp2, tmp3, vec11; 3296 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 3297 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 3298 v16i8 minus5b = __msa_ldi_b(-5); 3299 v16i8 plus20b = __msa_ldi_b(20); 3300 3301 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3302 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 3303 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3304 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 3305 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 3306 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 3307 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 3308 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 3309 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 3310 res0, res1, res2, res3); 3311 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 3312 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 3313 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 3314 res0, res1, res2, res3); 3315 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 3316 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 3317 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 3318 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 3319 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 3320 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 3321 res4, res5, res6, res7); 3322 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 3323 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 3324 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 3325 res4, res5, res6, res7); 3326 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2, 3327 src0, src1, src2, src3); 3328 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2, 3329 src4, src5, src6, src7); 3330 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); 3331 PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); 3332 SRARI_H4_SH(res0, res1, res2, res3, 5); 3333 SRARI_H4_SH(res4, res5, res6, res7, 5); 3334 SAT_SH4_SH(res0, res1, res2, res3, 7); 3335 SAT_SH4_SH(res4, res5, res6, res7, 7); 3336 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1); 3337 PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3); 3338 tmp0 = __msa_aver_s_b(tmp0, src0); 3339 tmp1 = __msa_aver_s_b(tmp1, src1); 3340 tmp2 = __msa_aver_s_b(tmp2, src4); 3341 tmp3 = __msa_aver_s_b(tmp3, src5); 3342 XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); 3343 LD4(dst, stride, tp0, tp1, tp2, tp3); 3344 INSERT_D2_UB(tp0, tp1, dst0); 3345 INSERT_D2_UB(tp2, tp3, dst1); 3346 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 3347 INSERT_D2_UB(tp0, tp1, dst2); 3348 INSERT_D2_UB(tp2, tp3, dst3); 3349 AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1); 3350 AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3); 3351 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 3352} 3353 3354void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, 3355 ptrdiff_t stride) 3356{ 3357 uint64_t tp0, tp1, tp2, tp3; 3358 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 3359 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 3360 v16i8 tmp0, tmp1, tmp2, tmp3, vec11; 3361 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 3362 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 3363 v16i8 minus5b = __msa_ldi_b(-5); 3364 v16i8 plus20b = __msa_ldi_b(20); 3365 3366 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3367 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 3368 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3369 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 3370 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 3371 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 3372 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 3373 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 3374 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 3375 res0, res1, res2, res3); 3376 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 3377 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 3378 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 3379 res0, res1, res2, res3); 3380 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 3381 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 3382 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 3383 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 3384 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 3385 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 3386 res4, res5, res6, res7); 3387 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 3388 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 3389 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 3390 res4, res5, res6, res7); 3391 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3, 3392 src0, src1, src2, src3); 3393 SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3, 3394 src4, src5, src6, src7); 3395 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); 3396 PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); 3397 SRARI_H4_SH(res0, res1, res2, res3, 5); 3398 SRARI_H4_SH(res4, res5, res6, res7, 5); 3399 SAT_SH4_SH(res0, res1, res2, res3, 7); 3400 SAT_SH4_SH(res4, res5, res6, res7, 7); 3401 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1); 3402 PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3); 3403 tmp0 = __msa_aver_s_b(tmp0, src0); 3404 tmp1 = __msa_aver_s_b(tmp1, src1); 3405 tmp2 = __msa_aver_s_b(tmp2, src4); 3406 tmp3 = __msa_aver_s_b(tmp3, src5); 3407 XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); 3408 LD4(dst, stride, tp0, tp1, tp2, tp3); 3409 INSERT_D2_UB(tp0, tp1, dst0); 3410 INSERT_D2_UB(tp2, tp3, dst1); 3411 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 3412 INSERT_D2_UB(tp0, tp1, dst2); 3413 INSERT_D2_UB(tp2, tp3, dst3); 3414 AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1); 3415 AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3); 3416 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 3417} 3418 3419void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, 3420 ptrdiff_t stride) 3421{ 3422 uint32_t tp0, tp1, tp2, tp3; 3423 v16u8 dst0 = { 0 }; 3424 v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5; 3425 v16i8 mask0, mask1, mask2; 3426 v8i16 out0, out1; 3427 v16i8 minus5b = __msa_ldi_b(-5); 3428 v16i8 plus20b = __msa_ldi_b(20); 3429 3430 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 3431 LD_SB4(src - 2, stride, src0, src1, src2, src3); 3432 XORI_B4_128_SB(src0, src1, src2, src3); 3433 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 3434 HADD_SB2_SH(vec0, vec1, out0, out1); 3435 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 3436 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1); 3437 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 3438 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1); 3439 SRARI_H2_SH(out0, out1, 5); 3440 SAT_SH2_SH(out0, out1, 7); 3441 res = __msa_pckev_b((v16i8) out1, (v16i8) out0); 3442 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2, 3443 src0, src1, src2, src3); 3444 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); 3445 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 3446 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); 3447 res = __msa_aver_s_b(res, src0); 3448 res = (v16i8) __msa_xori_b((v16u8) res, 128); 3449 LW4(dst, stride, tp0, tp1, tp2, tp3); 3450 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 3451 dst0 = __msa_aver_u_b((v16u8) res, dst0); 3452 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 3453} 3454 3455void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, 3456 ptrdiff_t stride) 3457{ 3458 uint32_t tp0, tp1, tp2, tp3; 3459 v16u8 dst0 = { 0 }; 3460 v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5; 3461 v16i8 mask0, mask1, mask2; 3462 v8i16 out0, out1; 3463 v16i8 minus5b = __msa_ldi_b(-5); 3464 v16i8 plus20b = __msa_ldi_b(20); 3465 3466 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 3467 LD_SB4(src - 2, stride, src0, src1, src2, src3); 3468 XORI_B4_128_SB(src0, src1, src2, src3); 3469 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 3470 HADD_SB2_SH(vec0, vec1, out0, out1); 3471 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 3472 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1); 3473 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 3474 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1); 3475 SRARI_H2_SH(out0, out1, 5); 3476 SAT_SH2_SH(out0, out1, 7); 3477 res = __msa_pckev_b((v16i8) out1, (v16i8) out0); 3478 SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3, 3479 src0, src1, src2, src3); 3480 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); 3481 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 3482 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); 3483 res = __msa_aver_s_b(res, src0); 3484 res = (v16i8) __msa_xori_b((v16u8) res, 128); 3485 LW4(dst, stride, tp0, tp1, tp2, tp3); 3486 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 3487 dst0 = __msa_aver_u_b((v16u8) res, dst0); 3488 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 3489} 3490 3491void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, 3492 ptrdiff_t stride) 3493{ 3494 uint32_t loop_cnt; 3495 v16u8 dst0, dst1, dst2, dst3; 3496 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 3497 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 3498 v16i8 vec11; 3499 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 3500 v16i8 minus5b = __msa_ldi_b(-5); 3501 v16i8 plus20b = __msa_ldi_b(20); 3502 3503 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3504 src -= 2; 3505 3506 for (loop_cnt = 4; loop_cnt--;) { 3507 LD_SB2(src, 8, src0, src1); 3508 src += stride; 3509 LD_SB2(src, 8, src2, src3); 3510 src += stride; 3511 LD_SB2(src, 8, src4, src5); 3512 src += stride; 3513 LD_SB2(src, 8, src6, src7); 3514 src += stride; 3515 3516 LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 3517 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3518 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3); 3519 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9); 3520 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4); 3521 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10); 3522 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5); 3523 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11); 3524 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 3525 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 3526 minus5b, res0, res1, res2, res3); 3527 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 3528 plus20b, res0, res1, res2, res3); 3529 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3); 3530 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9); 3531 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4); 3532 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10); 3533 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5); 3534 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11); 3535 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 3536 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 3537 minus5b, res4, res5, res6, res7); 3538 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 3539 plus20b, res4, res5, res6, res7); 3540 SRARI_H4_SH(res0, res1, res2, res3, 5); 3541 SRARI_H4_SH(res4, res5, res6, res7, 5); 3542 SAT_SH4_SH(res0, res1, res2, res3, 7); 3543 SAT_SH4_SH(res4, res5, res6, res7, 7); 3544 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1, 3545 vec2, vec3); 3546 XORI_B4_128_SB(vec0, vec1, vec2, vec3); 3547 AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1); 3548 AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3); 3549 ST_UB4(dst0, dst1, dst2, dst3, dst, stride); 3550 dst += (4 * stride); 3551 } 3552} 3553 3554void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, 3555 ptrdiff_t stride) 3556{ 3557 uint64_t tp0, tp1, tp2, tp3; 3558 v16u8 out0, out1, out2 = { 0 }, out3 = { 0 }; 3559 v16u8 out4, out5, out6 = { 0 }, out7 = { 0 }; 3560 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 3561 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 3562 v16i8 vec11; 3563 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 3564 v16i8 minus5b = __msa_ldi_b(-5); 3565 v16i8 plus20b = __msa_ldi_b(20); 3566 3567 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3568 3569 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 3570 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3571 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 3572 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 3573 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 3574 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 3575 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 3576 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 3577 res0, res1, res2, res3); 3578 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 3579 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 3580 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 3581 res0, res1, res2, res3); 3582 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 3583 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 3584 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 3585 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 3586 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 3587 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 3588 res4, res5, res6, res7); 3589 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 3590 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 3591 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 3592 res4, res5, res6, res7); 3593 SRARI_H4_SH(res0, res1, res2, res3, 5); 3594 SRARI_H4_SH(res4, res5, res6, res7, 5); 3595 SAT_SH4_SH(res0, res1, res2, res3, 7); 3596 SAT_SH4_SH(res4, res5, res6, res7, 7); 3597 out0 = PCKEV_XORI128_UB(res0, res1); 3598 out1 = PCKEV_XORI128_UB(res2, res3); 3599 out4 = PCKEV_XORI128_UB(res4, res5); 3600 out5 = PCKEV_XORI128_UB(res6, res7); 3601 LD4(dst, stride, tp0, tp1, tp2, tp3); 3602 INSERT_D2_UB(tp0, tp1, out2); 3603 INSERT_D2_UB(tp2, tp3, out3); 3604 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 3605 INSERT_D2_UB(tp0, tp1, out6); 3606 INSERT_D2_UB(tp2, tp3, out7); 3607 AVER_UB2_UB(out0, out2, out1, out3, out0, out1); 3608 AVER_UB2_UB(out4, out6, out5, out7, out4, out5); 3609 ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 3610} 3611 3612void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, 3613 ptrdiff_t stride) 3614{ 3615 uint32_t tp0, tp1, tp2, tp3; 3616 v16u8 res, dst0 = { 0 }; 3617 v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5; 3618 v16i8 mask0, mask1, mask2; 3619 v8i16 res0, res1; 3620 v16i8 minus5b = __msa_ldi_b(-5); 3621 v16i8 plus20b = __msa_ldi_b(20); 3622 3623 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 3624 LD_SB4(src - 2, stride, src0, src1, src2, src3); 3625 XORI_B4_128_SB(src0, src1, src2, src3); 3626 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 3627 HADD_SB2_SH(vec0, vec1, res0, res1); 3628 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 3629 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); 3630 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 3631 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); 3632 SRARI_H2_SH(res0, res1, 5); 3633 SAT_SH2_SH(res0, res1, 7); 3634 res = PCKEV_XORI128_UB(res0, res1); 3635 LW4(dst, stride, tp0, tp1, tp2, tp3); 3636 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 3637 res = __msa_aver_u_b(res, dst0); 3638 ST_W4(res, 0, 1, 2, 3, dst, stride); 3639} 3640 3641void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, 3642 ptrdiff_t stride) 3643{ 3644 int32_t loop_cnt; 3645 int16_t filt_const0 = 0xfb01; 3646 int16_t filt_const1 = 0x1414; 3647 int16_t filt_const2 = 0x1fb; 3648 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; 3649 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3650 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3651 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 3652 v16i8 src65_l, src87_l, filt0, filt1, filt2; 3653 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 3654 3655 filt0 = (v16i8) __msa_fill_h(filt_const0); 3656 filt1 = (v16i8) __msa_fill_h(filt_const1); 3657 filt2 = (v16i8) __msa_fill_h(filt_const2); 3658 3659 src -= (stride * 2); 3660 3661 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3662 src += (5 * stride); 3663 3664 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3665 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3666 src32_r, src43_r); 3667 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 3668 src32_l, src43_l); 3669 3670 for (loop_cnt = 4; loop_cnt--;) { 3671 LD_SB4(src, stride, src5, src6, src7, src8); 3672 src += (4 * stride); 3673 3674 XORI_B4_128_SB(src5, src6, src7, src8); 3675 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 3676 src65_r, src76_r, src87_r); 3677 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 3678 src65_l, src76_l, src87_l); 3679 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 3680 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 3681 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 3682 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 3683 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 3684 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 3685 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 3686 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 3687 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3688 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 3689 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 3690 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 3691 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 3692 out3_r, res0, res1, res2, res3); 3693 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2); 3694 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3); 3695 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4); 3696 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5); 3697 LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 3698 XORI_B4_128_UB(res0, res1, res2, res3); 3699 AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1); 3700 AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3); 3701 ST_UB4(dst0, dst1, dst2, dst3, dst, stride); 3702 dst += (4 * stride); 3703 3704 src10_r = src54_r; 3705 src32_r = src76_r; 3706 src21_r = src65_r; 3707 src43_r = src87_r; 3708 src10_l = src54_l; 3709 src32_l = src76_l; 3710 src21_l = src65_l; 3711 src43_l = src87_l; 3712 src2 = src6; 3713 src3 = src7; 3714 src4 = src8; 3715 } 3716} 3717 3718void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, 3719 ptrdiff_t stride) 3720{ 3721 int32_t loop_cnt; 3722 int16_t filt_const0 = 0xfb01; 3723 int16_t filt_const1 = 0x1414; 3724 int16_t filt_const2 = 0x1fb; 3725 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; 3726 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3727 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3728 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 3729 v16i8 src65_l, src87_l, filt0, filt1, filt2; 3730 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 3731 3732 filt0 = (v16i8) __msa_fill_h(filt_const0); 3733 filt1 = (v16i8) __msa_fill_h(filt_const1); 3734 filt2 = (v16i8) __msa_fill_h(filt_const2); 3735 3736 src -= (stride * 2); 3737 3738 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3739 src += (5 * stride); 3740 3741 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3742 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3743 src32_r, src43_r); 3744 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 3745 src32_l, src43_l); 3746 3747 for (loop_cnt = 4; loop_cnt--;) { 3748 LD_SB4(src, stride, src5, src6, src7, src8); 3749 src += (4 * stride); 3750 3751 XORI_B4_128_SB(src5, src6, src7, src8); 3752 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 3753 src65_r, src76_r, src87_r); 3754 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 3755 src65_l, src76_l, src87_l); 3756 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 3757 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 3758 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 3759 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 3760 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 3761 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 3762 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 3763 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 3764 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3765 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 3766 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 3767 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 3768 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 3769 out3_r, res0, res1, res2, res3); 3770 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3); 3771 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4); 3772 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5); 3773 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6); 3774 LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 3775 XORI_B4_128_UB(res0, res1, res2, res3); 3776 AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1); 3777 AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3); 3778 ST_UB4(dst0, dst1, dst2, dst3, dst, stride); 3779 dst += (4 * stride); 3780 3781 src10_r = src54_r; 3782 src32_r = src76_r; 3783 src21_r = src65_r; 3784 src43_r = src87_r; 3785 src10_l = src54_l; 3786 src32_l = src76_l; 3787 src21_l = src65_l; 3788 src43_l = src87_l; 3789 src3 = src7; 3790 src4 = src8; 3791 } 3792} 3793 3794void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, 3795 ptrdiff_t stride) 3796{ 3797 uint64_t tp0, tp1, tp2, tp3; 3798 const int16_t filt_const0 = 0xfb01; 3799 const int16_t filt_const1 = 0x1414; 3800 const int16_t filt_const2 = 0x1fb; 3801 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 3802 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12; 3803 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r; 3804 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 3805 v16i8 filt0, filt1, filt2, out0, out1, out2, out3; 3806 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 3807 3808 filt0 = (v16i8) __msa_fill_h(filt_const0); 3809 filt1 = (v16i8) __msa_fill_h(filt_const1); 3810 filt2 = (v16i8) __msa_fill_h(filt_const2); 3811 3812 src -= (stride * 2); 3813 3814 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3815 src += (5 * stride); 3816 3817 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3818 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3819 src32_r, src43_r); 3820 LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14); 3821 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14); 3822 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 3823 src87_r, src98_r, src109_r); 3824 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 3825 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 3826 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 3827 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 3828 PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1); 3829 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r, 3830 src21_r, src32_r, src43_r); 3831 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2); 3832 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2); 3833 out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2); 3834 out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2); 3835 PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3); 3836 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3837 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 3838 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 3839 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 3840 3841 LD4(dst, stride, tp0, tp1, tp2, tp3); 3842 INSERT_D2_UB(tp0, tp1, dst0); 3843 INSERT_D2_UB(tp2, tp3, dst1); 3844 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 3845 INSERT_D2_UB(tp0, tp1, dst2); 3846 INSERT_D2_UB(tp2, tp3, dst3); 3847 3848 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1); 3849 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 3850 out0 = __msa_aver_s_b(out0, tmp0); 3851 out1 = __msa_aver_s_b(out1, tmp1); 3852 out2 = __msa_aver_s_b(out2, tmp2); 3853 out3 = __msa_aver_s_b(out3, tmp3); 3854 XORI_B4_128_SB(out0, out1, out2, out3); 3855 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1, 3856 dst2, dst3); 3857 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 3858} 3859 3860void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, 3861 ptrdiff_t stride) 3862{ 3863 uint64_t tp0, tp1, tp2, tp3; 3864 const int16_t filt_const0 = 0xfb01; 3865 const int16_t filt_const1 = 0x1414; 3866 const int16_t filt_const2 = 0x1fb; 3867 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 3868 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12; 3869 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r; 3870 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 3871 v16i8 filt0, filt1, filt2, out0, out1, out2, out3; 3872 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 3873 3874 filt0 = (v16i8) __msa_fill_h(filt_const0); 3875 filt1 = (v16i8) __msa_fill_h(filt_const1); 3876 filt2 = (v16i8) __msa_fill_h(filt_const2); 3877 3878 src -= (stride * 2); 3879 3880 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3881 src += (5 * stride); 3882 3883 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3884 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3885 src32_r, src43_r); 3886 LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14); 3887 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14); 3888 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 3889 src87_r, src98_r, src109_r); 3890 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 3891 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 3892 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 3893 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 3894 PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1); 3895 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r, 3896 src21_r, src32_r, src43_r); 3897 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2); 3898 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2); 3899 out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2); 3900 out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2); 3901 PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3); 3902 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3903 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 3904 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 3905 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 3906 3907 LD4(dst, stride, tp0, tp1, tp2, tp3); 3908 INSERT_D2_UB(tp0, tp1, dst0); 3909 INSERT_D2_UB(tp2, tp3, dst1); 3910 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 3911 INSERT_D2_UB(tp0, tp1, dst2); 3912 INSERT_D2_UB(tp2, tp3, dst3); 3913 3914 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1); 3915 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 3916 out0 = __msa_aver_s_b(out0, tmp0); 3917 out1 = __msa_aver_s_b(out1, tmp1); 3918 out2 = __msa_aver_s_b(out2, tmp2); 3919 out3 = __msa_aver_s_b(out3, tmp3); 3920 XORI_B4_128_SB(out0, out1, out2, out3); 3921 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1, 3922 dst2, dst3); 3923 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 3924} 3925 3926void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, 3927 ptrdiff_t stride) 3928{ 3929 uint32_t tp0, tp1, tp2, tp3; 3930 int16_t filt_const0 = 0xfb01; 3931 int16_t filt_const1 = 0x1414; 3932 int16_t filt_const2 = 0x1fb; 3933 v16u8 res, dst0 = { 0 }; 3934 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3935 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3936 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 3937 v8i16 out10, out32; 3938 3939 filt0 = (v16i8) __msa_fill_h(filt_const0); 3940 filt1 = (v16i8) __msa_fill_h(filt_const1); 3941 filt2 = (v16i8) __msa_fill_h(filt_const2); 3942 3943 src -= (stride * 2); 3944 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3945 src += (5 * stride); 3946 3947 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3948 src32_r, src43_r); 3949 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 3950 XORI_B2_128_SB(src2110, src4332); 3951 LD_SB4(src, stride, src5, src6, src7, src8); 3952 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 3953 src76_r, src87_r); 3954 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 3955 XORI_B2_128_SB(src6554, src8776); 3956 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 3957 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5); 3958 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 3959 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 3960 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 3961 SRARI_H2_SH(out10, out32, 5); 3962 SAT_SH2_SH(out10, out32, 7); 3963 LW4(dst, stride, tp0, tp1, tp2, tp3); 3964 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 3965 res = PCKEV_XORI128_UB(out10, out32); 3966 res = __msa_aver_u_b(res, (v16u8) src32_r); 3967 dst0 = __msa_aver_u_b(res, dst0); 3968 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 3969} 3970 3971void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, 3972 ptrdiff_t stride) 3973{ 3974 uint32_t tp0, tp1, tp2, tp3; 3975 int16_t filt_const0 = 0xfb01; 3976 int16_t filt_const1 = 0x1414; 3977 int16_t filt_const2 = 0x1fb; 3978 v16u8 res, dst0 = { 0 }; 3979 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3980 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3981 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 3982 v8i16 out10, out32; 3983 3984 filt0 = (v16i8) __msa_fill_h(filt_const0); 3985 filt1 = (v16i8) __msa_fill_h(filt_const1); 3986 filt2 = (v16i8) __msa_fill_h(filt_const2); 3987 3988 src -= (stride * 2); 3989 3990 LD_SB5(src, stride, src0, src1, src2, src3, src4); 3991 src += (5 * stride); 3992 3993 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3994 src32_r, src43_r); 3995 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 3996 XORI_B2_128_SB(src2110, src4332); 3997 LD_SB4(src, stride, src5, src6, src7, src8); 3998 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 3999 src76_r, src87_r); 4000 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 4001 XORI_B2_128_SB(src6554, src8776); 4002 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 4003 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 4004 SRARI_H2_SH(out10, out32, 5); 4005 SAT_SH2_SH(out10, out32, 7); 4006 LW4(dst, stride, tp0, tp1, tp2, tp3); 4007 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 4008 res = PCKEV_XORI128_UB(out10, out32); 4009 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4); 4010 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6); 4011 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 4012 res = __msa_aver_u_b(res, (v16u8) src32_r); 4013 dst0 = __msa_aver_u_b(res, dst0); 4014 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 4015} 4016 4017void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, 4018 ptrdiff_t stride) 4019{ 4020 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2, 4021 src - (stride * 2), 4022 dst, stride); 4023} 4024 4025void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, 4026 ptrdiff_t stride) 4027{ 4028 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2, 4029 src - (stride * 2) + 4030 sizeof(uint8_t), 4031 dst, stride); 4032} 4033 4034void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, 4035 ptrdiff_t stride) 4036{ 4037 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2, 4038 src - (stride * 2), 4039 dst, stride); 4040} 4041 4042void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, 4043 ptrdiff_t stride) 4044{ 4045 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2, 4046 src - (stride * 2) + 4047 sizeof(uint8_t), 4048 dst, stride); 4049} 4050 4051void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, 4052 ptrdiff_t stride) 4053{ 4054 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, 4055 src - (stride * 2), 4056 dst, stride); 4057} 4058 4059void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, 4060 ptrdiff_t stride) 4061{ 4062 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, 4063 src - (stride * 2) + 4064 sizeof(uint8_t), dst, stride); 4065} 4066 4067void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, 4068 ptrdiff_t stride) 4069{ 4070 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, 4071 src - (stride * 2), 4072 dst, stride); 4073} 4074 4075void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, 4076 ptrdiff_t stride) 4077{ 4078 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, 4079 src - (stride * 2) + 4080 sizeof(uint8_t), dst, stride); 4081} 4082 4083 4084void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, 4085 ptrdiff_t stride) 4086{ 4087 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, 4088 src - (stride * 2), 4089 dst, stride); 4090} 4091 4092void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, 4093 ptrdiff_t stride) 4094{ 4095 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, 4096 src - (stride * 2) + 4097 sizeof(uint8_t), dst, stride); 4098} 4099 4100void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, 4101 ptrdiff_t stride) 4102{ 4103 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, 4104 src - (stride * 2), 4105 dst, stride); 4106} 4107 4108void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, 4109 ptrdiff_t stride) 4110{ 4111 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, 4112 src - (stride * 2) + 4113 sizeof(uint8_t), dst, stride); 4114} 4115 4116void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, 4117 ptrdiff_t stride) 4118{ 4119 uint64_t tp0, tp1, tp2, tp3; 4120 uint8_t *dst_tmp = dst; 4121 const uint8_t *src_tmp = src - (2 * stride) - 2; 4122 uint32_t multiple8_cnt, loop_cnt; 4123 const int32_t filt_const0 = 0xfffb0001; 4124 const int32_t filt_const1 = 0x140014; 4125 const int32_t filt_const2 = 0x1fffb; 4126 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 4127 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 4128 v16i8 mask2; 4129 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4130 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4131 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4132 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 4133 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 4134 v8i16 hz_out87_l, filt0, filt1, filt2; 4135 v4i32 tmp0_w, tmp1_w; 4136 4137 filt0 = (v8i16) __msa_fill_w(filt_const0); 4138 filt1 = (v8i16) __msa_fill_w(filt_const1); 4139 filt2 = (v8i16) __msa_fill_w(filt_const2); 4140 4141 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4142 4143 for (multiple8_cnt = 2; multiple8_cnt--;) { 4144 dst = dst_tmp; 4145 src = src_tmp; 4146 4147 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4148 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4149 src += (5 * stride); 4150 4151 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4152 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4153 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4154 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4155 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4156 4157 for (loop_cnt = 4; loop_cnt--;) { 4158 LD_SB2(src, stride, src5, src6); 4159 src += (2 * stride); 4160 4161 XORI_B2_128_SB(src5, src6); 4162 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4163 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4164 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4165 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 4166 hz_out43_r); 4167 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4168 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 4169 hz_out43_l); 4170 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, 4171 hz_out65_r); 4172 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, 4173 hz_out65_l); 4174 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 4175 filt1, filt2); 4176 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 4177 filt1, filt2); 4178 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4179 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 4180 filt1, filt2); 4181 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 4182 filt1, filt2); 4183 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4184 4185 tmp1 = __msa_srari_h(hz_out2, 5); 4186 tmp3 = __msa_srari_h(hz_out3, 5); 4187 SAT_SH2_SH(tmp1, tmp3, 7); 4188 4189 tmp0 = __msa_aver_s_h(tmp0, tmp1); 4190 tmp1 = __msa_aver_s_h(tmp2, tmp3); 4191 4192 LD2(dst, stride, tp0, tp1); 4193 INSERT_D2_UB(tp0, tp1, dst0); 4194 4195 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4196 dst0 = __msa_aver_u_b(out0, dst0); 4197 ST_D2(dst0, 0, 1, dst, stride); 4198 dst += (2 * stride); 4199 4200 LD_SB2(src, stride, src7, src8); 4201 src += (2 * stride); 4202 4203 XORI_B2_128_SB(src7, src8); 4204 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4205 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4206 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r, 4207 hz_out87_r); 4208 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l, 4209 hz_out87_l); 4210 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 4211 filt1, filt2); 4212 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 4213 filt1, filt2); 4214 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4215 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 4216 filt1, filt2); 4217 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 4218 filt1, filt2); 4219 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4220 4221 tmp5 = __msa_srari_h(hz_out4, 5); 4222 tmp7 = __msa_srari_h(hz_out5, 5); 4223 SAT_SH2_SH(tmp5, tmp7, 7); 4224 4225 tmp2 = __msa_aver_s_h(tmp4, tmp5); 4226 tmp3 = __msa_aver_s_h(tmp6, tmp7); 4227 4228 LD2(dst, stride, tp2, tp3); 4229 INSERT_D2_UB(tp2, tp3, dst1); 4230 4231 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4232 dst1 = __msa_aver_u_b(out1, dst1); 4233 ST_D2(dst1, 0, 1, dst, stride); 4234 dst += (2 * stride); 4235 4236 hz_out0 = hz_out4; 4237 hz_out1 = hz_out5; 4238 hz_out2 = hz_out6; 4239 hz_out3 = hz_out7; 4240 hz_out4 = hz_out8; 4241 } 4242 4243 src_tmp += 8; 4244 dst_tmp += 8; 4245 } 4246} 4247 4248void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, 4249 ptrdiff_t stride) 4250{ 4251 uint64_t tp0, tp1, tp2, tp3; 4252 uint8_t *dst_tmp = dst; 4253 const uint8_t *src_tmp = src - (2 * stride) - 2; 4254 uint32_t multiple8_cnt, loop_cnt; 4255 const int32_t filt_const0 = 0xfffb0001; 4256 const int32_t filt_const1 = 0x140014; 4257 const int32_t filt_const2 = 0x1fffb; 4258 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 4259 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 4260 v16i8 mask2; 4261 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4262 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4263 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4264 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 4265 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 4266 v8i16 hz_out87_l, filt0, filt1, filt2; 4267 v4i32 tmp0_w, tmp1_w; 4268 4269 filt0 = (v8i16) __msa_fill_w(filt_const0); 4270 filt1 = (v8i16) __msa_fill_w(filt_const1); 4271 filt2 = (v8i16) __msa_fill_w(filt_const2); 4272 4273 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4274 4275 for (multiple8_cnt = 2; multiple8_cnt--;) { 4276 dst = dst_tmp; 4277 src = src_tmp; 4278 4279 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4280 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4281 src += (5 * stride); 4282 4283 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4284 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4285 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4286 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4287 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4288 4289 for (loop_cnt = 4; loop_cnt--;) { 4290 LD_SB2(src, stride, src5, src6); 4291 src += (2 * stride); 4292 4293 XORI_B2_128_SB(src5, src6); 4294 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4295 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4296 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4297 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 4298 hz_out43_r); 4299 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4300 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 4301 hz_out43_l); 4302 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r); 4303 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l); 4304 4305 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 4306 filt1, filt2); 4307 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 4308 filt1, filt2); 4309 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4310 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 4311 filt1, filt2); 4312 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 4313 filt1, filt2); 4314 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4315 4316 tmp1 = __msa_srari_h(hz_out3, 5); 4317 tmp3 = __msa_srari_h(hz_out4, 5); 4318 SAT_SH2_SH(tmp1, tmp3, 7); 4319 4320 tmp0 = __msa_aver_s_h(tmp0, tmp1); 4321 tmp1 = __msa_aver_s_h(tmp2, tmp3); 4322 4323 LD2(dst, stride, tp0, tp1); 4324 INSERT_D2_UB(tp0, tp1, dst0); 4325 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4326 dst0 = __msa_aver_u_b(out0, dst0); 4327 ST_D2(dst0, 0, 1, dst, stride); 4328 dst += (2 * stride); 4329 4330 LD_SB2(src, stride, src7, src8); 4331 src += (2 * stride); 4332 4333 XORI_B2_128_SB(src7, src8); 4334 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4335 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4336 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r, 4337 hz_out87_r); 4338 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l, 4339 hz_out87_l); 4340 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 4341 filt1, filt2); 4342 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 4343 filt1, filt2); 4344 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4345 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 4346 filt1, filt2); 4347 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 4348 filt1, filt2); 4349 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4350 4351 tmp5 = __msa_srari_h(hz_out5, 5); 4352 tmp7 = __msa_srari_h(hz_out6, 5); 4353 SAT_SH2_SH(tmp5, tmp7, 7); 4354 4355 tmp2 = __msa_aver_s_h(tmp4, tmp5); 4356 tmp3 = __msa_aver_s_h(tmp6, tmp7); 4357 4358 LD2(dst, stride, tp2, tp3); 4359 INSERT_D2_UB(tp2, tp3, dst1); 4360 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4361 dst1 = __msa_aver_u_b(out1, dst1); 4362 ST_D2(dst1, 0, 1, dst, stride); 4363 dst += (2 * stride); 4364 4365 hz_out0 = hz_out4; 4366 hz_out1 = hz_out5; 4367 hz_out2 = hz_out6; 4368 hz_out3 = hz_out7; 4369 hz_out4 = hz_out8; 4370 } 4371 4372 src_tmp += 8; 4373 dst_tmp += 8; 4374 } 4375} 4376 4377void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, 4378 ptrdiff_t stride) 4379{ 4380 const int32_t filt_const0 = 0xfffb0001; 4381 const int32_t filt_const1 = 0x140014; 4382 const int32_t filt_const2 = 0x1fffb; 4383 uint64_t tp0, tp1, tp2, tp3; 4384 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1; 4385 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4386 v16i8 src11, src12, mask0, mask1, mask2; 4387 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4388 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 4389 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4390 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 4391 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3; 4392 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 4393 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 4394 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 4395 v4i32 tmp0_w, tmp1_w; 4396 4397 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4398 4399 filt0 = (v8i16) __msa_fill_w(filt_const0); 4400 filt1 = (v8i16) __msa_fill_w(filt_const1); 4401 filt2 = (v8i16) __msa_fill_w(filt_const2); 4402 4403 src -= ((2 * stride) + 2); 4404 4405 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4406 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4407 src += (5 * stride); 4408 4409 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4410 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4411 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4412 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4413 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4414 4415 LD_SB4(src, stride, src5, src6, src7, src8); 4416 src += (4 * stride); 4417 XORI_B4_128_SB(src5, src6, src7, src8); 4418 4419 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4420 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4421 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4422 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4423 4424 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4425 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4426 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4427 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 4428 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4429 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4430 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4431 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 4432 4433 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4434 filt2); 4435 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 4436 filt2); 4437 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4438 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4439 filt2); 4440 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 4441 filt2); 4442 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4443 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4444 filt2); 4445 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 4446 filt2); 4447 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4448 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4449 filt2); 4450 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 4451 filt2); 4452 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4453 4454 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5); 4455 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7); 4456 4457 LD4(dst, stride, tp0, tp1, tp2, tp3); 4458 INSERT_D2_UB(tp0, tp1, dst0); 4459 INSERT_D2_UB(tp2, tp3, dst1); 4460 4461 tmp0 = __msa_aver_s_h(tmp0, hz_out2); 4462 tmp1 = __msa_aver_s_h(tmp1, hz_out3); 4463 tmp2 = __msa_aver_s_h(tmp2, hz_out4); 4464 tmp3 = __msa_aver_s_h(tmp3, hz_out5); 4465 4466 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4467 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4468 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4469 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 4470 dst += (4 * stride); 4471 4472 LD_SB4(src, stride, src9, src10, src11, src12); 4473 XORI_B4_128_SB(src9, src10, src11, src12); 4474 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 4475 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 4476 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 4477 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 4478 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4479 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 4480 hz_out1211_r); 4481 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4482 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 4483 hz_out1211_l); 4484 tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 4485 filt2); 4486 tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 4487 filt2); 4488 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4489 tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 4490 filt2); 4491 tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 4492 filt2); 4493 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4494 tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 4495 filt2); 4496 tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 4497 filt2); 4498 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4499 tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 4500 filt2); 4501 tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 4502 filt2); 4503 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4504 4505 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5); 4506 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7); 4507 4508 LD4(dst, stride, tp0, tp1, tp2, tp3); 4509 INSERT_D2_UB(tp0, tp1, dst0); 4510 INSERT_D2_UB(tp2, tp3, dst1); 4511 4512 tmp0 = __msa_aver_s_h(tmp0, hz_out6); 4513 tmp1 = __msa_aver_s_h(tmp1, hz_out7); 4514 tmp2 = __msa_aver_s_h(tmp2, hz_out8); 4515 tmp3 = __msa_aver_s_h(tmp3, hz_out9); 4516 4517 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4518 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4519 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4520 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 4521} 4522 4523void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, 4524 ptrdiff_t stride) 4525{ 4526 const int32_t filt_const0 = 0xfffb0001; 4527 const int32_t filt_const1 = 0x140014; 4528 const int32_t filt_const2 = 0x1fffb; 4529 uint64_t tp0, tp1, tp2, tp3; 4530 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1; 4531 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4532 v16i8 src11, src12, mask0, mask1, mask2; 4533 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4534 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 4535 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4536 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 4537 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3; 4538 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 4539 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 4540 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 4541 v4i32 tmp0_w, tmp1_w; 4542 4543 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4544 4545 filt0 = (v8i16) __msa_fill_w(filt_const0); 4546 filt1 = (v8i16) __msa_fill_w(filt_const1); 4547 filt2 = (v8i16) __msa_fill_w(filt_const2); 4548 4549 src -= ((2 * stride) + 2); 4550 4551 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4552 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4553 src += (5 * stride); 4554 4555 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4556 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4557 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4558 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4559 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4560 4561 LD_SB4(src, stride, src5, src6, src7, src8); 4562 src += (4 * stride); 4563 XORI_B4_128_SB(src5, src6, src7, src8); 4564 4565 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4566 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4567 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4568 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4569 4570 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4571 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4572 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4573 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 4574 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4575 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4576 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4577 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 4578 4579 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4580 filt2); 4581 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 4582 filt2); 4583 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4584 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4585 filt2); 4586 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 4587 filt2); 4588 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4589 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4590 filt2); 4591 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 4592 filt2); 4593 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4594 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4595 filt2); 4596 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 4597 filt2); 4598 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4599 4600 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5); 4601 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7); 4602 4603 LD4(dst, stride, tp0, tp1, tp2, tp3); 4604 INSERT_D2_UB(tp0, tp1, dst0); 4605 INSERT_D2_UB(tp2, tp3, dst1); 4606 4607 tmp0 = __msa_aver_s_h(tmp0, hz_out3); 4608 tmp1 = __msa_aver_s_h(tmp1, hz_out4); 4609 tmp2 = __msa_aver_s_h(tmp2, hz_out5); 4610 tmp3 = __msa_aver_s_h(tmp3, hz_out6); 4611 4612 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4613 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4614 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4615 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 4616 dst += (4 * stride); 4617 4618 LD_SB4(src, stride, src9, src10, src11, src12); 4619 XORI_B4_128_SB(src9, src10, src11, src12); 4620 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 4621 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 4622 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 4623 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 4624 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4625 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 4626 hz_out1211_r); 4627 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4628 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 4629 hz_out1211_l); 4630 tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 4631 filt2); 4632 tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 4633 filt2); 4634 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4635 tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 4636 filt2); 4637 tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 4638 filt2); 4639 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4640 tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 4641 filt2); 4642 tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 4643 filt2); 4644 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4645 tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 4646 filt2); 4647 tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 4648 filt2); 4649 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4650 4651 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5); 4652 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7); 4653 4654 LD4(dst, stride, tp0, tp1, tp2, tp3); 4655 INSERT_D2_UB(tp0, tp1, dst0); 4656 INSERT_D2_UB(tp2, tp3, dst1); 4657 4658 tmp0 = __msa_aver_s_h(tmp0, hz_out7); 4659 tmp1 = __msa_aver_s_h(tmp1, hz_out8); 4660 tmp2 = __msa_aver_s_h(tmp2, hz_out9); 4661 tmp3 = __msa_aver_s_h(tmp3, hz_out10); 4662 4663 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4664 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4665 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4666 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 4667} 4668 4669void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, 4670 ptrdiff_t stride) 4671{ 4672 uint32_t tp0, tp1, tp2, tp3; 4673 const int32_t filt_const0 = 0xfffb0001; 4674 const int32_t filt_const1 = 0x140014; 4675 const int32_t filt_const2 = 0x1fffb; 4676 v16u8 res, out = { 0 }; 4677 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4678 v16i8 mask0, mask1, mask2; 4679 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4680 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 4681 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4682 v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 4683 v4i32 tmp0, tmp1; 4684 4685 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 4686 4687 filt0 = (v8i16) __msa_fill_w(filt_const0); 4688 filt1 = (v8i16) __msa_fill_w(filt_const1); 4689 filt2 = (v8i16) __msa_fill_w(filt_const2); 4690 4691 src -= ((2 * stride) + 2); 4692 4693 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4694 src += (5 * stride); 4695 LD_SB4(src, stride, src5, src6, src7, src8); 4696 4697 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4698 XORI_B4_128_SB(src5, src6, src7, src8); 4699 4700 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 4701 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 4702 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 4703 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 4704 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4705 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 4706 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 4707 4708 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4709 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4710 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4711 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4712 4713 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4714 filt2); 4715 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4716 filt2); 4717 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4718 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4719 filt2); 4720 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4721 filt2); 4722 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4723 4724 SRARI_H2_SH(hz_out2, hz_out4, 5); 4725 SAT_SH2_SH(hz_out2, hz_out4, 7); 4726 4727 dst0 = __msa_aver_s_h(dst0, hz_out2); 4728 dst1 = __msa_aver_s_h(dst1, hz_out4); 4729 LW4(dst, stride, tp0, tp1, tp2, tp3); 4730 INSERT_W4_UB(tp0, tp1, tp2, tp3, out); 4731 res = PCKEV_XORI128_UB(dst0, dst1); 4732 res = __msa_aver_u_b(res, out); 4733 ST_W4(res, 0, 1, 2, 3, dst, stride); 4734} 4735 4736void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, 4737 ptrdiff_t stride) 4738{ 4739 const int32_t filt_const0 = 0xfffb0001; 4740 const int32_t filt_const1 = 0x140014; 4741 const int32_t filt_const2 = 0x1fffb; 4742 uint32_t tp0, tp1, tp2, tp3; 4743 v16u8 res, out = { 0 }; 4744 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4745 v16i8 mask0, mask1, mask2; 4746 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4747 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 4748 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4749 v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 4750 v4i32 tmp0, tmp1; 4751 4752 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 4753 4754 filt0 = (v8i16) __msa_fill_w(filt_const0); 4755 filt1 = (v8i16) __msa_fill_w(filt_const1); 4756 filt2 = (v8i16) __msa_fill_w(filt_const2); 4757 4758 src -= ((2 * stride) + 2); 4759 4760 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4761 src += (5 * stride); 4762 LD_SB4(src, stride, src5, src6, src7, src8); 4763 4764 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4765 XORI_B4_128_SB(src5, src6, src7, src8); 4766 4767 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 4768 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 4769 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 4770 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 4771 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4772 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 4773 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 4774 4775 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4776 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4777 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4778 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4779 4780 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4781 filt2); 4782 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4783 filt2); 4784 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4785 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4786 filt2); 4787 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4788 filt2); 4789 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4790 4791 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1); 4792 SRARI_H2_SH(hz_out0, hz_out1, 5); 4793 SAT_SH2_SH(hz_out0, hz_out1, 7); 4794 4795 dst0 = __msa_aver_s_h(dst0, hz_out0); 4796 dst1 = __msa_aver_s_h(dst1, hz_out1); 4797 LW4(dst, stride, tp0, tp1, tp2, tp3); 4798 INSERT_W4_UB(tp0, tp1, tp2, tp3, out); 4799 res = PCKEV_XORI128_UB(dst0, dst1); 4800 res = __msa_aver_u_b(res, out); 4801 ST_W4(res, 0, 1, 2, 3, dst, stride); 4802} 4803 4804void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, 4805 ptrdiff_t stride) 4806{ 4807 int32_t loop_cnt; 4808 int16_t filt_const0 = 0xfb01; 4809 int16_t filt_const1 = 0x1414; 4810 int16_t filt_const2 = 0x1fb; 4811 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; 4812 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4813 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 4814 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 4815 v16i8 src65_l, src87_l, filt0, filt1, filt2; 4816 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 4817 4818 filt0 = (v16i8) __msa_fill_h(filt_const0); 4819 filt1 = (v16i8) __msa_fill_h(filt_const1); 4820 filt2 = (v16i8) __msa_fill_h(filt_const2); 4821 src -= (stride * 2); 4822 4823 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4824 src += (5 * stride); 4825 4826 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4827 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 4828 src32_r, src43_r); 4829 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 4830 src32_l, src43_l); 4831 4832 for (loop_cnt = 4; loop_cnt--;) { 4833 LD_SB4(src, stride, src5, src6, src7, src8); 4834 src += (4 * stride); 4835 4836 XORI_B4_128_SB(src5, src6, src7, src8); 4837 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 4838 src65_r, src76_r, src87_r); 4839 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 4840 src65_l, src76_l, src87_l); 4841 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 4842 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 4843 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 4844 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 4845 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 4846 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 4847 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 4848 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 4849 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 4850 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 4851 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 4852 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 4853 LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 4854 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 4855 out3_r, res0, res1, res2, res3); 4856 XORI_B4_128_UB(res0, res1, res2, res3); 4857 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 4858 AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3); 4859 ST_UB4(res0, res1, res2, res3, dst, stride); 4860 dst += (4 * stride); 4861 4862 src10_r = src54_r; 4863 src32_r = src76_r; 4864 src21_r = src65_r; 4865 src43_r = src87_r; 4866 src10_l = src54_l; 4867 src32_l = src76_l; 4868 src21_l = src65_l; 4869 src43_l = src87_l; 4870 src4 = src8; 4871 } 4872} 4873 4874void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, 4875 ptrdiff_t stride) 4876{ 4877 uint64_t tp0, tp1, tp2, tp3; 4878 const int16_t filt_const0 = 0xfb01; 4879 const int16_t filt_const1 = 0x1414; 4880 const int16_t filt_const2 = 0x1fb; 4881 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 4882 v16u8 out0, out1, out2, out3; 4883 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r; 4884 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 4885 v16i8 filt0, filt1, filt2; 4886 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 4887 4888 filt0 = (v16i8) __msa_fill_h(filt_const0); 4889 filt1 = (v16i8) __msa_fill_h(filt_const1); 4890 filt2 = (v16i8) __msa_fill_h(filt_const2); 4891 4892 src -= (stride * 2); 4893 4894 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4895 src += (5 * stride); 4896 4897 XORI_B5_128_SB(src0, src1, src2, src3, src4); 4898 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 4899 src32_r, src43_r); 4900 4901 LD_SB4(src, stride, src7, src8, src9, src10); 4902 src += (4 * stride); 4903 XORI_B4_128_SB(src7, src8, src9, src10); 4904 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 4905 src87_r, src98_r, src109_r); 4906 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 4907 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 4908 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 4909 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 4910 4911 LD_SB4(src, stride, src0, src1, src2, src3); 4912 XORI_B4_128_SB(src0, src1, src2, src3); 4913 ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r, 4914 src21_r, src32_r, src43_r); 4915 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2); 4916 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2); 4917 out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2); 4918 out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2); 4919 4920 LD4(dst, stride, tp0, tp1, tp2, tp3); 4921 INSERT_D2_UB(tp0, tp1, dst0); 4922 INSERT_D2_UB(tp2, tp3, dst1); 4923 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 4924 INSERT_D2_UB(tp0, tp1, dst2); 4925 INSERT_D2_UB(tp2, tp3, dst3); 4926 4927 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 4928 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 4929 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 4930 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 4931 out0 = PCKEV_XORI128_UB(out0_r, out1_r); 4932 out1 = PCKEV_XORI128_UB(out2_r, out3_r); 4933 out2 = PCKEV_XORI128_UB(out4_r, out5_r); 4934 out3 = PCKEV_XORI128_UB(out6_r, out7_r); 4935 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1, 4936 dst2, dst3); 4937 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 4938} 4939 4940void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, 4941 ptrdiff_t stride) 4942{ 4943 uint32_t tp0, tp1, tp2, tp3; 4944 int16_t filt_const0 = 0xfb01; 4945 int16_t filt_const1 = 0x1414; 4946 int16_t filt_const2 = 0x1fb; 4947 v16u8 res, dst0 = { 0 }; 4948 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4949 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 4950 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 4951 v8i16 out10, out32; 4952 4953 filt0 = (v16i8) __msa_fill_h(filt_const0); 4954 filt1 = (v16i8) __msa_fill_h(filt_const1); 4955 filt2 = (v16i8) __msa_fill_h(filt_const2); 4956 4957 src -= (stride * 2); 4958 LD_SB5(src, stride, src0, src1, src2, src3, src4); 4959 src += (5 * stride); 4960 4961 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 4962 src32_r, src43_r); 4963 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 4964 XORI_B2_128_SB(src2110, src4332); 4965 LD_SB4(src, stride, src5, src6, src7, src8); 4966 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 4967 src76_r, src87_r); 4968 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 4969 XORI_B2_128_SB(src6554, src8776); 4970 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 4971 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 4972 SRARI_H2_SH(out10, out32, 5); 4973 SAT_SH2_SH(out10, out32, 7); 4974 LW4(dst, stride, tp0, tp1, tp2, tp3); 4975 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 4976 res = PCKEV_XORI128_UB(out10, out32); 4977 dst0 = __msa_aver_u_b(res, dst0); 4978 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 4979} 4980 4981void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, 4982 ptrdiff_t stride) 4983{ 4984 uint32_t row; 4985 v16u8 out, dst0; 4986 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4987 v16i8 src11; 4988 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3; 4989 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 4990 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 4991 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 4992 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 4993 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 4994 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 4995 v8i16 minus5h = __msa_ldi_h(-5); 4996 v8i16 plus20h = __msa_ldi_h(20); 4997 4998 mask3 = mask0 + 4; 4999 mask4 = mask1 + 4; 5000 mask5 = mask2 + 4; 5001 5002 src -= ((2 * stride) + 2); 5003 5004 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5005 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 5006 src += (5 * stride); 5007 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5008 XORI_B5_128_SB(src7, src8, src9, src10, src11); 5009 5010 for (row = 16; row--;) { 5011 LD_SB2(src, 8, src5, src6); 5012 src += stride; 5013 XORI_B2_128_SB(src5, src6); 5014 dst0 = LD_UB(dst); 5015 5016 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5017 vt_res0, vt_res1); 5018 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 5019 vt_res2, vt_res3); 5020 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5021 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5022 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5023 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5024 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5025 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5026 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 5027 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5028 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5029 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5030 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5031 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5032 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5033 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5034 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5035 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5036 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5037 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5038 tmp0 = __msa_srari_h(shf_vec2, 5); 5039 tmp1 = __msa_srari_h(shf_vec5, 5); 5040 tmp2 = __msa_srari_h(shf_vec8, 5); 5041 tmp3 = __msa_srari_h(shf_vec11, 5); 5042 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5043 PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 5044 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5045 tmp0 = __msa_aver_s_h(tmp2, tmp0); 5046 tmp1 = __msa_aver_s_h(tmp3, tmp1); 5047 out = PCKEV_XORI128_UB(tmp0, tmp1); 5048 out = __msa_aver_u_b(out, dst0); 5049 ST_UB(out, dst); 5050 dst += stride; 5051 5052 src0 = src1; 5053 src1 = src2; 5054 src2 = src3; 5055 src3 = src4; 5056 src4 = src5; 5057 src7 = src8; 5058 src8 = src9; 5059 src9 = src10; 5060 src10 = src11; 5061 src11 = src6; 5062 } 5063} 5064 5065void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, 5066 ptrdiff_t stride) 5067{ 5068 uint32_t row; 5069 v16u8 out, dst0; 5070 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 5071 v16i8 src11; 5072 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3; 5073 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5074 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 5075 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5076 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5077 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5078 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5079 v8i16 minus5h = __msa_ldi_h(-5); 5080 v8i16 plus20h = __msa_ldi_h(20); 5081 5082 mask3 = mask0 + 4; 5083 mask4 = mask1 + 4; 5084 mask5 = mask2 + 4; 5085 5086 src -= ((2 * stride) + 2); 5087 5088 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5089 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 5090 src += (5 * stride); 5091 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5092 XORI_B5_128_SB(src7, src8, src9, src10, src11); 5093 5094 for (row = 16; row--;) { 5095 LD_SB2(src, 8, src5, src6); 5096 src += stride; 5097 XORI_B2_128_SB(src5, src6); 5098 dst0 = LD_UB(dst); 5099 5100 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5101 vt_res0, vt_res1); 5102 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 5103 vt_res2, vt_res3); 5104 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5105 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5106 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5107 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5108 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5109 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5110 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 5111 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5112 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5113 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5114 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5115 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5116 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5117 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5118 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5119 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5120 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5121 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5122 tmp0 = __msa_srari_h(shf_vec2, 5); 5123 tmp1 = __msa_srari_h(shf_vec5, 5); 5124 tmp2 = __msa_srari_h(shf_vec8, 5); 5125 tmp3 = __msa_srari_h(shf_vec11, 5); 5126 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5127 tmp0 = __msa_pckod_h(tmp2, tmp0); 5128 tmp1 = __msa_pckod_h(tmp3, tmp1); 5129 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5130 tmp0 = __msa_aver_s_h(tmp2, tmp0); 5131 tmp1 = __msa_aver_s_h(tmp3, tmp1); 5132 out = PCKEV_XORI128_UB(tmp0, tmp1); 5133 out = __msa_aver_u_b(out, dst0); 5134 ST_UB(out, dst); 5135 dst += stride; 5136 5137 src0 = src1; 5138 src1 = src2; 5139 src2 = src3; 5140 src3 = src4; 5141 src4 = src5; 5142 src7 = src8; 5143 src8 = src9; 5144 src9 = src10; 5145 src10 = src11; 5146 src11 = src6; 5147 } 5148} 5149 5150void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, 5151 ptrdiff_t stride) 5152{ 5153 uint32_t row; 5154 uint64_t tp0, tp1; 5155 v16u8 out, dst0 = { 0 }; 5156 v16i8 src0, src1, src2, src3, src4, src5, src6; 5157 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3; 5158 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5159 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 5160 v8i16 mask3, mask4, mask5; 5161 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5162 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5163 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5164 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5165 v8i16 minus5h = __msa_ldi_h(-5); 5166 v8i16 plus20h = __msa_ldi_h(20); 5167 5168 mask3 = mask0 + 4; 5169 mask4 = mask1 + 4; 5170 mask5 = mask2 + 4; 5171 5172 src -= ((2 * stride) + 2); 5173 5174 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5175 src += (5 * stride); 5176 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5177 5178 for (row = 4; row--;) { 5179 LD_SB2(src, stride, src5, src6); 5180 src += (2 * stride); 5181 XORI_B2_128_SB(src5, src6); 5182 5183 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5184 vt_res0, vt_res1); 5185 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 5186 vt_res2, vt_res3); 5187 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5188 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5189 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5190 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5191 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5192 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5193 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 5194 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5195 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5196 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5197 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5198 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5199 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5200 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5201 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5202 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5203 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5204 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5205 tmp0 = __msa_srari_h(shf_vec2, 5); 5206 tmp1 = __msa_srari_h(shf_vec5, 5); 5207 tmp2 = __msa_srari_h(shf_vec8, 5); 5208 tmp3 = __msa_srari_h(shf_vec11, 5); 5209 LD2(dst, stride, tp0, tp1); 5210 INSERT_D2_UB(tp0, tp1, dst0); 5211 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5212 PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 5213 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5214 tmp0 = __msa_aver_s_h(tmp2, tmp0); 5215 tmp1 = __msa_aver_s_h(tmp3, tmp1); 5216 out = PCKEV_XORI128_UB(tmp0, tmp1); 5217 out = __msa_aver_u_b(out, dst0); 5218 ST_D2(out, 0, 1, dst, stride); 5219 dst += (2 * stride); 5220 5221 src0 = src2; 5222 src1 = src3; 5223 src2 = src4; 5224 src3 = src5; 5225 src4 = src6; 5226 } 5227} 5228 5229void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, 5230 ptrdiff_t stride) 5231{ 5232 uint32_t row; 5233 uint64_t tp0, tp1; 5234 v16u8 out, dst0 = { 0 }; 5235 v16i8 src0, src1, src2, src3, src4, src5, src6; 5236 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3; 5237 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5238 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 5239 v8i16 mask3, mask4, mask5; 5240 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5241 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5242 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5243 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5244 v8i16 minus5h = __msa_ldi_h(-5); 5245 v8i16 plus20h = __msa_ldi_h(20); 5246 5247 mask3 = mask0 + 4; 5248 mask4 = mask1 + 4; 5249 mask5 = mask2 + 4; 5250 5251 src -= ((2 * stride) + 2); 5252 5253 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5254 src += (5 * stride); 5255 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5256 5257 for (row = 4; row--;) { 5258 LD_SB2(src, stride, src5, src6); 5259 src += (2 * stride); 5260 XORI_B2_128_SB(src5, src6); 5261 5262 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5263 vt_res0, vt_res1); 5264 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 5265 vt_res2, vt_res3); 5266 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5267 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5268 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5269 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5270 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5271 mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5272 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 5273 mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5274 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5275 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5276 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5277 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5278 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5279 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5280 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5281 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5282 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5283 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5284 tmp0 = __msa_srari_h(shf_vec2, 5); 5285 tmp1 = __msa_srari_h(shf_vec5, 5); 5286 tmp2 = __msa_srari_h(shf_vec8, 5); 5287 tmp3 = __msa_srari_h(shf_vec11, 5); 5288 LD2(dst, stride, tp0, tp1); 5289 INSERT_D2_UB(tp0, tp1, dst0); 5290 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5291 tmp0 = __msa_pckod_h(tmp2, tmp0); 5292 tmp1 = __msa_pckod_h(tmp3, tmp1); 5293 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5294 tmp0 = __msa_aver_s_h(tmp2, tmp0); 5295 tmp1 = __msa_aver_s_h(tmp3, tmp1); 5296 out = PCKEV_XORI128_UB(tmp0, tmp1); 5297 out = __msa_aver_u_b(out, dst0); 5298 ST_D2(out, 0, 1, dst, stride); 5299 dst += (2 * stride); 5300 5301 src0 = src2; 5302 src1 = src3; 5303 src2 = src4; 5304 src3 = src5; 5305 src4 = src6; 5306 } 5307} 5308 5309void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, 5310 ptrdiff_t stride) 5311{ 5312 uint32_t tp0, tp1, tp2, tp3; 5313 const int16_t filt_const0 = 0xfb01; 5314 const int16_t filt_const1 = 0x1414; 5315 const int16_t filt_const2 = 0x1fb; 5316 v16u8 out, dstv = { 0 }; 5317 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 5318 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 5319 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l; 5320 v16i8 src76_l, src87_l, filt0, filt1, filt2; 5321 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 5322 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5323 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5324 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5325 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5326 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5327 v8i16 minus5h = __msa_ldi_h(-5); 5328 v8i16 plus20h = __msa_ldi_h(20); 5329 v8i16 zeros = { 0 }; 5330 5331 filt0 = (v16i8) __msa_fill_h(filt_const0); 5332 filt1 = (v16i8) __msa_fill_h(filt_const1); 5333 filt2 = (v16i8) __msa_fill_h(filt_const2); 5334 5335 src -= ((2 * stride) + 2); 5336 5337 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5338 src += (5 * stride); 5339 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5340 LD_SB4(src, stride, src5, src6, src7, src8); 5341 XORI_B4_128_SB(src5, src6, src7, src8); 5342 5343 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 5344 src32_r, src43_r); 5345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 5346 src76_r, src87_r); 5347 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 5348 src32_l, src43_l); 5349 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 5350 src76_l, src87_l); 5351 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 5352 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 5353 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 5354 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 5355 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5356 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5357 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5358 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5359 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5360 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5361 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5362 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5363 5364 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 5365 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 5366 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 5367 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 5368 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5369 mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 5370 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5371 mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 5372 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5373 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 5374 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5375 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 5376 5377 SRARI_W2_SW(hz_res0, hz_res1, 10); 5378 SAT_SW2_SW(hz_res0, hz_res1, 7); 5379 SRARI_W2_SW(hz_res2, hz_res3, 10); 5380 SAT_SW2_SW(hz_res2, hz_res3, 7); 5381 5382 dst0 = __msa_srari_h(shf_vec2, 5); 5383 dst1 = __msa_srari_h(shf_vec5, 5); 5384 dst2 = __msa_srari_h(shf_vec6, 5); 5385 dst3 = __msa_srari_h(shf_vec7, 5); 5386 5387 SAT_SH2_SH(dst0, dst1, 7); 5388 SAT_SH2_SH(dst2, dst3, 7); 5389 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1); 5390 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3); 5391 5392 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 5393 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 5394 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 5395 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 5396 5397 LW4(dst, stride, tp0, tp1, tp2, tp3); 5398 INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv); 5399 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 5400 out = PCKEV_XORI128_UB(dst0, dst2); 5401 out = __msa_aver_u_b(out, dstv); 5402 ST_W4(out, 0, 1, 2, 3, dst, stride); 5403} 5404 5405void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, 5406 ptrdiff_t stride) 5407{ 5408 uint32_t tp0, tp1, tp2, tp3; 5409 const int16_t filt_const0 = 0xfb01; 5410 const int16_t filt_const1 = 0x1414; 5411 const int16_t filt_const2 = 0x1fb; 5412 v16u8 out, dstv = { 0 }; 5413 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 5414 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 5415 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l; 5416 v16i8 src76_l, src87_l, filt0, filt1, filt2; 5417 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 5418 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5419 v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5420 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5421 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5422 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5423 v8i16 minus5h = __msa_ldi_h(-5); 5424 v8i16 plus20h = __msa_ldi_h(20); 5425 v8i16 zeros = { 0 }; 5426 5427 filt0 = (v16i8) __msa_fill_h(filt_const0); 5428 filt1 = (v16i8) __msa_fill_h(filt_const1); 5429 filt2 = (v16i8) __msa_fill_h(filt_const2); 5430 5431 src -= ((2 * stride) + 2); 5432 5433 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5434 src += (5 * stride); 5435 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5436 LD_SB4(src, stride, src5, src6, src7, src8); 5437 XORI_B4_128_SB(src5, src6, src7, src8); 5438 5439 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 5440 src32_r, src43_r); 5441 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 5442 src76_r, src87_r); 5443 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 5444 src32_l, src43_l); 5445 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 5446 src76_l, src87_l); 5447 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 5448 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 5449 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 5450 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 5451 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5452 mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5453 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5454 mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5455 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5456 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5457 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5458 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5459 5460 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 5461 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 5462 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 5463 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 5464 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5465 mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 5466 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5467 mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 5468 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5469 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 5470 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5471 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 5472 5473 SRARI_W2_SW(hz_res0, hz_res1, 10); 5474 SAT_SW2_SW(hz_res0, hz_res1, 7); 5475 SRARI_W2_SW(hz_res2, hz_res3, 10); 5476 SAT_SW2_SW(hz_res2, hz_res3, 7); 5477 5478 dst0 = __msa_srari_h(shf_vec2, 5); 5479 dst1 = __msa_srari_h(shf_vec5, 5); 5480 dst2 = __msa_srari_h(shf_vec6, 5); 5481 dst3 = __msa_srari_h(shf_vec7, 5); 5482 5483 SAT_SH2_SH(dst0, dst1, 7); 5484 SAT_SH2_SH(dst2, dst3, 7); 5485 5486 dst0 = __msa_ilvod_h(zeros, dst0); 5487 dst1 = __msa_ilvod_h(zeros, dst1); 5488 dst2 = __msa_ilvod_h(zeros, dst2); 5489 dst3 = __msa_ilvod_h(zeros, dst3); 5490 5491 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 5492 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 5493 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 5494 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 5495 5496 LW4(dst, stride, tp0, tp1, tp2, tp3); 5497 INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv); 5498 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 5499 out = PCKEV_XORI128_UB(dst0, dst2); 5500 out = __msa_aver_u_b(out, dstv); 5501 ST_W4(out, 0, 1, 2, 3, dst, stride); 5502} 5503 5504void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, 5505 ptrdiff_t stride) 5506{ 5507 const int32_t filt_const0 = 0xfffb0001; 5508 const int32_t filt_const1 = 0x140014; 5509 const int32_t filt_const2 = 0x1fffb; 5510 const uint8_t *src_tmp = src - (2 * stride) - 2; 5511 uint8_t *dst_tmp = dst; 5512 uint64_t tp0, tp1, tp2, tp3; 5513 uint32_t multiple8_cnt, loop_cnt; 5514 v16u8 dst0, dst1, out0, out1; 5515 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2; 5516 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 5517 v8i16 hz_out7, hz_out8, res0, res1, res2, res3; 5518 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 5519 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 5520 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 5521 v8i16 hz_out87_l, filt0, filt1, filt2; 5522 v4i32 tmp0, tmp1; 5523 5524 filt0 = (v8i16) __msa_fill_w(filt_const0); 5525 filt1 = (v8i16) __msa_fill_w(filt_const1); 5526 filt2 = (v8i16) __msa_fill_w(filt_const2); 5527 5528 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 5529 5530 for (multiple8_cnt = 2; multiple8_cnt--;) { 5531 src = src_tmp; 5532 dst = dst_tmp; 5533 5534 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5535 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5536 src += (5 * stride); 5537 5538 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5539 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5540 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5541 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5542 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 5543 5544 for (loop_cnt = 4; loop_cnt--;) { 5545 LD_SB4(src, stride, src0, src1, src2, src3); 5546 XORI_B4_128_SB(src0, src1, src2, src3); 5547 src += (4 * stride); 5548 5549 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5550 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5551 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5552 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5553 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 5554 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 5555 hz_out43_r); 5556 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 5557 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 5558 hz_out43_l); 5559 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 5560 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 5561 hz_out87_r); 5562 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 5563 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 5564 hz_out87_l); 5565 5566 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 5567 filt1, filt2); 5568 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 5569 filt1, filt2); 5570 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5571 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 5572 filt1, filt2); 5573 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 5574 filt1, filt2); 5575 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5576 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 5577 filt1, filt2); 5578 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 5579 filt1, filt2); 5580 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5581 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 5582 filt1, filt2); 5583 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 5584 filt1, filt2); 5585 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5586 5587 LD4(dst, stride, tp0, tp1, tp2, tp3); 5588 INSERT_D2_UB(tp0, tp1, dst0); 5589 INSERT_D2_UB(tp2, tp3, dst1); 5590 out0 = PCKEV_XORI128_UB(res0, res1); 5591 out1 = PCKEV_XORI128_UB(res2, res3); 5592 AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 5593 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 5594 dst += (4 * stride); 5595 5596 hz_out0 = hz_out4; 5597 hz_out1 = hz_out5; 5598 hz_out2 = hz_out6; 5599 hz_out3 = hz_out7; 5600 hz_out4 = hz_out8; 5601 } 5602 5603 src_tmp += 8; 5604 dst_tmp += 8; 5605 } 5606} 5607 5608void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, 5609 ptrdiff_t stride) 5610{ 5611 const int32_t filt_const0 = 0xfffb0001; 5612 const int32_t filt_const1 = 0x140014; 5613 const int32_t filt_const2 = 0x1fffb; 5614 uint64_t tp0, tp1, tp2, tp3; 5615 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 5616 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2; 5617 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 5618 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 5619 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 5620 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 5621 v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3; 5622 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 5623 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 5624 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 5625 v4i32 tmp0, tmp1; 5626 5627 filt0 = (v8i16) __msa_fill_w(filt_const0); 5628 filt1 = (v8i16) __msa_fill_w(filt_const1); 5629 filt2 = (v8i16) __msa_fill_w(filt_const2); 5630 5631 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 5632 5633 src -= ((2 * stride) + 2); 5634 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5635 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5636 src += (5 * stride); 5637 5638 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5639 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5640 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5641 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5642 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 5643 5644 LD_SB4(src, stride, src0, src1, src2, src3); 5645 XORI_B4_128_SB(src0, src1, src2, src3); 5646 src += (4 * stride); 5647 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5648 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5649 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5650 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5651 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 5652 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 5653 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 5654 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 5655 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 5656 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 5657 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 5658 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 5659 5660 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 5661 filt2); 5662 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 5663 filt2); 5664 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5665 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 5666 filt2); 5667 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 5668 filt2); 5669 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5670 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 5671 filt2); 5672 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 5673 filt2); 5674 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5675 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 5676 filt2); 5677 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 5678 filt2); 5679 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5680 LD4(dst, stride, tp0, tp1, tp2, tp3); 5681 INSERT_D2_UB(tp0, tp1, dst0); 5682 INSERT_D2_UB(tp2, tp3, dst1); 5683 out0 = PCKEV_XORI128_UB(res0, res1); 5684 out1 = PCKEV_XORI128_UB(res2, res3); 5685 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 5686 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 5687 dst += (4 * stride); 5688 5689 LD_SB4(src, stride, src0, src1, src2, src3); 5690 XORI_B4_128_SB(src0, src1, src2, src3); 5691 hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5692 hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5693 hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5694 hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5695 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 5696 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 5697 hz_out1211_r); 5698 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 5699 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 5700 hz_out1211_l); 5701 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 5702 filt2); 5703 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 5704 filt2); 5705 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5706 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 5707 filt2); 5708 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 5709 filt2); 5710 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5711 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 5712 filt2); 5713 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 5714 filt2); 5715 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5716 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 5717 filt2); 5718 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 5719 filt2); 5720 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5721 LD4(dst, stride, tp0, tp1, tp2, tp3); 5722 INSERT_D2_UB(tp0, tp1, dst0); 5723 INSERT_D2_UB(tp2, tp3, dst1); 5724 out0 = PCKEV_XORI128_UB(res0, res1); 5725 out1 = PCKEV_XORI128_UB(res2, res3); 5726 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 5727 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 5728} 5729 5730void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, 5731 ptrdiff_t stride) 5732{ 5733 const int32_t filt_const0 = 0xfffb0001; 5734 const int32_t filt_const1 = 0x140014; 5735 const int32_t filt_const2 = 0x1fffb; 5736 uint32_t tp0, tp1, tp2, tp3; 5737 v16u8 res, dst0 = { 0 }; 5738 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 5739 v16i8 mask0, mask1, mask2; 5740 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 5741 v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2; 5742 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 5743 v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 5744 v4i32 tmp0, tmp1; 5745 5746 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 5747 5748 filt0 = (v8i16) __msa_fill_w(filt_const0); 5749 filt1 = (v8i16) __msa_fill_w(filt_const1); 5750 filt2 = (v8i16) __msa_fill_w(filt_const2); 5751 5752 src -= ((2 * stride) + 2); 5753 5754 LD_SB5(src, stride, src0, src1, src2, src3, src4); 5755 src += (5 * stride); 5756 LD_SB4(src, stride, src5, src6, src7, src8); 5757 5758 XORI_B5_128_SB(src0, src1, src2, src3, src4); 5759 XORI_B4_128_SB(src5, src6, src7, src8); 5760 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 5761 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 5762 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 5763 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 5764 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 5765 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 5766 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 5767 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 5768 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 5769 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 5770 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 5771 5772 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 5773 filt2); 5774 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 5775 filt2); 5776 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5777 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 5778 filt2); 5779 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 5780 filt2); 5781 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5782 LW4(dst, stride, tp0, tp1, tp2, tp3); 5783 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 5784 res = PCKEV_XORI128_UB(res0, res1); 5785 res = __msa_aver_u_b(res, dst0); 5786 ST_W4(res, 0, 1, 2, 3, dst, stride); 5787} 5788