/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "qpeldsp_mips.h"

/*
 * Horizontal qpel lowpass macros.  For an interior pixel the shuffle masks
 * together with the coefficient vectors (20, 6, 3) implement the 8-tap
 * MPEG-4 qpel filter
 *     out[i] = (20 * (s[i] + s[i+1]) - 6 * (s[i-1] + s[i+2])
 *               + 3 * (s[i-2] + s[i+3]) - (s[i-3] + s[i+4]) + 16) >> 5
 * clipped to [0, 255]; taps that fall outside the block are mirrored.
 */
#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \
( { \
    v16u8 out, tmp0, tmp1; \
    v16u8 data0, data1, data2, data3, data4, data5; \
    v8i16 res_r, res_l; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
    \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
    sum0_r *= (v8u16) (coef0); \
    sum0_l *= (v8u16) (coef0); \
    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
    res_r = (v8i16) (sum0_r - sum3_r); \
    res_l = (v8i16) (sum0_l - sum3_l); \
    SRARI_H2_SH(res_r, res_l, 5); \
    CLIP_SH2_0_255(res_r, res_l); \
    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
    \
    out; \
} )

#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \
                                      mask0, mask1, mask2, mask3, \
                                      coef0, coef1, coef2) \
( { \
    v16u8 out; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
    v8i16 res0_r, res1_r; \
    \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
    res0_r = (v8i16) (sum0_r - sum3_r); \
    res1_r = (v8i16) (sum4_r - sum7_r); \
    SRARI_H2_SH(res0_r, res1_r, 5); \
    CLIP_SH2_0_255(res0_r, res1_r); \
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
    \
    out; \
} )

#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \
                                           mask0, mask1, mask2, mask3, \
                                           coef0, coef1, coef2) \
( { \
    v16u8 out; \
    v8i16 res0_r; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
    res0_r = (v8i16) (sum0_r - sum3_r); \
    res0_r = __msa_srari_h(res0_r, 5); \
    CLIP_SH_0_255(res0_r); \
    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
    \
    out; \
} )
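
/*
 * NO_ROUND variants: identical filters, but biased by 15 instead of 16
 * before the >> 5.  They back the no_rnd motion-compensation functions
 * below, which also use the truncating average __msa_ave_u_b() instead of
 * __msa_aver_u_b().
 */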
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \
                                                    mask2, mask3, coef0, \
                                                    coef1, coef2) \
( { \
    v16u8 out; \
    v8i16 res0_r; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
    res0_r = (v8i16) (sum0_r - sum3_r); \
    res0_r += 15; \
    res0_r >>= 5; \
    CLIP_SH_0_255(res0_r); \
    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
    \
    out; \
} )

#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \
                                         coef0, coef1, coef2) \
( { \
    v16u8 out, tmp0, tmp1; \
    v16u8 data0, data1, data2, data3, data4, data5; \
    v8i16 res_r, res_l; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
    \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
    sum0_r *= (v8u16) (coef0); \
    sum0_l *= (v8u16) (coef0); \
    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
    res_r = (v8i16) (sum0_r - sum3_r); \
    res_l = (v8i16) (sum0_l - sum3_l); \
    res_r += 15; \
    res_l += 15; \
    res_r >>= 5; \
    res_l >>= 5; \
    CLIP_SH2_0_255(res_r, res_l); \
    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
    \
    out; \
} )

#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \
                                               mask0, mask1, mask2, mask3, \
                                               coef0, coef1, coef2) \
( { \
    v16u8 out; \
    v8i16 res0_r, res1_r; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
    \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
    res0_r = (v8i16) (sum0_r - sum3_r); \
    res1_r = (v8i16) (sum4_r - sum7_r); \
    res0_r += 15; \
    res1_r += 15; \
    res0_r >>= 5; \
    res1_r >>= 5; \
    CLIP_SH2_0_255(res0_r, res1_r); \
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
    \
    out; \
} )

/*
 * Vertical qpel lowpass: the eight row arguments feed one output row as
 *     out = 20 * (inp0 + inp4) - 6 * (inp1 + inp5)
 *           + 3 * (inp2 + inp6) - (inp3 + inp7)
 * rounded (+16, >> 5) and clipped; callers pass the rows already mirrored
 * at the block borders.
 */
#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \
                               inp4, inp5, inp6, inp7, \
                               coef0, coef1, coef2) \
( { \
    v16u8 res; \
    v8i16 res_r, res_l; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
    \
    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
    res_r = (v8i16) (sum0_r - sum3_r); \
    res_l = (v8i16) (sum0_l - sum3_l); \
    SRARI_H2_SH(res_r, res_l, 5); \
    CLIP_SH2_0_255(res_r, res_l); \
    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
    \
    res; \
} )

#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
                                     inp04, inp05, inp06, inp07, \
                                     inp10, inp11, inp12, inp13, \
                                     inp14, inp15, inp16, inp17, \
                                     coef0, coef1, coef2) \
( { \
    v16u8 res; \
    v8i16 val0, val1; \
    v8u16 sum00, sum01, sum02, sum03; \
    v8u16 sum10, sum11, sum12, sum13; \
    \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
               sum00, sum10, sum03, sum13); \
    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
    HADD_UB2_UH(sum03, sum13, sum03, sum13); \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
               sum02, sum12, sum01, sum11); \
    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
    val0 = (v8i16) (sum00 - sum03); \
    val1 = (v8i16) (sum10 - sum13); \
    SRARI_H2_SH(val0, val1, 5); \
    CLIP_SH2_0_255(val0, val1); \
    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
    \
    res; \
} )

#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \
                                        inp4, inp5, inp6, inp7, \
                                        coef0, coef1, coef2) \
( { \
    v16u8 res; \
    v8i16 res_r, res_l; \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
    \
    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
    res_r = (v8i16) (sum0_r - sum3_r); \
    res_l = (v8i16) (sum0_l - sum3_l); \
    res_r += 15; \
    res_l += 15; \
    res_r >>= 5; \
    res_l >>= 5; \
    CLIP_SH2_0_255(res_r, res_l); \
    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
    \
    res; \
} )
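
/*
 * Two-row vertical no-rounding variant: the inp0x arguments produce the
 * 8-pixel row packed in the low doubleword of the result, the inp1x
 * arguments the row in the high doubleword, with the same +15 bias before
 * the >> 5 as the other no-round macros.
 */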
#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
                                              inp04, inp05, inp06, inp07, \
                                              inp10, inp11, inp12, inp13, \
                                              inp14, inp15, inp16, inp17, \
                                              coef0, coef1, coef2) \
( { \
    v16u8 res; \
    v8i16 val0, val1; \
    v8u16 sum00, sum01, sum02, sum03; \
    v8u16 sum10, sum11, sum12, sum13; \
    \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
               sum00, sum10, sum03, sum13); \
    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
    HADD_UB2_UH(sum03, sum13, sum03, sum13); \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
               sum02, sum12, sum01, sum11); \
    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
    val0 = (v8i16) (sum00 - sum03); \
    val1 = (v8i16) (sum10 - sum13); \
    val0 += 15; \
    val1 += 15; \
    val0 >>= 5; \
    val1 >>= 5; \
    CLIP_SH2_0_255(val0, val1); \
    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
    \
    res; \
} )

/*
 * Horizontal motion-compensation helpers.  Naming convention:
 *   aver_src0 - average the filtered row with the source pixels at src
 *   aver_src1 - average the filtered row with the source pixels at src + 1
 *   avg_dst   - additionally average with the bytes already present in dst
 *   no_rnd    - no-rounding variants (+15 bias, __msa_ave_u_b() averaging)
 */
static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride,
                                                int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp0, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp2, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp4, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp6, res);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}
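
/* Plain horizontal "put": filtered rows are stored without any source or
 * destination averaging. */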
static void horiz_mc_qpel_8width_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_16width_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v8u16 const20 = (v8u16) __msa_ldi_h(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}

static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
                   inp0, inp1, inp2, inp3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
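
/* 16-pixel-wide aver_src1: the averaging operand is the unaligned load at
 * src + 1 (inp1/inp3/inp5/inp7). */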
static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride,
                                                int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v8u16 const20 = (v8u16) __msa_ldi_h(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp1);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp5);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp7);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}

/* no_rnd variants: same structure as above, built on the NO_ROUND filter
 * macros and the truncating average __msa_ave_u_b(). */
static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride,
                                                      int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        res0 = __msa_ave_u_b(inp0, res0);
        res1 = __msa_ave_u_b(inp2, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v8u16 const20 = (v8u16) __msa_ldi_h(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp0, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp2, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp4, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp6, res);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}

static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
                                            int32_t src_stride,
                                            uint8_t *dst,
                                            int32_t dst_stride,
                                            int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}

static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride,
                                                      int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
                   inp0, inp1, inp2, inp3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        res0 = __msa_ave_u_b(inp0, res0);
        res1 = __msa_ave_u_b(inp2, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp1);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp5);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp7);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}

/* avg_dst variants: the (source-averaged) filter output is additionally
 * averaged with the bytes already present in dst. */
static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
                                                        int32_t src_stride,
                                                        uint8_t *dst,
                                                        int32_t dst_stride,
                                                        int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res0, res1;
    v16u8 dst0, dst1;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);

        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res0, res1;
    v16u8 dst0, dst1;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);

        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 res0, res1;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
                   inp0, inp1, inp2, inp3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
                                                        int32_t src_stride,
                                                        uint8_t *dst,
                                                        int32_t dst_stride,
                                                        int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res0, res1, dst0, dst1;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
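
/*
 * Vertical motion-compensation helpers: the same 8-tap filter applied
 * across rows, with rows beyond the block borders mirrored by repeating
 * arguments (inp0 at the top, inp16/inp15 at the bottom).  aver_src0
 * averages with the current source row, aver_src1 with the row below.
 */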
static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 tmp0, tmp1, res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp0);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp8, inp9);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp10, inp11);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp12, inp13);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp14, inp15);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp13);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp14);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp15);
    ST_UB(res0, dst);
}
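
/* Plain vertical "put" for an 8x8 block: each filter invocation yields two
 * output rows, stored without averaging. */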
static void vert_mc_qpel_8x8_msa(const uint8_t *src,
                                 int32_t src_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void vert_mc_qpel_16x16_msa(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    inp4 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;
}

static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 tmp0, tmp1, res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);

    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    inp4 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp13);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp14);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp15);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp16);
    ST_UB(res0, dst);
}

/* Vertical no_rnd variants: NO_ROUND filter macros plus the truncating
 * average __msa_ave_u_b(). */
static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 tmp0, tmp1, res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                                 inp1, inp2, inp3, inp4,
                                                 inp1, inp0, inp0, inp1,
                                                 inp2, inp3, inp4, inp5,
                                                 const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                                 inp3, inp4, inp5, inp6,
                                                 inp3, inp2, inp1, inp0,
                                                 inp4, inp5, inp6, inp7,
                                                 const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    res0 = __msa_ave_u_b(res0, tmp0);
    res1 = __msa_ave_u_b(res1, tmp1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                                 inp5, inp6, inp7, inp8,
                                                 inp5, inp4, inp3, inp2,
                                                 inp6, inp7, inp8, inp8,
                                                 const20, const6, const3);
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                                 inp7, inp8, inp8, inp7,
                                                 inp7, inp6, inp5, inp4,
                                                 inp8, inp8, inp7, inp6,
                                                 const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
    res0 = __msa_ave_u_b(res0, tmp0);
    res1 = __msa_ave_u_b(res1, tmp1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
                                           inp1, inp2, inp3, inp4,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp0);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
                                           inp2, inp3, inp4, inp5,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
                                           inp3, inp4, inp5, inp6,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
                                           inp4, inp5, inp6, inp7,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
                                           inp5, inp6, inp7, inp8,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
                                           inp6, inp7, inp8, inp9,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
                                           inp7, inp8, inp9, inp10,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
                                           inp8, inp9, inp10, inp11,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
                                           inp9, inp10, inp11, inp12,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
                                           inp10, inp11, inp12, inp13,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
                                           inp11, inp12, inp13, inp14,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
                                           inp12, inp13, inp14, inp15,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
                                           inp13, inp14, inp15, inp16,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
                                           inp14, inp15, inp16, inp16,
                                           const20, const6, const3);
const3); 1756 res0 = __msa_ave_u_b(res0, inp13); 1757 ST_UB(res0, dst); 1758 dst += dst_stride; 1759 1760 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, 1761 inp15, inp16, inp16, inp15, 1762 const20, const6, const3); 1763 res0 = __msa_ave_u_b(res0, inp14); 1764 ST_UB(res0, dst); 1765 dst += dst_stride; 1766 1767 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, 1768 inp16, inp16, inp15, inp14, 1769 const20, const6, const3); 1770 res0 = __msa_ave_u_b(res0, inp15); 1771 ST_UB(res0, dst); 1772 dst += dst_stride; 1773} 1774 1775static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, 1776 int32_t src_stride, 1777 uint8_t *dst, 1778 int32_t dst_stride) 1779{ 1780 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1781 v16u8 res0, res1; 1782 v16u8 const20 = (v16u8) __msa_ldi_b(20); 1783 v16u8 const6 = (v16u8) __msa_ldi_b(6); 1784 v16u8 const3 = (v16u8) __msa_ldi_b(3); 1785 1786 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1787 src += (4 * src_stride); 1788 LD_UB2(src, src_stride, inp4, inp5); 1789 src += (2 * src_stride); 1790 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1791 inp1, inp2, inp3, inp4, 1792 inp1, inp0, inp0, inp1, 1793 inp2, inp3, inp4, inp5, 1794 const20, const6, const3); 1795 LD_UB2(src, src_stride, inp6, inp7); 1796 src += (2 * src_stride); 1797 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1798 inp3, inp4, inp5, inp6, 1799 inp3, inp2, inp1, inp0, 1800 inp4, inp5, inp6, inp7, 1801 const20, const6, const3); 1802 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1803 1804 inp8 = LD_UB(src); 1805 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, 1806 inp5, inp6, inp7, inp8, 1807 inp5, inp4, inp3, inp2, 1808 inp6, inp7, inp8, inp8, 1809 const20, const6, const3); 1810 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1811 inp7, inp8, inp8, inp7, 1812 inp7, inp6, inp5, inp4, 1813 inp8, inp8, inp7, inp6, 1814 const20, const6, const3); 1815 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1816} 1817 1818static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, 1819 int32_t src_stride, 1820 uint8_t *dst, 1821 int32_t dst_stride) 1822{ 1823 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1824 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 1825 v16u8 res0; 1826 v16u8 const20 = (v16u8) __msa_ldi_b(20); 1827 v16u8 const6 = (v16u8) __msa_ldi_b(6); 1828 v16u8 const3 = (v16u8) __msa_ldi_b(3); 1829 1830 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 1831 src += (5 * src_stride); 1832 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, 1833 inp1, inp2, inp3, inp4, 1834 const20, const6, const3); 1835 ST_UB(res0, dst); 1836 dst += dst_stride; 1837 1838 inp5 = LD_UB(src); 1839 src += src_stride; 1840 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, 1841 inp2, inp3, inp4, inp5, 1842 const20, const6, const3); 1843 ST_UB(res0, dst); 1844 dst += dst_stride; 1845 1846 inp6 = LD_UB(src); 1847 src += src_stride; 1848 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, 1849 inp3, inp4, inp5, inp6, 1850 const20, const6, const3); 1851 ST_UB(res0, dst); 1852 dst += dst_stride; 1853 1854 inp7 = LD_UB(src); 1855 src += src_stride; 1856 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, 1857 inp4, inp5, inp6, inp7, 1858 const20, const6, const3); 1859 ST_UB(res0, dst); 1860 dst += dst_stride; 1861 1862 inp8 = LD_UB(src); 1863 src += src_stride; 1864 res0 = 
APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, 1865 inp5, inp6, inp7, inp8, 1866 const20, const6, const3); 1867 ST_UB(res0, dst); 1868 dst += dst_stride; 1869 1870 inp9 = LD_UB(src); 1871 src += src_stride; 1872 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, 1873 inp6, inp7, inp8, inp9, 1874 const20, const6, const3); 1875 ST_UB(res0, dst); 1876 dst += dst_stride; 1877 1878 inp10 = LD_UB(src); 1879 src += src_stride; 1880 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, 1881 inp7, inp8, inp9, inp10, 1882 const20, const6, const3); 1883 ST_UB(res0, dst); 1884 dst += dst_stride; 1885 1886 inp11 = LD_UB(src); 1887 src += src_stride; 1888 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, 1889 inp8, inp9, inp10, inp11, 1890 const20, const6, const3); 1891 ST_UB(res0, dst); 1892 dst += dst_stride; 1893 1894 inp12 = LD_UB(src); 1895 src += src_stride; 1896 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, 1897 inp9, inp10, inp11, inp12, 1898 const20, const6, const3); 1899 ST_UB(res0, dst); 1900 dst += dst_stride; 1901 1902 inp13 = LD_UB(src); 1903 src += src_stride; 1904 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, 1905 inp10, inp11, inp12, inp13, 1906 const20, const6, const3); 1907 ST_UB(res0, dst); 1908 dst += dst_stride; 1909 1910 inp14 = LD_UB(src); 1911 src += src_stride; 1912 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, 1913 inp11, inp12, inp13, inp14, 1914 const20, const6, const3); 1915 ST_UB(res0, dst); 1916 dst += dst_stride; 1917 1918 inp15 = LD_UB(src); 1919 src += src_stride; 1920 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, 1921 inp12, inp13, inp14, inp15, 1922 const20, const6, const3); 1923 ST_UB(res0, dst); 1924 dst += dst_stride; 1925 1926 inp16 = LD_UB(src); 1927 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, 1928 inp13, inp14, inp15, inp16, 1929 const20, const6, const3); 1930 ST_UB(res0, dst); 1931 dst += dst_stride; 1932 1933 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, 1934 inp14, inp15, inp16, inp16, 1935 const20, const6, const3); 1936 ST_UB(res0, dst); 1937 dst += dst_stride; 1938 1939 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, 1940 inp15, inp16, inp16, inp15, 1941 const20, const6, const3); 1942 ST_UB(res0, dst); 1943 dst += dst_stride; 1944 1945 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, 1946 inp16, inp16, inp15, inp14, 1947 const20, const6, const3); 1948 ST_UB(res0, dst); 1949} 1950 1951static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src, 1952 int32_t src_stride, 1953 uint8_t *dst, 1954 int32_t dst_stride) 1955{ 1956 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1957 v16u8 tmp0, tmp1, res0, res1; 1958 v16u8 const20 = (v16u8) __msa_ldi_b(20); 1959 v16u8 const6 = (v16u8) __msa_ldi_b(6); 1960 v16u8 const3 = (v16u8) __msa_ldi_b(3); 1961 1962 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1963 src += (4 * src_stride); 1964 LD_UB2(src, src_stride, inp4, inp5); 1965 src += (2 * src_stride); 1966 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1967 inp1, inp2, inp3, inp4, 1968 inp1, inp0, inp0, inp1, 1969 inp2, inp3, inp4, inp5, 1970 const20, const6, const3); 1971 LD_UB2(src, src_stride, inp6, inp7); 1972 src += (2 * src_stride); 1973 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1974 inp3, inp4, inp5, inp6, 1975 inp3, inp2, inp1, inp0, 1976 inp4, inp5, inp6, inp7, 1977 const20, const6, const3); 1978 
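    /* "no_rnd" variant: the filtered rows are combined with source rows 1-4
       using __msa_ave_u_b() (truncating average) rather than the rounding
       __msa_aver_u_b() used by the rounded functions. */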
tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); 1979 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); 1980 res0 = __msa_ave_u_b(res0, tmp0); 1981 res1 = __msa_ave_u_b(res1, tmp1); 1982 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1983 1984 inp8 = LD_UB(src); 1985 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, 1986 inp5, inp6, inp7, inp8, 1987 inp5, inp4, inp3, inp2, 1988 inp6, inp7, inp8, inp8, 1989 const20, const6, const3); 1990 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1991 inp7, inp8, inp8, inp7, 1992 inp7, inp6, inp5, inp4, 1993 inp8, inp8, inp7, inp6, 1994 const20, const6, const3); 1995 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); 1996 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); 1997 res0 = __msa_ave_u_b(res0, tmp0); 1998 res1 = __msa_ave_u_b(res1, tmp1); 1999 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 2000} 2001 2002static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src, 2003 int32_t src_stride, 2004 uint8_t *dst, 2005 int32_t dst_stride) 2006{ 2007 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2008 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2009 v16u8 res0; 2010 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2011 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2012 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2013 2014 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2015 src += (5 * src_stride); 2016 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, 2017 inp1, inp2, inp3, inp4, 2018 const20, const6, const3); 2019 res0 = __msa_ave_u_b(res0, inp1); 2020 ST_UB(res0, dst); 2021 dst += dst_stride; 2022 2023 inp5 = LD_UB(src); 2024 src += src_stride; 2025 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, 2026 inp2, inp3, inp4, inp5, 2027 const20, const6, const3); 2028 res0 = __msa_ave_u_b(res0, inp2); 2029 ST_UB(res0, dst); 2030 dst += dst_stride; 2031 2032 inp6 = LD_UB(src); 2033 src += src_stride; 2034 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, 2035 inp3, inp4, inp5, inp6, 2036 const20, const6, const3); 2037 res0 = __msa_ave_u_b(res0, inp3); 2038 ST_UB(res0, dst); 2039 dst += dst_stride; 2040 2041 inp7 = LD_UB(src); 2042 src += src_stride; 2043 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, 2044 inp4, inp5, inp6, inp7, 2045 const20, const6, const3); 2046 res0 = __msa_ave_u_b(res0, inp4); 2047 ST_UB(res0, dst); 2048 dst += dst_stride; 2049 2050 inp8 = LD_UB(src); 2051 src += src_stride; 2052 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, 2053 inp5, inp6, inp7, inp8, 2054 const20, const6, const3); 2055 res0 = __msa_ave_u_b(res0, inp5); 2056 ST_UB(res0, dst); 2057 dst += dst_stride; 2058 2059 inp9 = LD_UB(src); 2060 src += src_stride; 2061 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, 2062 inp6, inp7, inp8, inp9, 2063 const20, const6, const3); 2064 res0 = __msa_ave_u_b(res0, inp6); 2065 ST_UB(res0, dst); 2066 dst += dst_stride; 2067 2068 inp10 = LD_UB(src); 2069 src += src_stride; 2070 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, 2071 inp7, inp8, inp9, inp10, 2072 const20, const6, const3); 2073 res0 = __msa_ave_u_b(res0, inp7); 2074 ST_UB(res0, dst); 2075 dst += dst_stride; 2076 2077 inp11 = LD_UB(src); 2078 src += src_stride; 2079 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, 2080 inp8, inp9, inp10, inp11, 2081 const20, const6, const3); 2082 res0 = __msa_ave_u_b(res0, inp8); 2083 ST_UB(res0, dst); 
2084 dst += dst_stride; 2085 2086 inp12 = LD_UB(src); 2087 src += src_stride; 2088 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, 2089 inp9, inp10, inp11, inp12, 2090 const20, const6, const3); 2091 res0 = __msa_ave_u_b(res0, inp9); 2092 ST_UB(res0, dst); 2093 dst += dst_stride; 2094 2095 inp13 = LD_UB(src); 2096 src += src_stride; 2097 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, 2098 inp10, inp11, inp12, inp13, 2099 const20, const6, const3); 2100 res0 = __msa_ave_u_b(res0, inp10); 2101 ST_UB(res0, dst); 2102 dst += dst_stride; 2103 2104 inp14 = LD_UB(src); 2105 src += src_stride; 2106 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, 2107 inp11, inp12, inp13, inp14, 2108 const20, const6, const3); 2109 res0 = __msa_ave_u_b(res0, inp11); 2110 ST_UB(res0, dst); 2111 dst += dst_stride; 2112 2113 inp15 = LD_UB(src); 2114 src += src_stride; 2115 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, 2116 inp12, inp13, inp14, inp15, 2117 const20, const6, const3); 2118 res0 = __msa_ave_u_b(res0, inp12); 2119 ST_UB(res0, dst); 2120 dst += dst_stride; 2121 2122 inp16 = LD_UB(src); 2123 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, 2124 inp13, inp14, inp15, inp16, 2125 const20, const6, const3); 2126 res0 = __msa_ave_u_b(res0, inp13); 2127 ST_UB(res0, dst); 2128 dst += dst_stride; 2129 2130 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, 2131 inp14, inp15, inp16, inp16, 2132 const20, const6, const3); 2133 res0 = __msa_ave_u_b(res0, inp14); 2134 ST_UB(res0, dst); 2135 dst += dst_stride; 2136 2137 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, 2138 inp15, inp16, inp16, inp15, 2139 const20, const6, const3); 2140 res0 = __msa_ave_u_b(res0, inp15); 2141 ST_UB(res0, dst); 2142 dst += dst_stride; 2143 2144 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, 2145 inp16, inp16, inp15, inp14, 2146 const20, const6, const3); 2147 res0 = __msa_ave_u_b(res0, inp16); 2148 ST_UB(res0, dst); 2149} 2150 2151static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src, 2152 int32_t src_stride, 2153 uint8_t *dst, 2154 int32_t dst_stride) 2155{ 2156 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2157 v16u8 dst0, dst1, dst2, dst3; 2158 v16u8 tmp0, tmp1, res0, res1; 2159 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2160 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2161 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2162 2163 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 2164 src += (4 * src_stride); 2165 LD_UB2(src, src_stride, inp4, inp5); 2166 src += (2 * src_stride); 2167 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 2168 inp1, inp2, inp3, inp4, 2169 inp1, inp0, inp0, inp1, 2170 inp2, inp3, inp4, inp5, 2171 const20, const6, const3); 2172 2173 LD_UB2(src, src_stride, inp6, inp7); 2174 src += (2 * src_stride); 2175 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 2176 inp3, inp4, inp5, inp6, 2177 inp3, inp2, inp1, inp0, 2178 inp4, inp5, inp6, inp7, 2179 const20, const6, const3); 2180 2181 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2182 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 2183 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 2184 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2185 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2186 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2187 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2188 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2189 
dst += (4 * dst_stride); 2190 2191 inp8 = LD_UB(src); 2192 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 2193 inp5, inp6, inp7, inp8, 2194 inp5, inp4, inp3, inp2, 2195 inp6, inp7, inp8, inp8, 2196 const20, const6, const3); 2197 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 2198 inp7, inp8, inp8, inp7, 2199 inp7, inp6, inp5, inp4, 2200 inp8, inp8, inp7, inp6, 2201 const20, const6, const3); 2202 2203 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2204 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); 2205 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); 2206 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2207 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2208 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2209 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2210 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2211} 2212 2213static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src, 2214 int32_t src_stride, 2215 uint8_t *dst, 2216 int32_t dst_stride) 2217{ 2218 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2219 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2220 v16u8 res0, res1, dst0, dst1; 2221 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2222 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2223 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2224 2225 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2226 src += (5 * src_stride); 2227 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 2228 inp1, inp2, inp3, inp4, 2229 const20, const6, const3); 2230 2231 inp5 = LD_UB(src); 2232 src += src_stride; 2233 res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 2234 inp2, inp3, inp4, inp5, 2235 const20, const6, const3); 2236 2237 LD_UB2(dst, dst_stride, dst0, dst1); 2238 AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1); 2239 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2240 ST_UB2(res0, res1, dst, dst_stride); 2241 dst += (2 * dst_stride); 2242 2243 inp6 = LD_UB(src); 2244 src += src_stride; 2245 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 2246 inp3, inp4, inp5, inp6, 2247 const20, const6, const3); 2248 2249 inp7 = LD_UB(src); 2250 src += src_stride; 2251 res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 2252 inp4, inp5, inp6, inp7, 2253 const20, const6, const3); 2254 2255 LD_UB2(dst, dst_stride, dst0, dst1); 2256 AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1); 2257 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2258 ST_UB2(res0, res1, dst, dst_stride); 2259 dst += (2 * dst_stride); 2260 2261 LD_UB2(src, src_stride, inp8, inp9); 2262 src += (2 * src_stride); 2263 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 2264 inp5, inp6, inp7, inp8, 2265 const20, const6, const3); 2266 res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 2267 inp6, inp7, inp8, inp9, 2268 const20, const6, const3); 2269 2270 LD_UB2(dst, dst_stride, dst0, dst1); 2271 AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1); 2272 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2273 ST_UB2(res0, res1, dst, dst_stride); 2274 dst += (2 * dst_stride); 2275 2276 LD_UB2(src, src_stride, inp10, inp11); 2277 src += (2 * src_stride); 2278 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 2279 inp7, inp8, inp9, inp10, 2280 const20, const6, const3); 2281 res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 2282 inp8, inp9, inp10, inp11, 2283 const20, const6, const3); 2284 2285 LD_UB2(dst, dst_stride, dst0, dst1); 2286 AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1); 2287 AVER_UB2_UB(res0, dst0, res1, dst1, 
res0, res1); 2288 ST_UB2(res0, res1, dst, dst_stride); 2289 dst += (2 * dst_stride); 2290 2291 LD_UB2(src, src_stride, inp12, inp13); 2292 src += (2 * src_stride); 2293 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 2294 inp9, inp10, inp11, inp12, 2295 const20, const6, const3); 2296 res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 2297 inp10, inp11, inp12, inp13, 2298 const20, const6, const3); 2299 LD_UB2(dst, dst_stride, dst0, dst1); 2300 AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1); 2301 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2302 ST_UB2(res0, res1, dst, dst_stride); 2303 dst += (2 * dst_stride); 2304 2305 LD_UB2(src, src_stride, inp14, inp15); 2306 src += (2 * src_stride); 2307 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 2308 inp11, inp12, inp13, inp14, 2309 const20, const6, const3); 2310 res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 2311 inp12, inp13, inp14, inp15, 2312 const20, const6, const3); 2313 2314 LD_UB2(dst, dst_stride, dst0, dst1); 2315 AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1); 2316 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2317 ST_UB2(res0, res1, dst, dst_stride); 2318 dst += (2 * dst_stride); 2319 2320 inp16 = LD_UB(src); 2321 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 2322 inp13, inp14, inp15, inp16, 2323 const20, const6, const3); 2324 res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 2325 inp14, inp15, inp16, inp16, 2326 const20, const6, const3); 2327 LD_UB2(dst, dst_stride, dst0, dst1); 2328 AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1); 2329 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2330 ST_UB2(res0, res1, dst, dst_stride); 2331 dst += (2 * dst_stride); 2332 2333 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 2334 inp15, inp16, inp16, inp15, 2335 const20, const6, const3); 2336 res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 2337 inp16, inp16, inp15, inp14, 2338 const20, const6, const3); 2339 LD_UB2(dst, dst_stride, dst0, dst1); 2340 AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1); 2341 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2342 ST_UB2(res0, res1, dst, dst_stride); 2343} 2344 2345static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, 2346 int32_t src_stride, 2347 uint8_t *dst, 2348 int32_t dst_stride) 2349{ 2350 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2351 v16u8 dst0, dst1, dst2, dst3; 2352 v16u8 res0, res1; 2353 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2354 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2355 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2356 2357 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 2358 src += (4 * src_stride); 2359 LD_UB2(src, src_stride, inp4, inp5); 2360 src += (2 * src_stride); 2361 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 2362 inp1, inp2, inp3, inp4, 2363 inp1, inp0, inp0, inp1, 2364 inp2, inp3, inp4, inp5, 2365 const20, const6, const3); 2366 LD_UB2(src, src_stride, inp6, inp7); 2367 src += (2 * src_stride); 2368 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 2369 inp3, inp4, inp5, inp6, 2370 inp3, inp2, inp1, inp0, 2371 inp4, inp5, inp6, inp7, 2372 const20, const6, const3); 2373 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2374 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2375 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2376 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2377 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2378 dst += (4 * dst_stride); 2379 2380 inp8 = LD_UB(src); 2381 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, 
inp3, inp2, inp1, 2382 inp5, inp6, inp7, inp8, 2383 inp5, inp4, inp3, inp2, 2384 inp6, inp7, inp8, inp8, 2385 const20, const6, const3); 2386 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 2387 inp7, inp8, inp8, inp7, 2388 inp7, inp6, inp5, inp4, 2389 inp8, inp8, inp7, inp6, 2390 const20, const6, const3); 2391 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2392 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2393 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2394 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2395 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2396} 2397 2398static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, 2399 int32_t src_stride, 2400 uint8_t *dst, 2401 int32_t dst_stride) 2402{ 2403 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2404 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2405 v16u8 res0, res1, dst0, dst1; 2406 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2407 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2408 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2409 2410 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2411 src += (5 * src_stride); 2412 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 2413 inp1, inp2, inp3, inp4, 2414 const20, const6, const3); 2415 inp5 = LD_UB(src); 2416 src += src_stride; 2417 res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 2418 inp2, inp3, inp4, inp5, 2419 const20, const6, const3); 2420 LD_UB2(dst, dst_stride, dst0, dst1); 2421 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2422 ST_UB2(res0, res1, dst, dst_stride); 2423 dst += (2 * dst_stride); 2424 2425 inp6 = LD_UB(src); 2426 src += src_stride; 2427 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 2428 inp3, inp4, inp5, inp6, 2429 const20, const6, const3); 2430 inp7 = LD_UB(src); 2431 src += src_stride; 2432 res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 2433 inp4, inp5, inp6, inp7, 2434 const20, const6, const3); 2435 LD_UB2(dst, dst_stride, dst0, dst1); 2436 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2437 ST_UB2(res0, res1, dst, dst_stride); 2438 dst += (2 * dst_stride); 2439 2440 inp8 = LD_UB(src); 2441 src += src_stride; 2442 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 2443 inp5, inp6, inp7, inp8, 2444 const20, const6, const3); 2445 inp9 = LD_UB(src); 2446 src += src_stride; 2447 res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 2448 inp6, inp7, inp8, inp9, 2449 const20, const6, const3); 2450 LD_UB2(dst, dst_stride, dst0, dst1); 2451 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2452 ST_UB2(res0, res1, dst, dst_stride); 2453 dst += (2 * dst_stride); 2454 2455 inp10 = LD_UB(src); 2456 src += src_stride; 2457 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 2458 inp7, inp8, inp9, inp10, 2459 const20, const6, const3); 2460 inp11 = LD_UB(src); 2461 src += src_stride; 2462 res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 2463 inp8, inp9, inp10, inp11, 2464 const20, const6, const3); 2465 LD_UB2(dst, dst_stride, dst0, dst1); 2466 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2467 ST_UB2(res0, res1, dst, dst_stride); 2468 dst += (2 * dst_stride); 2469 2470 inp12 = LD_UB(src); 2471 src += src_stride; 2472 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 2473 inp9, inp10, inp11, inp12, 2474 const20, const6, const3); 2475 inp13 = LD_UB(src); 2476 src += src_stride; 2477 res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 2478 inp10, inp11, inp12, inp13, 2479 const20, const6, const3); 2480 LD_UB2(dst, dst_stride, dst0, dst1); 2481 AVER_UB2_UB(res0, 
dst0, res1, dst1, res0, res1); 2482 ST_UB2(res0, res1, dst, dst_stride); 2483 dst += (2 * dst_stride); 2484 2485 inp14 = LD_UB(src); 2486 src += src_stride; 2487 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 2488 inp11, inp12, inp13, inp14, 2489 const20, const6, const3); 2490 inp15 = LD_UB(src); 2491 src += src_stride; 2492 res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 2493 inp12, inp13, inp14, inp15, 2494 const20, const6, const3); 2495 LD_UB2(dst, dst_stride, dst0, dst1); 2496 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2497 ST_UB2(res0, res1, dst, dst_stride); 2498 dst += (2 * dst_stride); 2499 2500 inp16 = LD_UB(src); 2501 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 2502 inp13, inp14, inp15, inp16, 2503 const20, const6, const3); 2504 res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 2505 inp14, inp15, inp16, inp16, 2506 const20, const6, const3); 2507 LD_UB2(dst, dst_stride, dst0, dst1); 2508 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2509 ST_UB2(res0, res1, dst, dst_stride); 2510 dst += (2 * dst_stride); 2511 2512 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 2513 inp15, inp16, inp16, inp15, 2514 const20, const6, const3); 2515 res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 2516 inp16, inp16, inp15, inp14, 2517 const20, const6, const3); 2518 LD_UB2(dst, dst_stride, dst0, dst1); 2519 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2520 ST_UB2(res0, res1, dst, dst_stride); 2521} 2522 2523static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src, 2524 int32_t src_stride, 2525 uint8_t *dst, 2526 int32_t dst_stride) 2527{ 2528 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2529 v16u8 dst0, dst1, dst2, dst3; 2530 v16u8 tmp0, tmp1, res0, res1; 2531 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2532 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2533 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2534 2535 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 2536 src += (4 * src_stride); 2537 LD_UB2(src, src_stride, inp4, inp5); 2538 src += (2 * src_stride); 2539 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 2540 inp1, inp2, inp3, inp4, 2541 inp1, inp0, inp0, inp1, 2542 inp2, inp3, inp4, inp5, 2543 const20, const6, const3); 2544 LD_UB2(src, src_stride, inp6, inp7); 2545 src += (2 * src_stride); 2546 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 2547 inp3, inp4, inp5, inp6, 2548 inp3, inp2, inp1, inp0, 2549 inp4, inp5, inp6, inp7, 2550 const20, const6, const3); 2551 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2552 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); 2553 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); 2554 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2555 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2556 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2557 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2558 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2559 dst += (4 * dst_stride); 2560 2561 inp8 = LD_UB(src); 2562 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 2563 inp5, inp6, inp7, inp8, 2564 inp5, inp4, inp3, inp2, 2565 inp6, inp7, inp8, inp8, 2566 const20, const6, const3); 2567 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 2568 inp7, inp8, inp8, inp7, 2569 inp7, inp6, inp5, inp4, 2570 inp8, inp8, inp7, inp6, 2571 const20, const6, const3); 2572 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2573 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); 2574 tmp1 = (v16u8) 
__msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); 2575 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2576 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2577 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2578 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2579 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2580} 2581 2582static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src, 2583 int32_t src_stride, 2584 uint8_t *dst, 2585 int32_t dst_stride) 2586{ 2587 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2588 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2589 v16u8 res0, res1, dst0, dst1; 2590 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2591 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2592 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2593 2594 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2595 src += (5 * src_stride); 2596 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 2597 inp1, inp2, inp3, inp4, 2598 const20, const6, const3); 2599 inp5 = LD_UB(src); 2600 src += src_stride; 2601 res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 2602 inp2, inp3, inp4, inp5, 2603 const20, const6, const3); 2604 LD_UB2(dst, dst_stride, dst0, dst1); 2605 AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1); 2606 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2607 ST_UB2(res0, res1, dst, dst_stride); 2608 dst += (2 * dst_stride); 2609 2610 inp6 = LD_UB(src); 2611 src += src_stride; 2612 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 2613 inp3, inp4, inp5, inp6, 2614 const20, const6, const3); 2615 inp7 = LD_UB(src); 2616 src += src_stride; 2617 res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 2618 inp4, inp5, inp6, inp7, 2619 const20, const6, const3); 2620 LD_UB2(dst, dst_stride, dst0, dst1); 2621 AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1); 2622 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2623 ST_UB2(res0, res1, dst, dst_stride); 2624 dst += (2 * dst_stride); 2625 2626 inp8 = LD_UB(src); 2627 src += src_stride; 2628 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 2629 inp5, inp6, inp7, inp8, 2630 const20, const6, const3); 2631 inp9 = LD_UB(src); 2632 src += src_stride; 2633 res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 2634 inp6, inp7, inp8, inp9, 2635 const20, const6, const3); 2636 LD_UB2(dst, dst_stride, dst0, dst1); 2637 AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1); 2638 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2639 ST_UB2(res0, res1, dst, dst_stride); 2640 dst += (2 * dst_stride); 2641 2642 inp10 = LD_UB(src); 2643 src += src_stride; 2644 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 2645 inp7, inp8, inp9, inp10, 2646 const20, const6, const3); 2647 inp11 = LD_UB(src); 2648 src += src_stride; 2649 res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 2650 inp8, inp9, inp10, inp11, 2651 const20, const6, const3); 2652 LD_UB2(dst, dst_stride, dst0, dst1); 2653 AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1); 2654 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2655 ST_UB2(res0, res1, dst, dst_stride); 2656 dst += (2 * dst_stride); 2657 2658 inp12 = LD_UB(src); 2659 src += src_stride; 2660 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 2661 inp9, inp10, inp11, inp12, 2662 const20, const6, const3); 2663 inp13 = LD_UB(src); 2664 src += src_stride; 2665 res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 2666 inp10, inp11, inp12, inp13, 2667 const20, const6, const3); 2668 LD_UB2(dst, dst_stride, dst0, dst1); 2669 AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1); 2670 
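    /* The filtered rows were just averaged with the source row below each
       output row (the aver_src1 half-pel phase); now average with the
       existing destination pixels for "avg" motion compensation. */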
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    inp15 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
}

static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp0, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp2, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp4, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp6, res);
        ST_UB(res, dst);
        dst += dst_stride;
    }

    LD_UB2(src, 1, inp0, inp1);
    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                           const20, const6, const3);
    res = __msa_ave_u_b(inp0, res);
    ST_UB(res, dst);
}

static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    uint8_t buff[272];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
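    /* Second pass: vertical filter over the 16x17 intermediate rows held in buff. */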
vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 2774} 2775 2776static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src, 2777 int32_t src_stride, 2778 uint8_t *dst, 2779 int32_t dst_stride) 2780{ 2781 v16u8 inp0, inp1, inp2, inp3; 2782 v16u8 res0, res1, avg0, avg1; 2783 v16u8 horiz0, horiz1, horiz2, horiz3; 2784 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 2785 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 2786 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 2787 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 2788 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 2789 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2790 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2791 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2792 2793 LD_UB2(src, src_stride, inp0, inp1); 2794 src += (2 * src_stride); 2795 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2796 mask2, mask3, const20, 2797 const6, const3); 2798 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 2799 horiz0 = __msa_ave_u_b(inp0, res0); 2800 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 2801 LD_UB2(src, src_stride, inp2, inp3); 2802 src += (2 * src_stride); 2803 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2804 mask2, mask3, const20, 2805 const6, const3); 2806 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 2807 horiz2 = __msa_ave_u_b(inp2, res1); 2808 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 2809 LD_UB2(src, src_stride, inp0, inp1); 2810 src += (2 * src_stride); 2811 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2812 mask2, mask3, const20, 2813 const6, const3); 2814 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 2815 horiz4 = __msa_ave_u_b(inp0, res0); 2816 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 2817 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 2818 horiz1, horiz2, horiz3, horiz4, 2819 horiz1, horiz0, horiz0, horiz1, 2820 horiz2, horiz3, horiz4, horiz5, 2821 const20, const6, const3); 2822 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 2823 res0 = __msa_ave_u_b(avg0, res0); 2824 ST_D2(res0, 0, 1, dst, dst_stride); 2825 dst += (2 * dst_stride); 2826 2827 LD_UB2(src, src_stride, inp2, inp3); 2828 src += (2 * src_stride); 2829 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2830 mask2, mask3, const20, 2831 const6, const3); 2832 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 2833 horiz6 = __msa_ave_u_b(inp2, res1); 2834 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 2835 inp0 = LD_UB(src); 2836 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 2837 mask2, mask3, const20, 2838 const6, const3); 2839 horiz8 = __msa_ave_u_b(inp0, res0); 2840 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 2841 horiz3, horiz4, horiz5, horiz6, 2842 horiz3, horiz2, horiz1, horiz0, 2843 horiz4, horiz5, horiz6, horiz7, 2844 const20, const6, const3); 2845 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 2846 res1 = __msa_ave_u_b(avg1, res1); 2847 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 2848 horiz5, horiz6, horiz7, horiz8, 2849 horiz5, horiz4, horiz3, horiz2, 2850 horiz6, horiz7, horiz8, horiz8, 2851 const20, const6, const3); 2852 ST_D2(res1, 0, 1, dst, dst_stride); 2853 dst += 2 * dst_stride; 2854 2855 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 2856 res0 = 
__msa_ave_u_b(avg0, res0); 2857 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 2858 horiz7, horiz8, horiz8, horiz7, 2859 horiz7, horiz6, horiz5, horiz4, 2860 horiz8, horiz8, horiz7, horiz6, 2861 const20, const6, const3); 2862 ST_D2(res0, 0, 1, dst, dst_stride); 2863 dst += 2 * dst_stride; 2864 2865 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 2866 res1 = __msa_ave_u_b(avg1, res1); 2867 ST_D2(res1, 0, 1, dst, dst_stride); 2868} 2869 2870static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src, 2871 int32_t src_stride, 2872 uint8_t *dst, 2873 int32_t dst_stride, 2874 int32_t height) 2875{ 2876 uint8_t loop_count; 2877 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 2878 v16u8 res; 2879 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 2880 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2881 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2882 v8u16 const20 = (v8u16) __msa_ldi_h(20); 2883 2884 for (loop_count = (height >> 2); loop_count--;) { 2885 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 2886 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 2887 src += (4 * src_stride); 2888 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 2889 const20, const6, const3); 2890 ST_UB(res, dst); 2891 dst += dst_stride; 2892 2893 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 2894 const20, const6, const3); 2895 ST_UB(res, dst); 2896 dst += dst_stride; 2897 2898 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 2899 const20, const6, const3); 2900 ST_UB(res, dst); 2901 dst += dst_stride; 2902 2903 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 2904 const20, const6, const3); 2905 ST_UB(res, dst); 2906 dst += dst_stride; 2907 } 2908 2909 LD_UB2(src, 1, inp0, inp1); 2910 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 2911 const20, const6, const3); 2912 ST_UB(res, dst); 2913} 2914 2915static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src, 2916 int32_t src_stride, 2917 uint8_t *dst, 2918 int32_t dst_stride) 2919{ 2920 uint8_t buff[272]; 2921 2922 hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); 2923 vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 2924} 2925 2926static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src, 2927 int32_t src_stride, 2928 uint8_t *dst, 2929 int32_t dst_stride) 2930{ 2931 v16u8 inp0, inp1, inp2, inp3; 2932 v16u8 res0, res1, avg0, avg1; 2933 v16u8 horiz0, horiz1, horiz2, horiz3; 2934 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 2935 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 2936 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 2937 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 2938 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 2939 v16u8 const20 = (v16u8) __msa_ldi_b(20); 2940 v16u8 const6 = (v16u8) __msa_ldi_b(6); 2941 v16u8 const3 = (v16u8) __msa_ldi_b(3); 2942 2943 LD_UB2(src, src_stride, inp0, inp1); 2944 src += (2 * src_stride); 2945 horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2946 mask2, mask3, const20, 2947 const6, const3); 2948 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 2949 2950 LD_UB2(src, src_stride, inp2, inp3); 2951 src += (2 * src_stride); 2952 horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2953 mask2, mask3, const20, 2954 const6, const3); 2955 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 2956 LD_UB2(src, src_stride, inp0, inp1); 2957 src += (2 * 
src_stride); 2958 horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2959 mask2, mask3, const20, 2960 const6, const3); 2961 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 2962 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 2963 horiz1, horiz2, horiz3, horiz4, 2964 horiz1, horiz0, horiz0, horiz1, 2965 horiz2, horiz3, horiz4, horiz5, 2966 const20, const6, const3); 2967 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 2968 res0 = __msa_ave_u_b(avg0, res0); 2969 ST_D2(res0, 0, 1, dst, dst_stride); 2970 dst += (2 * dst_stride); 2971 2972 LD_UB2(src, src_stride, inp2, inp3); 2973 src += (2 * src_stride); 2974 horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2975 mask2, mask3, const20, 2976 const6, const3); 2977 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 2978 inp0 = LD_UB(src); 2979 horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 2980 mask2, mask3, const20, 2981 const6, const3); 2982 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 2983 horiz3, horiz4, horiz5, horiz6, 2984 horiz3, horiz2, horiz1, horiz0, 2985 horiz4, horiz5, horiz6, horiz7, 2986 const20, const6, const3); 2987 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 2988 res1 = __msa_ave_u_b(avg1, res1); 2989 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 2990 res0 = __msa_ave_u_b(avg0, res0); 2991 ST_D2(res1, 0, 1, dst, dst_stride); 2992 dst += (2 * dst_stride); 2993 2994 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 2995 horiz5, horiz6, horiz7, horiz8, 2996 horiz5, horiz4, horiz3, horiz2, 2997 horiz6, horiz7, horiz8, horiz8, 2998 const20, const6, const3); 2999 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 3000 res0 = __msa_ave_u_b(avg0, res0); 3001 ST_D2(res0, 0, 1, dst, dst_stride); 3002 dst += (2 * dst_stride); 3003 3004 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3005 horiz7, horiz8, horiz8, horiz7, 3006 horiz7, horiz6, horiz5, horiz4, 3007 horiz8, horiz8, horiz7, horiz6, 3008 const20, const6, const3); 3009 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 3010 res1 = __msa_ave_u_b(avg1, res1); 3011 ST_D2(res1, 0, 1, dst, dst_stride); 3012} 3013 3014static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src, 3015 int32_t src_stride, 3016 uint8_t *dst, 3017 int32_t dst_stride, 3018 int32_t height) 3019{ 3020 uint8_t loop_count; 3021 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 3022 v16u8 res; 3023 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 3024 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3025 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3026 v8u16 const20 = (v8u16) __msa_ldi_h(20); 3027 3028 for (loop_count = (height >> 2); loop_count--;) { 3029 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 3030 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 3031 src += (4 * src_stride); 3032 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 3033 const20, const6, const3); 3034 res = __msa_ave_u_b(res, inp1); 3035 ST_UB(res, dst); 3036 dst += dst_stride; 3037 3038 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 3039 const20, const6, const3); 3040 res = __msa_ave_u_b(res, inp3); 3041 ST_UB(res, dst); 3042 dst += dst_stride; 3043 3044 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 3045 const20, const6, const3); 3046 res = __msa_ave_u_b(res, inp5); 3047 ST_UB(res, dst); 3048 dst += dst_stride; 3049 3050 res = 
APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 3051 const20, const6, const3); 3052 res = __msa_ave_u_b(res, inp7); 3053 ST_UB(res, dst); 3054 dst += dst_stride; 3055 } 3056 3057 LD_UB2(src, 1, inp0, inp1); 3058 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 3059 const20, const6, const3); 3060 res = __msa_ave_u_b(inp1, res); 3061 ST_UB(res, dst); 3062} 3063 3064static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src, 3065 int32_t src_stride, 3066 uint8_t *dst, 3067 int32_t dst_stride) 3068{ 3069 uint8_t buff[272]; 3070 3071 hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 3072 vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 3073} 3074 3075static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, 3076 int32_t src_stride, 3077 uint8_t *dst, 3078 int32_t dst_stride) 3079{ 3080 v16u8 inp0, inp1, inp2, inp3; 3081 v16u8 res0, res1, avg0, avg1; 3082 v16u8 horiz0, horiz1, horiz2, horiz3; 3083 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3084 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3085 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3086 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3087 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3088 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3089 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3090 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3091 3092 LD_UB2(src, src_stride, inp0, inp1); 3093 src += (2 * src_stride); 3094 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3095 mask2, mask3, const20, 3096 const6, const3); 3097 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3098 3099 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3100 horiz0 = __msa_ave_u_b(inp0, res0); 3101 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3102 LD_UB2(src, src_stride, inp2, inp3); 3103 src += (2 * src_stride); 3104 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3105 mask2, mask3, const20, 3106 const6, const3); 3107 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3108 3109 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3110 horiz2 = __msa_ave_u_b(inp2, res1); 3111 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3112 LD_UB2(src, src_stride, inp0, inp1); 3113 src += (2 * src_stride); 3114 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3115 mask2, mask3, const20, 3116 const6, const3); 3117 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3118 3119 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3120 horiz4 = __msa_ave_u_b(inp0, res0); 3121 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3122 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3123 horiz1, horiz2, horiz3, horiz4, 3124 horiz1, horiz0, horiz0, horiz1, 3125 horiz2, horiz3, horiz4, horiz5, 3126 const20, const6, const3); 3127 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 3128 res0 = __msa_ave_u_b(avg0, res0); 3129 ST_D2(res0, 0, 1, dst, dst_stride); 3130 dst += (2 * dst_stride); 3131 3132 LD_UB2(src, src_stride, inp2, inp3); 3133 src += (2 * src_stride); 3134 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3135 mask2, mask3, const20, 3136 const6, const3); 3137 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3138 3139 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3140 horiz6 = __msa_ave_u_b(inp2, res1); 3141 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 
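    /* Final (ninth) input row of the horizontal stage: filter it and average
       with the row advanced by one pixel, as was done for the previous rows. */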
3142 inp0 = LD_UB(src); 3143 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3144 mask2, mask3, const20, 3145 const6, const3); 3146 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 3147 horiz8 = __msa_ave_u_b(inp0, res0); 3148 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3149 horiz3, horiz4, horiz5, horiz6, 3150 horiz3, horiz2, horiz1, horiz0, 3151 horiz4, horiz5, horiz6, horiz7, 3152 const20, const6, const3); 3153 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 3154 res1 = __msa_ave_u_b(avg1, res1); 3155 ST_D2(res1, 0, 1, dst, dst_stride); 3156 dst += (2 * dst_stride); 3157 3158 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3159 horiz5, horiz6, horiz7, horiz8, 3160 horiz5, horiz4, horiz3, horiz2, 3161 horiz6, horiz7, horiz8, horiz8, 3162 const20, const6, const3); 3163 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 3164 res0 = __msa_ave_u_b(avg0, res0); 3165 ST_D2(res0, 0, 1, dst, dst_stride); 3166 dst += (2 * dst_stride); 3167 3168 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3169 horiz7, horiz8, horiz8, horiz7, 3170 horiz7, horiz6, horiz5, horiz4, 3171 horiz8, horiz8, horiz7, horiz6, 3172 const20, const6, const3); 3173 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 3174 res1 = __msa_ave_u_b(avg1, res1); 3175 ST_D2(res1, 0, 1, dst, dst_stride); 3176} 3177 3178static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src, 3179 int32_t src_stride, 3180 uint8_t *dst, 3181 int32_t dst_stride) 3182{ 3183 uint8_t buff[272]; 3184 3185 hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 3186 vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); 3187} 3188 3189static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src, 3190 int32_t src_stride, 3191 uint8_t *dst, 3192 int32_t dst_stride) 3193{ 3194 v16u8 inp0, inp1, inp2, inp3; 3195 v16u8 res0, res1; 3196 v16u8 horiz0, horiz1, horiz2, horiz3; 3197 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3198 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3199 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3200 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3201 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3202 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3203 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3204 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3205 3206 LD_UB2(src, src_stride, inp0, inp1); 3207 src += (2 * src_stride); 3208 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3209 mask2, mask3, const20, 3210 const6, const3); 3211 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3212 horiz0 = __msa_ave_u_b(inp0, res0); 3213 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3214 LD_UB2(src, src_stride, inp2, inp3); 3215 src += (2 * src_stride); 3216 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3217 mask2, mask3, const20, 3218 const6, const3); 3219 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3220 horiz2 = __msa_ave_u_b(inp2, res1); 3221 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3222 LD_UB2(src, src_stride, inp0, inp1); 3223 src += (2 * src_stride); 3224 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3225 mask2, mask3, const20, 3226 const6, const3); 3227 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3228 horiz4 = __msa_ave_u_b(inp0, res0); 3229 horiz5 = (v16u8) 
__msa_splati_d((v2i64) horiz4, 1); 3230 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3231 horiz1, horiz2, horiz3, horiz4, 3232 horiz1, horiz0, horiz0, horiz1, 3233 horiz2, horiz3, horiz4, horiz5, 3234 const20, const6, const3); 3235 3236 LD_UB2(src, src_stride, inp2, inp3); 3237 src += (2 * src_stride); 3238 ST_D2(res0, 0, 1, dst, dst_stride); 3239 dst += 2 * dst_stride; 3240 3241 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3242 mask2, mask3, const20, 3243 const6, const3); 3244 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3245 horiz6 = __msa_ave_u_b(inp2, res1); 3246 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3247 inp0 = LD_UB(src); 3248 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3249 mask2, mask3, const20, 3250 const6, const3); 3251 horiz8 = __msa_ave_u_b(inp0, res0); 3252 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3253 horiz3, horiz4, horiz5, horiz6, 3254 horiz3, horiz2, horiz1, horiz0, 3255 horiz4, horiz5, horiz6, horiz7, 3256 const20, const6, const3); 3257 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3258 horiz5, horiz6, horiz7, horiz8, 3259 horiz5, horiz4, horiz3, horiz2, 3260 horiz6, horiz7, horiz8, horiz8, 3261 const20, const6, const3); 3262 ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride); 3263 dst += (4 * dst_stride); 3264 3265 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3266 horiz7, horiz8, horiz8, horiz7, 3267 horiz7, horiz6, horiz5, horiz4, 3268 horiz8, horiz8, horiz7, horiz6, 3269 const20, const6, const3); 3270 ST_D2(res1, 0, 1, dst, dst_stride); 3271} 3272 3273static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, 3274 int32_t src_stride, 3275 uint8_t *dst, 3276 int32_t dst_stride) 3277{ 3278 uint8_t buff[272]; 3279 3280 hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); 3281 vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); 3282} 3283 3284static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, 3285 int32_t src_stride, 3286 uint8_t *dst, 3287 int32_t dst_stride) 3288{ 3289 v16u8 inp0, inp1, inp2, inp3; 3290 v16u8 res0, res1; 3291 v16u8 horiz0, horiz1, horiz2, horiz3; 3292 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3293 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3294 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3295 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3296 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3297 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3298 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3299 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3300 3301 LD_UB2(src, src_stride, inp0, inp1); 3302 src += (2 * src_stride); 3303 horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3304 mask2, mask3, const20, 3305 const6, const3); 3306 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3307 LD_UB2(src, src_stride, inp2, inp3); 3308 src += (2 * src_stride); 3309 horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3310 mask2, mask3, const20, 3311 const6, const3); 3312 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3313 LD_UB2(src, src_stride, inp0, inp1); 3314 src += (2 * src_stride); 3315 horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3316 mask2, mask3, const20, 3317 const6, const3); 3318 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3319 res0 = 
APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3320 horiz1, horiz2, horiz3, horiz4, 3321 horiz1, horiz0, horiz0, horiz1, 3322 horiz2, horiz3, horiz4, horiz5, 3323 const20, const6, const3); 3324 LD_UB2(src, src_stride, inp2, inp3); 3325 src += (2 * src_stride); 3326 ST_D2(res0, 0, 1, dst, dst_stride); 3327 dst += 2 * dst_stride; 3328 3329 horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3330 mask2, mask3, const20, 3331 const6, const3); 3332 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3333 inp0 = LD_UB(src); 3334 horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3335 mask2, mask3, const20, 3336 const6, const3); 3337 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3338 horiz3, horiz4, horiz5, horiz6, 3339 horiz3, horiz2, horiz1, horiz0, 3340 horiz4, horiz5, horiz6, horiz7, 3341 const20, const6, const3); 3342 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3343 horiz5, horiz6, horiz7, horiz8, 3344 horiz5, horiz4, horiz3, horiz2, 3345 horiz6, horiz7, horiz8, horiz8, 3346 const20, const6, const3); 3347 ST_D2(res1, 0, 1, dst, dst_stride); 3348 dst += 2 * dst_stride; 3349 3350 3351 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3352 horiz7, horiz8, horiz8, horiz7, 3353 horiz7, horiz6, horiz5, horiz4, 3354 horiz8, horiz8, horiz7, horiz6, 3355 const20, const6, const3); 3356 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3357} 3358 3359static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src, 3360 int32_t src_stride, 3361 uint8_t *dst, 3362 int32_t dst_stride) 3363{ 3364 uint8_t buff[272]; 3365 3366 hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 3367 vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); 3368} 3369 3370static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, 3371 int32_t src_stride, 3372 uint8_t *dst, 3373 int32_t dst_stride) 3374{ 3375 v16u8 inp0, inp1, inp2, inp3; 3376 v16u8 res0, res1; 3377 v16u8 horiz0, horiz1, horiz2, horiz3; 3378 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3379 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3380 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3381 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3382 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3383 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3384 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3385 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3386 3387 LD_UB2(src, src_stride, inp0, inp1); 3388 src += (2 * src_stride); 3389 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3390 mask2, mask3, const20, 3391 const6, const3); 3392 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3393 3394 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3395 horiz0 = __msa_ave_u_b(inp0, res0); 3396 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3397 LD_UB2(src, src_stride, inp2, inp3); 3398 src += (2 * src_stride); 3399 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3400 mask2, mask3, const20, 3401 const6, const3); 3402 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3403 3404 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3405 horiz2 = __msa_ave_u_b(inp2, res1); 3406 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3407 LD_UB2(src, src_stride, inp0, inp1); 3408 src += (2 * src_stride); 3409 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, 
mask1, 3410 mask2, mask3, const20, 3411 const6, const3); 3412 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3413 3414 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3415 horiz4 = __msa_ave_u_b(inp0, res0); 3416 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3417 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3418 horiz1, horiz2, horiz3, horiz4, 3419 horiz1, horiz0, horiz0, horiz1, 3420 horiz2, horiz3, horiz4, horiz5, 3421 const20, const6, const3); 3422 LD_UB2(src, src_stride, inp2, inp3); 3423 src += (2 * src_stride); 3424 ST_D2(res0, 0, 1, dst, dst_stride); 3425 dst += 2 * dst_stride; 3426 3427 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3428 mask2, mask3, const20, 3429 const6, const3); 3430 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3431 3432 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3433 horiz6 = __msa_ave_u_b(inp2, res1); 3434 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3435 inp0 = LD_UB(src); 3436 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3437 mask2, mask3, const20, 3438 const6, const3); 3439 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 3440 horiz8 = __msa_ave_u_b(inp0, res0); 3441 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3442 horiz3, horiz4, horiz5, horiz6, 3443 horiz3, horiz2, horiz1, horiz0, 3444 horiz4, horiz5, horiz6, horiz7, 3445 const20, const6, const3); 3446 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3447 horiz5, horiz6, horiz7, horiz8, 3448 horiz5, horiz4, horiz3, horiz2, 3449 horiz6, horiz7, horiz8, horiz8, 3450 const20, const6, const3); 3451 ST_D2(res1, 0, 1, dst, dst_stride); 3452 dst += 2 * dst_stride; 3453 3454 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3455 horiz7, horiz8, horiz8, horiz7, 3456 horiz7, horiz6, horiz5, horiz4, 3457 horiz8, horiz8, horiz7, horiz6, 3458 const20, const6, const3); 3459 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3460} 3461 3462static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src, 3463 int32_t src_stride, 3464 uint8_t *dst, 3465 int32_t dst_stride) 3466{ 3467 uint8_t buff[272]; 3468 3469 hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 3470 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 3471} 3472 3473static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src, 3474 int32_t src_stride, 3475 uint8_t *dst, 3476 int32_t dst_stride) 3477{ 3478 v16u8 inp0, inp1, inp2, inp3; 3479 v16u8 res0, res1, avg0, avg1; 3480 v16u8 horiz0, horiz1, horiz2, horiz3; 3481 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3482 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3483 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3484 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3485 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3486 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3487 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3488 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3489 3490 LD_UB2(src, src_stride, inp0, inp1); 3491 src += (2 * src_stride); 3492 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3493 mask2, mask3, const20, 3494 const6, const3); 3495 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3496 horiz0 = __msa_ave_u_b(inp0, res0); 3497 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3498 LD_UB2(src, src_stride, inp2, inp3); 
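    /* Rows 2-3: apply the horizontal no-round filter, then take the
     * truncating average (__msa_ave_u_b) with the unshifted source pixels,
     * i.e. the "src0" horizontal case of the no-rounding qpel MC. */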
3499 src += (2 * src_stride); 3500 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3501 mask2, mask3, const20, 3502 const6, const3); 3503 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3504 horiz2 = __msa_ave_u_b(inp2, res1); 3505 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3506 LD_UB2(src, src_stride, inp0, inp1); 3507 src += (2 * src_stride); 3508 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3509 mask2, mask3, const20, 3510 const6, const3); 3511 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3512 horiz4 = __msa_ave_u_b(inp0, res0); 3513 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3514 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3515 horiz1, horiz2, horiz3, horiz4, 3516 horiz1, horiz0, horiz0, horiz1, 3517 horiz2, horiz3, horiz4, horiz5, 3518 const20, const6, const3); 3519 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 3520 res0 = __msa_ave_u_b(avg0, res0); 3521 ST_D2(res0, 0, 1, dst, dst_stride); 3522 dst += (2 * dst_stride); 3523 3524 LD_UB2(src, src_stride, inp2, inp3); 3525 src += (2 * src_stride); 3526 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3527 mask2, mask3, const20, 3528 const6, const3); 3529 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3530 horiz6 = __msa_ave_u_b(inp2, res1); 3531 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3532 inp0 = LD_UB(src); 3533 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3534 mask2, mask3, const20, 3535 const6, const3); 3536 horiz8 = __msa_ave_u_b(inp0, res0); 3537 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3538 horiz3, horiz4, horiz5, horiz6, 3539 horiz3, horiz2, horiz1, horiz0, 3540 horiz4, horiz5, horiz6, horiz7, 3541 const20, const6, const3); 3542 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 3543 res1 = __msa_ave_u_b(avg1, res1); 3544 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3545 horiz5, horiz6, horiz7, horiz8, 3546 horiz5, horiz4, horiz3, horiz2, 3547 horiz6, horiz7, horiz8, horiz8, 3548 const20, const6, const3); 3549 ST_D2(res1, 0, 1, dst, dst_stride); 3550 dst += 2 * dst_stride; 3551 3552 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 3553 res0 = __msa_ave_u_b(avg0, res0); 3554 3555 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3556 horiz7, horiz8, horiz8, horiz7, 3557 horiz7, horiz6, horiz5, horiz4, 3558 horiz8, horiz8, horiz7, horiz6, 3559 const20, const6, const3); 3560 ST_D2(res0, 0, 1, dst, dst_stride); 3561 dst += 2 * dst_stride; 3562 3563 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 3564 res1 = __msa_ave_u_b(avg1, res1); 3565 ST_D2(res1, 0, 1, dst, dst_stride); 3566} 3567 3568static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src, 3569 int32_t src_stride, 3570 uint8_t *dst, 3571 int32_t dst_stride) 3572{ 3573 uint8_t buff[272]; 3574 3575 hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); 3576 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 3577} 3578 3579static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src, 3580 int32_t src_stride, 3581 uint8_t *dst, 3582 int32_t dst_stride) 3583{ 3584 v16u8 inp0, inp1, inp2, inp3; 3585 v16u8 res0, res1, avg0, avg1; 3586 v16u8 horiz0, horiz1, horiz2, horiz3; 3587 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3588 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3589 v16u8 
          mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_ave_u_b(avg0, res0);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_ave_u_b(avg1, res1);
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                         mask2, mask3, const20,
                                                         const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_ave_u_b(avg0, res0);
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    uint8_t buff[272];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
}

static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
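    /* 8x8 horizontal+vertical qpel interpolation, no-rounding "src11" case:
     * each pair of rows is filtered horizontally and averaged with the
     * source shifted right by one pixel, the nine intermediate rows
     * (horiz0..horiz8) are then filtered vertically and averaged with the
     * row below.  All averages are truncating (__msa_ave_u_b). */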
v16u8 inp0, inp1, inp2, inp3; 3678 v16u8 res0, res1, avg0, avg1; 3679 v16u8 horiz0, horiz1, horiz2, horiz3; 3680 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3681 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3682 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3683 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3684 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3685 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3686 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3687 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3688 3689 LD_UB2(src, src_stride, inp0, inp1); 3690 src += (2 * src_stride); 3691 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3692 mask2, mask3, const20, 3693 const6, const3); 3694 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3695 3696 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3697 horiz0 = __msa_ave_u_b(inp0, res0); 3698 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3699 LD_UB2(src, src_stride, inp2, inp3); 3700 src += (2 * src_stride); 3701 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3702 mask2, mask3, const20, 3703 const6, const3); 3704 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3705 3706 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3707 horiz2 = __msa_ave_u_b(inp2, res1); 3708 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3709 LD_UB2(src, src_stride, inp0, inp1); 3710 src += (2 * src_stride); 3711 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3712 mask2, mask3, const20, 3713 const6, const3); 3714 3715 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3716 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3717 horiz4 = __msa_ave_u_b(inp0, res0); 3718 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3719 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3720 horiz1, horiz2, horiz3, horiz4, 3721 horiz1, horiz0, horiz0, horiz1, 3722 horiz2, horiz3, horiz4, horiz5, 3723 const20, const6, const3); 3724 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 3725 res0 = __msa_ave_u_b(avg0, res0); 3726 ST_D2(res0, 0, 1, dst, dst_stride); 3727 dst += (2 * dst_stride); 3728 3729 LD_UB2(src, src_stride, inp2, inp3); 3730 src += (2 * src_stride); 3731 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3732 mask2, mask3, const20, 3733 const6, const3); 3734 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3735 3736 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3737 horiz6 = __msa_ave_u_b(inp2, res1); 3738 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3739 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3740 horiz3, horiz4, horiz5, horiz6, 3741 horiz3, horiz2, horiz1, horiz0, 3742 horiz4, horiz5, horiz6, horiz7, 3743 const20, const6, const3); 3744 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 3745 res1 = __msa_ave_u_b(avg1, res1); 3746 ST_D2(res1, 0, 1, dst, dst_stride); 3747 dst += (2 * dst_stride); 3748 3749 inp0 = LD_UB(src); 3750 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3751 mask2, mask3, const20, 3752 const6, const3); 3753 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 3754 horiz8 = __msa_ave_u_b(inp0, res0); 3755 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3756 horiz5, horiz6, horiz7, horiz8, 3757 horiz5, horiz4, horiz3, horiz2, 3758 horiz6, horiz7, horiz8, horiz8, 3759 const20, 
const6, const3); 3760 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3761 horiz7, horiz8, horiz8, horiz7, 3762 horiz7, horiz6, horiz5, horiz4, 3763 horiz8, horiz8, horiz7, horiz6, 3764 const20, const6, const3); 3765 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 3766 res0 = __msa_ave_u_b(avg0, res0); 3767 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 3768 res1 = __msa_ave_u_b(avg1, res1); 3769 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3770} 3771 3772static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src, 3773 int32_t src_stride, 3774 uint8_t *dst, 3775 int32_t dst_stride, 3776 int32_t height) 3777{ 3778 uint8_t loop_count; 3779 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 3780 v16u8 res; 3781 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 3782 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3783 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3784 v8u16 const20 = (v8u16) __msa_ldi_h(20); 3785 3786 for (loop_count = (height >> 2); loop_count--;) { 3787 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 3788 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 3789 src += (4 * src_stride); 3790 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 3791 const20, const6, const3); 3792 res = __msa_aver_u_b(inp0, res); 3793 ST_UB(res, dst); 3794 dst += dst_stride; 3795 3796 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 3797 const20, const6, const3); 3798 res = __msa_aver_u_b(inp2, res); 3799 ST_UB(res, dst); 3800 dst += dst_stride; 3801 3802 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 3803 const20, const6, const3); 3804 res = __msa_aver_u_b(inp4, res); 3805 ST_UB(res, dst); 3806 dst += dst_stride; 3807 3808 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 3809 const20, const6, const3); 3810 res = __msa_aver_u_b(inp6, res); 3811 ST_UB(res, dst); 3812 dst += dst_stride; 3813 } 3814 3815 LD_UB2(src, 1, inp0, inp1); 3816 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); 3817 res = __msa_aver_u_b(inp0, res); 3818 ST_UB(res, dst); 3819} 3820 3821static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src, 3822 int32_t src_stride, 3823 uint8_t *dst, 3824 int32_t dst_stride) 3825{ 3826 uint8_t buff[272]; 3827 3828 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 3829 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 3830} 3831 3832static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src, 3833 int32_t src_stride, 3834 uint8_t *dst, 3835 int32_t dst_stride) 3836{ 3837 v16u8 inp0, inp1, inp2, inp3; 3838 v16u8 res0, res1, avg0, avg1; 3839 v16u8 horiz0, horiz1, horiz2, horiz3; 3840 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3841 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3842 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3843 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3844 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3845 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3846 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3847 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3848 3849 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 3850 src += (4 * src_stride); 3851 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 3852 const20, const6, const3); 3853 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 3854 const20, const6, const3); 3855 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3856 horiz0 = __msa_aver_u_b(inp0, res0); 
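    /* horiz0 holds two filtered and source-averaged rows, one per
     * doubleword; __msa_splati_d(..., 1) below copies the upper row into
     * its own register so the vertical filter can address rows
     * individually. */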
3857 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3858 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3859 horiz2 = __msa_aver_u_b(inp2, res1); 3860 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3861 LD_UB2(src, src_stride, inp0, inp1); 3862 src += (2 * src_stride); 3863 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 3864 const20, const6, const3); 3865 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3866 horiz4 = __msa_aver_u_b(inp0, res0); 3867 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3868 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3869 horiz1, horiz2, horiz3, horiz4, 3870 horiz1, horiz0, horiz0, horiz1, 3871 horiz2, horiz3, horiz4, horiz5, 3872 const20, const6, const3); 3873 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 3874 res0 = __msa_aver_u_b(avg0, res0); 3875 ST_D2(res0, 0, 1, dst, dst_stride); 3876 dst += (2 * dst_stride); 3877 3878 LD_UB2(src, src_stride, inp2, inp3); 3879 src += (2 * src_stride); 3880 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 3881 const20, const6, const3); 3882 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3883 horiz6 = __msa_aver_u_b(inp2, res1); 3884 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3885 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3886 horiz3, horiz4, horiz5, horiz6, 3887 horiz3, horiz2, horiz1, horiz0, 3888 horiz4, horiz5, horiz6, horiz7, 3889 const20, const6, const3); 3890 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 3891 res1 = __msa_aver_u_b(avg1, res1); 3892 3893 inp0 = LD_UB(src); 3894 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 3895 const20, const6, const3); 3896 horiz8 = __msa_aver_u_b(inp0, res0); 3897 ST_D2(res1, 0, 1, dst, dst_stride); 3898 dst += 2 * dst_stride; 3899 3900 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3901 horiz5, horiz6, horiz7, horiz8, 3902 horiz5, horiz4, horiz3, horiz2, 3903 horiz6, horiz7, horiz8, horiz8, 3904 const20, const6, const3); 3905 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 3906 res0 = __msa_aver_u_b(avg0, res0); 3907 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3908 horiz7, horiz8, horiz8, horiz7, 3909 horiz7, horiz6, horiz5, horiz4, 3910 horiz8, horiz8, horiz7, horiz6, 3911 const20, const6, const3); 3912 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 3913 res1 = __msa_aver_u_b(avg1, res1); 3914 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3915} 3916 3917static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src, 3918 int32_t src_stride, 3919 uint8_t *dst, 3920 int32_t dst_stride, 3921 int32_t height) 3922{ 3923 uint8_t loop_count; 3924 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 3925 v16u8 res; 3926 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 3927 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3928 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3929 v8u16 const20 = (v8u16) __msa_ldi_h(20); 3930 3931 for (loop_count = (height >> 2); loop_count--;) { 3932 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 3933 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 3934 src += (4 * src_stride); 3935 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 3936 const20, const6, const3); 3937 ST_UB(res, dst); 3938 dst += dst_stride; 3939 3940 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 3941 const20, const6, const3); 3942 ST_UB(res, dst); 3943 dst += dst_stride; 3944 3945 res = 
APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 3946 const20, const6, const3); 3947 ST_UB(res, dst); 3948 dst += dst_stride; 3949 3950 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 3951 const20, const6, const3); 3952 ST_UB(res, dst); 3953 dst += dst_stride; 3954 } 3955 3956 LD_UB2(src, 1, inp0, inp1); 3957 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); 3958 ST_UB(res, dst); 3959} 3960 3961static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src, 3962 int32_t src_stride, 3963 uint8_t *dst, 3964 int32_t dst_stride) 3965{ 3966 uint8_t buff[272]; 3967 3968 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 3969 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 3970} 3971 3972static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src, 3973 int32_t src_stride, 3974 uint8_t *dst, 3975 int32_t dst_stride) 3976{ 3977 v16u8 inp0, inp1, inp2, inp3; 3978 v16u8 res0, res1, avg0, avg1; 3979 v16u8 horiz0, horiz1, horiz2, horiz3; 3980 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3981 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3982 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3983 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3984 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3985 v16u8 const20 = (v16u8) __msa_ldi_b(20); 3986 v16u8 const6 = (v16u8) __msa_ldi_b(6); 3987 v16u8 const3 = (v16u8) __msa_ldi_b(3); 3988 3989 LD_UB2(src, src_stride, inp0, inp1); 3990 src += (2 * src_stride); 3991 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 3992 mask0, mask1, mask2, mask3, 3993 const20, const6, const3); 3994 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3995 LD_UB2(src, src_stride, inp2, inp3); 3996 src += (2 * src_stride); 3997 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 3998 mask0, mask1, mask2, mask3, 3999 const20, const6, const3); 4000 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4001 LD_UB2(src, src_stride, inp0, inp1); 4002 src += (2 * src_stride); 4003 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4004 mask0, mask1, mask2, mask3, 4005 const20, const6, const3); 4006 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4007 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4008 horiz1, horiz2, horiz3, horiz4, 4009 horiz1, horiz0, horiz0, horiz1, 4010 horiz2, horiz3, horiz4, horiz5, 4011 const20, const6, const3); 4012 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4013 res0 = __msa_aver_u_b(avg0, res0); 4014 ST_D2(res0, 0, 1, dst, dst_stride); 4015 dst += (2 * dst_stride); 4016 4017 LD_UB2(src, src_stride, inp2, inp3); 4018 src += (2 * src_stride); 4019 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4020 mask0, mask1, mask2, mask3, 4021 const20, const6, const3); 4022 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4023 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4024 horiz3, horiz4, horiz5, horiz6, 4025 horiz3, horiz2, horiz1, horiz0, 4026 horiz4, horiz5, horiz6, horiz7, 4027 const20, const6, const3); 4028 inp0 = LD_UB(src); 4029 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 4030 mask0, mask1, mask2, mask3, 4031 const20, const6, const3); 4032 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4033 res1 = __msa_aver_u_b(avg1, res1); 4034 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4035 horiz5, horiz6, horiz7, horiz8, 4036 horiz5, horiz4, horiz3, horiz2, 4037 horiz6, horiz7, horiz8, horiz8, 4038 const20, const6, const3); 4039 ST_D2(res1, 0, 1, 
dst, dst_stride); 4040 dst += 2 * dst_stride; 4041 4042 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4043 res0 = __msa_aver_u_b(avg0, res0); 4044 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4045 horiz7, horiz8, horiz8, horiz7, 4046 horiz7, horiz6, horiz5, horiz4, 4047 horiz8, horiz8, horiz7, horiz6, 4048 const20, const6, const3); 4049 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4050 res1 = __msa_aver_u_b(avg1, res1); 4051 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4052} 4053 4054static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src, 4055 int32_t src_stride, 4056 uint8_t *dst, 4057 int32_t dst_stride, 4058 int32_t height) 4059{ 4060 uint8_t loop_count; 4061 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 4062 v16u8 res; 4063 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 4064 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4065 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4066 v8u16 const20 = (v8u16) __msa_ldi_h(20); 4067 4068 for (loop_count = (height >> 2); loop_count--;) { 4069 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 4070 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 4071 src += (4 * src_stride); 4072 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 4073 const20, const6, const3); 4074 res = __msa_aver_u_b(res, inp1); 4075 ST_UB(res, dst); 4076 dst += dst_stride; 4077 4078 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 4079 const20, const6, const3); 4080 res = __msa_aver_u_b(res, inp3); 4081 ST_UB(res, dst); 4082 dst += dst_stride; 4083 4084 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 4085 const20, const6, const3); 4086 res = __msa_aver_u_b(res, inp5); 4087 ST_UB(res, dst); 4088 dst += dst_stride; 4089 4090 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 4091 const20, const6, const3); 4092 res = __msa_aver_u_b(res, inp7); 4093 ST_UB(res, dst); 4094 dst += dst_stride; 4095 } 4096 4097 LD_UB2(src, 1, inp0, inp1); 4098 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); 4099 res = __msa_aver_u_b(inp1, res); 4100 ST_UB(res, dst); 4101} 4102 4103static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src, 4104 int32_t src_stride, 4105 uint8_t *dst, 4106 int32_t dst_stride) 4107{ 4108 uint8_t buff[272]; 4109 4110 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4111 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4112} 4113 4114static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, 4115 int32_t src_stride, 4116 uint8_t *dst, 4117 int32_t dst_stride) 4118{ 4119 v16u8 inp0, inp1, inp2, inp3; 4120 v16u8 res0, res1, avg0, avg1; 4121 v16u8 horiz0, horiz1, horiz2, horiz3; 4122 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4123 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4124 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4125 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4126 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4127 v16u8 const20 = (v16u8) __msa_ldi_b(20); 4128 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4129 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4130 4131 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4132 src += (4 * src_stride); 4133 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4134 const20, const6, const3); 4135 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4136 const20, const6, const3); 4137 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4138 4139 inp0 = (v16u8) 
__msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4140 horiz0 = __msa_aver_u_b(inp0, res0); 4141 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4142 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4143 4144 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4145 horiz2 = __msa_aver_u_b(inp2, res1); 4146 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4147 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4148 src += (4 * src_stride); 4149 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4150 const20, const6, const3); 4151 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4152 const20, const6, const3); 4153 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4154 4155 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4156 horiz4 = __msa_aver_u_b(inp0, res0); 4157 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4158 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4159 4160 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4161 horiz6 = __msa_aver_u_b(inp2, res1); 4162 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4163 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4164 horiz1, horiz2, horiz3, horiz4, 4165 horiz1, horiz0, horiz0, horiz1, 4166 horiz2, horiz3, horiz4, horiz5, 4167 const20, const6, const3); 4168 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4169 res0 = __msa_aver_u_b(avg0, res0); 4170 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4171 horiz3, horiz4, horiz5, horiz6, 4172 horiz3, horiz2, horiz1, horiz0, 4173 horiz4, horiz5, horiz6, horiz7, 4174 const20, const6, const3); 4175 ST_D2(res0, 0, 1, dst, dst_stride); 4176 dst += 2 * dst_stride; 4177 4178 inp0 = LD_UB(src); 4179 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4180 const20, const6, const3); 4181 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4182 res1 = __msa_aver_u_b(avg1, res1); 4183 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 4184 horiz8 = __msa_aver_u_b(inp0, res0); 4185 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4186 horiz5, horiz6, horiz7, horiz8, 4187 horiz5, horiz4, horiz3, horiz2, 4188 horiz6, horiz7, horiz8, horiz8, 4189 const20, const6, const3); 4190 ST_D2(res1, 0, 1, dst, dst_stride); 4191 dst += 2 * dst_stride; 4192 4193 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4194 res0 = __msa_aver_u_b(avg0, res0); 4195 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4196 horiz7, horiz8, horiz8, horiz7, 4197 horiz7, horiz6, horiz5, horiz4, 4198 horiz8, horiz8, horiz7, horiz6, 4199 const20, const6, const3); 4200 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4201 res1 = __msa_aver_u_b(avg1, res1); 4202 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4203} 4204 4205static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src, 4206 int32_t src_stride, 4207 uint8_t *dst, 4208 int32_t dst_stride) 4209{ 4210 uint8_t buff[272]; 4211 4212 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 4213 vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); 4214} 4215 4216static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src, 4217 int32_t src_stride, 4218 uint8_t *dst, 4219 int32_t dst_stride) 4220{ 4221 v16u8 inp0, inp1, inp2, inp3; 4222 v16u8 res0, res1; 4223 v16u8 horiz0, horiz1, horiz2, horiz3; 4224 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4225 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 
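    /* mask0-mask3 gather source-byte pairs for the taps of the standard
     * 8-tap quarter-pel filter (-1, 3, -6, 20, 20, -6, 3, -1, normalized
     * by 32), with the block-edge reflection folded into the mask values;
     * const20/const6/const3 hold the tap magnitudes used by
     * APPLY_HORIZ_QPEL_FILTER_8BYTE. */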
4226 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4227 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4228 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4229 v16u8 const20 = (v16u8) __msa_ldi_b(20); 4230 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4231 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4232 4233 LD_UB2(src, src_stride, inp0, inp1); 4234 src += (2 * src_stride); 4235 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4236 const20, const6, const3); 4237 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4238 horiz0 = __msa_aver_u_b(inp0, res0); 4239 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4240 4241 LD_UB2(src, src_stride, inp2, inp3); 4242 src += (2 * src_stride); 4243 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4244 const20, const6, const3); 4245 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4246 horiz2 = __msa_aver_u_b(inp2, res1); 4247 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4248 LD_UB2(src, src_stride, inp0, inp1); 4249 src += (2 * src_stride); 4250 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4251 const20, const6, const3); 4252 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4253 horiz4 = __msa_aver_u_b(inp0, res0); 4254 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4255 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4256 horiz1, horiz2, horiz3, horiz4, 4257 horiz1, horiz0, horiz0, horiz1, 4258 horiz2, horiz3, horiz4, horiz5, 4259 const20, const6, const3); 4260 ST_D2(res0, 0, 1, dst, dst_stride); 4261 dst += (2 * dst_stride); 4262 4263 LD_UB2(src, src_stride, inp2, inp3); 4264 src += (2 * src_stride); 4265 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4266 const20, const6, const3); 4267 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4268 horiz6 = __msa_aver_u_b(inp2, res1); 4269 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4270 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4271 horiz3, horiz4, horiz5, horiz6, 4272 horiz3, horiz2, horiz1, horiz0, 4273 horiz4, horiz5, horiz6, horiz7, 4274 const20, const6, const3); 4275 inp0 = LD_UB(src); 4276 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4277 const20, const6, const3); 4278 horiz8 = __msa_aver_u_b(inp0, res0); 4279 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4280 horiz5, horiz6, horiz7, horiz8, 4281 horiz5, horiz4, horiz3, horiz2, 4282 horiz6, horiz7, horiz8, horiz8, 4283 const20, const6, const3); 4284 ST_D2(res1, 0, 1, dst, dst_stride); 4285 dst += 2 * dst_stride; 4286 4287 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4288 horiz7, horiz8, horiz8, horiz7, 4289 horiz7, horiz6, horiz5, horiz4, 4290 horiz8, horiz8, horiz7, horiz6, 4291 const20, const6, const3); 4292 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4293} 4294 4295static void hv_mc_qpel_16x16_msa(const uint8_t *src, 4296 int32_t src_stride, 4297 uint8_t *dst, 4298 int32_t dst_stride) 4299{ 4300 uint8_t buff[272]; 4301 4302 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 4303 vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); 4304} 4305 4306static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride, 4307 uint8_t *dst, int32_t dst_stride) 4308{ 4309 v16u8 inp0, inp1, inp2, inp3; 4310 v16u8 res0, res1; 4311 v16u8 horiz0, horiz1, horiz2, horiz3; 4312 v16u8 horiz4, horiz5, horiz6, horiz7, 
horiz8; 4313 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4314 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4315 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4316 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4317 v16u8 const20 = (v16u8) __msa_ldi_b(20); 4318 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4319 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4320 4321 LD_UB2(src, src_stride, inp0, inp1); 4322 src += (2 * src_stride); 4323 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4324 mask0, mask1, mask2, mask3, 4325 const20, const6, const3); 4326 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4327 LD_UB2(src, src_stride, inp2, inp3); 4328 src += (2 * src_stride); 4329 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4330 mask0, mask1, mask2, mask3, 4331 const20, const6, const3); 4332 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4333 LD_UB2(src, src_stride, inp0, inp1); 4334 src += (2 * src_stride); 4335 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4336 mask0, mask1, mask2, mask3, 4337 const20, const6, const3); 4338 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4339 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4340 horiz1, horiz2, horiz3, horiz4, 4341 horiz1, horiz0, horiz0, horiz1, 4342 horiz2, horiz3, horiz4, horiz5, 4343 const20, const6, const3); 4344 ST_D2(res0, 0, 1, dst, dst_stride); 4345 dst += (2 * dst_stride); 4346 4347 LD_UB2(src, src_stride, inp2, inp3); 4348 src += (2 * src_stride); 4349 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4350 mask0, mask1, mask2, mask3, 4351 const20, const6, const3); 4352 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4353 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4354 horiz3, horiz4, horiz5, horiz6, 4355 horiz3, horiz2, horiz1, horiz0, 4356 horiz4, horiz5, horiz6, horiz7, 4357 const20, const6, const3); 4358 inp0 = LD_UB(src); 4359 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 4360 mask0, mask1, mask2, mask3, 4361 const20, const6, const3); 4362 ST_D2(res1, 0, 1, dst, dst_stride); 4363 dst += 2 * dst_stride; 4364 4365 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4366 horiz5, horiz6, horiz7, horiz8, 4367 horiz5, horiz4, horiz3, horiz2, 4368 horiz6, horiz7, horiz8, horiz8, 4369 const20, const6, const3); 4370 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4371 horiz7, horiz8, horiz8, horiz7, 4372 horiz7, horiz6, horiz5, horiz4, 4373 horiz8, horiz8, horiz7, horiz6, 4374 const20, const6, const3); 4375 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4376} 4377 4378static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src, 4379 int32_t src_stride, 4380 uint8_t *dst, 4381 int32_t dst_stride) 4382{ 4383 uint8_t buff[272]; 4384 4385 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4386 vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); 4387} 4388 4389static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src, 4390 int32_t src_stride, 4391 uint8_t *dst, 4392 int32_t dst_stride) 4393{ 4394 v16u8 inp0, inp1, inp2, inp3; 4395 v16u8 res0, res1; 4396 v16u8 horiz0, horiz1, horiz2, horiz3; 4397 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4398 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4399 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4400 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4401 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4402 v16u8 const20 
= (v16u8) __msa_ldi_b(20); 4403 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4404 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4405 4406 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4407 src += (4 * src_stride); 4408 4409 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4410 const20, const6, const3); 4411 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4412 const20, const6, const3); 4413 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4414 4415 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4416 horiz0 = __msa_aver_u_b(inp0, res0); 4417 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4418 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4419 4420 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4421 horiz2 = __msa_aver_u_b(inp2, res1); 4422 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4423 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4424 src += (4 * src_stride); 4425 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4426 const20, const6, const3); 4427 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4428 const20, const6, const3); 4429 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4430 4431 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4432 horiz4 = __msa_aver_u_b(inp0, res0); 4433 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4434 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4435 4436 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4437 horiz6 = __msa_aver_u_b(inp2, res1); 4438 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4439 inp0 = LD_UB(src); 4440 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4441 const20, const6, const3); 4442 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 4443 horiz8 = __msa_aver_u_b(inp0, res0); 4444 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4445 horiz1, horiz2, horiz3, horiz4, 4446 horiz1, horiz0, horiz0, horiz1, 4447 horiz2, horiz3, horiz4, horiz5, 4448 const20, const6, const3); 4449 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4450 horiz3, horiz4, horiz5, horiz6, 4451 horiz3, horiz2, horiz1, horiz0, 4452 horiz4, horiz5, horiz6, horiz7, 4453 const20, const6, const3); 4454 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4455 dst += (4 * dst_stride); 4456 4457 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4458 horiz5, horiz6, horiz7, horiz8, 4459 horiz5, horiz4, horiz3, horiz2, 4460 horiz6, horiz7, horiz8, horiz8, 4461 const20, const6, const3); 4462 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4463 horiz7, horiz8, horiz8, horiz7, 4464 horiz7, horiz6, horiz5, horiz4, 4465 horiz8, horiz8, horiz7, horiz6, 4466 const20, const6, const3); 4467 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4468} 4469 4470static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src, 4471 int32_t src_stride, 4472 uint8_t *dst, 4473 int32_t dst_stride) 4474{ 4475 uint8_t buff[272]; 4476 4477 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 4478 vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 4479} 4480 4481static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src, 4482 int32_t src_stride, 4483 uint8_t *dst, 4484 int32_t dst_stride) 4485{ 4486 v16u8 inp0, inp1, inp2, inp3; 4487 v16u8 res0, res1, avg0, avg1; 4488 v16u8 horiz0, horiz1, horiz2, horiz3; 4489 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4490 v16u8 mask0 
= { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4491 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4492 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4493 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4494 v16u8 const20 = (v16u8) __msa_ldi_b(20); 4495 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4496 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4497 4498 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4499 src += (4 * src_stride); 4500 4501 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4502 const20, const6, const3); 4503 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4504 const20, const6, const3); 4505 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4506 horiz0 = __msa_aver_u_b(inp0, res0); 4507 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4508 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4509 horiz2 = __msa_aver_u_b(inp2, res1); 4510 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4511 LD_UB2(src, src_stride, inp0, inp1); 4512 src += (2 * src_stride); 4513 4514 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4515 const20, const6, const3); 4516 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4517 horiz4 = __msa_aver_u_b(inp0, res0); 4518 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4519 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4520 horiz1, horiz2, horiz3, horiz4, 4521 horiz1, horiz0, horiz0, horiz1, 4522 horiz2, horiz3, horiz4, horiz5, 4523 const20, const6, const3); 4524 avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); 4525 res0 = __msa_aver_u_b(avg0, res0); 4526 ST_D2(res0, 0, 1, dst, dst_stride); 4527 dst += (2 * dst_stride); 4528 4529 LD_UB2(src, src_stride, inp2, inp3); 4530 src += (2 * src_stride); 4531 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4532 const20, const6, const3); 4533 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4534 horiz6 = __msa_aver_u_b(inp2, res1); 4535 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4536 inp0 = LD_UB(src); 4537 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4538 const20, const6, const3); 4539 horiz8 = __msa_aver_u_b(inp0, res0); 4540 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4541 horiz3, horiz4, horiz5, horiz6, 4542 horiz3, horiz2, horiz1, horiz0, 4543 horiz4, horiz5, horiz6, horiz7, 4544 const20, const6, const3); 4545 avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4); 4546 res1 = __msa_aver_u_b(avg1, res1); 4547 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4548 horiz5, horiz6, horiz7, horiz8, 4549 horiz5, horiz4, horiz3, horiz2, 4550 horiz6, horiz7, horiz8, horiz8, 4551 const20, const6, const3); 4552 ST_D2(res1, 0, 1, dst, dst_stride); 4553 dst += 2 * dst_stride; 4554 4555 avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6); 4556 res0 = __msa_aver_u_b(avg0, res0); 4557 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4558 horiz7, horiz8, horiz8, horiz7, 4559 horiz7, horiz6, horiz5, horiz4, 4560 horiz8, horiz8, horiz7, horiz6, 4561 const20, const6, const3); 4562 avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8); 4563 res1 = __msa_aver_u_b(avg1, res1); 4564 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4565} 4566 4567static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src, 4568 int32_t src_stride, 4569 uint8_t *dst, 4570 
                                             int32_t dst_stride)
{
    uint8_t buff[272];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
    vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
}

static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
    res1 = __msa_aver_u_b(avg1, res1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;
    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
    res0 = __msa_aver_u_b(avg0, res0);

    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
    res1 =
__msa_aver_u_b(avg1, res1); 4658 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4659} 4660 4661static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src, 4662 int32_t src_stride, 4663 uint8_t *dst, 4664 int32_t dst_stride) 4665{ 4666 uint8_t buff[272]; 4667 4668 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4669 vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 4670} 4671 4672static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, 4673 int32_t src_stride, 4674 uint8_t *dst, int32_t dst_stride) 4675{ 4676 v16u8 inp0, inp1, inp2, inp3; 4677 v16u8 res0, res1, avg0, avg1; 4678 v16u8 horiz0, horiz1, horiz2, horiz3; 4679 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4680 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4681 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4682 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4683 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4684 v16u8 const20 = (v16u8) __msa_ldi_b(20); 4685 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4686 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4687 4688 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4689 src += (4 * src_stride); 4690 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4691 mask0, mask1, mask2, mask3, 4692 const20, const6, const3); 4693 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4694 4695 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4696 horiz0 = __msa_aver_u_b(inp0, res0); 4697 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4698 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4699 const20, const6, const3); 4700 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4701 4702 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4703 horiz2 = __msa_aver_u_b(inp2, res1); 4704 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4705 LD_UB2(src, src_stride, inp0, inp1); 4706 src += (2 * src_stride); 4707 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4708 const20, const6, const3); 4709 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4710 4711 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4712 horiz4 = __msa_aver_u_b(inp0, res0); 4713 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4714 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4715 horiz1, horiz2, horiz3, horiz4, 4716 horiz1, horiz0, horiz0, horiz1, 4717 horiz2, horiz3, horiz4, horiz5, 4718 const20, const6, const3); 4719 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 4720 res0 = __msa_aver_u_b(avg0, res0); 4721 LD_UB2(src, src_stride, inp2, inp3); 4722 src += (2 * src_stride); 4723 ST_D2(res0, 0, 1, dst, dst_stride); 4724 dst += 2 * dst_stride; 4725 4726 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4727 const20, const6, const3); 4728 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4729 4730 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4731 horiz6 = __msa_aver_u_b(inp2, res1); 4732 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4733 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4734 horiz3, horiz4, horiz5, horiz6, 4735 horiz3, horiz2, horiz1, horiz0, 4736 horiz4, horiz5, horiz6, horiz7, 4737 const20, const6, const3); 4738 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 4739 res1 = __msa_aver_u_b(avg1, res1); 4740 inp0 = LD_UB(src); 4741 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 
4742 const20, const6, const3); 4743 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 4744 horiz8 = __msa_aver_u_b(inp0, res0); 4745 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4746 horiz5, horiz6, horiz7, horiz8, 4747 horiz5, horiz4, horiz3, horiz2, 4748 horiz6, horiz7, horiz8, horiz8, 4749 const20, const6, const3); 4750 ST_D2(res1, 0, 1, dst, dst_stride); 4751 dst += 2 * dst_stride; 4752 4753 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 4754 res0 = __msa_aver_u_b(avg0, res0); 4755 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4756 horiz7, horiz8, horiz8, horiz7, 4757 horiz7, horiz6, horiz5, horiz4, 4758 horiz8, horiz8, horiz7, horiz6, 4759 const20, const6, const3); 4760 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 4761 res1 = __msa_aver_u_b(avg1, res1); 4762 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4763} 4764 4765static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src, 4766 int32_t src_stride, 4767 uint8_t *dst, 4768 int32_t dst_stride) 4769{ 4770 uint8_t buff[272]; 4771 4772 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 4773 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4774} 4775 4776static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src, 4777 int32_t src_stride, 4778 uint8_t *dst, 4779 int32_t dst_stride) 4780{ 4781 v16u8 inp0, inp1, inp2, inp3; 4782 v16u8 res0, res1, avg0, avg1; 4783 v16u8 horiz0, horiz1, horiz2, horiz3; 4784 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4785 v16u8 dst0, dst1; 4786 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4787 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4788 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4789 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4790 v16u8 const20 = (v16u8) __msa_ldi_b(20); 4791 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4792 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4793 4794 LD_UB2(src, src_stride, inp0, inp1); 4795 src += (2 * src_stride); 4796 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4797 const20, const6, const3); 4798 LD_UB2(src, src_stride, inp2, inp3); 4799 src += (2 * src_stride); 4800 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4801 horiz0 = __msa_aver_u_b(inp0, res0); 4802 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4803 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4804 const20, const6, const3); 4805 LD_UB2(src, src_stride, inp0, inp1); 4806 src += (2 * src_stride); 4807 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4808 horiz2 = __msa_aver_u_b(inp2, res1); 4809 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4810 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4811 const20, const6, const3); 4812 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4813 horiz4 = __msa_aver_u_b(inp0, res0); 4814 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4815 LD_UB2(dst, dst_stride, dst0, dst1); 4816 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4817 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4818 horiz1, horiz2, horiz3, horiz4, 4819 horiz1, horiz0, horiz0, horiz1, 4820 horiz2, horiz3, horiz4, horiz5, 4821 const20, const6, const3); 4822 res0 = __msa_aver_u_b(avg0, res0); 4823 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4824 res0 = __msa_aver_u_b(avg0, res0); 4825 ST_D2(res0, 0, 1, dst, 
dst_stride); 4826 dst += (2 * dst_stride); 4827 4828 LD_UB2(src, src_stride, inp2, inp3); 4829 src += (2 * src_stride); 4830 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4831 const20, const6, const3); 4832 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4833 horiz6 = __msa_aver_u_b(inp2, res1); 4834 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4835 LD_UB2(dst, dst_stride, dst0, dst1); 4836 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4837 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4838 horiz3, horiz4, horiz5, horiz6, 4839 horiz3, horiz2, horiz1, horiz0, 4840 horiz4, horiz5, horiz6, horiz7, 4841 const20, const6, const3); 4842 res1 = __msa_aver_u_b(avg1, res1); 4843 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4844 res1 = __msa_aver_u_b(avg1, res1); 4845 ST_D2(res1, 0, 1, dst, dst_stride); 4846 dst += (2 * dst_stride); 4847 4848 inp0 = LD_UB(src); 4849 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4850 const20, const6, const3); 4851 horiz8 = __msa_aver_u_b(inp0, res0); 4852 LD_UB2(dst, dst_stride, dst0, dst1); 4853 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4854 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4855 horiz5, horiz6, horiz7, horiz8, 4856 horiz5, horiz4, horiz3, horiz2, 4857 horiz6, horiz7, horiz8, horiz8, 4858 const20, const6, const3); 4859 res0 = __msa_aver_u_b(avg0, res0); 4860 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4861 res0 = __msa_aver_u_b(avg0, res0); 4862 ST_D2(res0, 0, 1, dst, dst_stride); 4863 dst += (2 * dst_stride); 4864 4865 LD_UB2(dst, dst_stride, dst0, dst1); 4866 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4867 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4868 horiz7, horiz8, horiz8, horiz7, 4869 horiz7, horiz6, horiz5, horiz4, 4870 horiz8, horiz8, horiz7, horiz6, 4871 const20, const6, const3); 4872 res1 = __msa_aver_u_b(avg1, res1); 4873 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4874 res1 = __msa_aver_u_b(avg1, res1); 4875 ST_D2(res1, 0, 1, dst, dst_stride); 4876} 4877 4878static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src, 4879 int32_t src_stride, 4880 uint8_t *dst, 4881 int32_t dst_stride) 4882{ 4883 uint8_t buff[272]; 4884 4885 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 4886 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4887} 4888 4889static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src, 4890 int32_t src_stride, 4891 uint8_t *dst, 4892 int32_t dst_stride) 4893{ 4894 v16u8 inp0, inp1, inp2, inp3; 4895 v16u8 res0, res1, avg0, avg1; 4896 v16u8 horiz0, horiz1, horiz2, horiz3; 4897 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4898 v16u8 dst0, dst1; 4899 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4900 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4901 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4902 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4903 v16u8 const20 = (v16u8) __msa_ldi_b(20); 4904 v16u8 const6 = (v16u8) __msa_ldi_b(6); 4905 v16u8 const3 = (v16u8) __msa_ldi_b(3); 4906 4907 LD_UB2(src, src_stride, inp0, inp1); 4908 src += (2 * src_stride); 4909 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4910 mask0, mask1, mask2, mask3, 4911 const20, const6, const3); 4912 LD_UB2(src, src_stride, inp2, inp3); 4913 src += (2 * src_stride); 4914 horiz1 = (v16u8) 
__msa_splati_d((v2i64) horiz0, 1); 4915 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4916 mask0, mask1, mask2, mask3, 4917 const20, const6, const3); 4918 LD_UB2(src, src_stride, inp0, inp1); 4919 src += (2 * src_stride); 4920 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4921 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4922 mask0, mask1, mask2, mask3, 4923 const20, const6, const3); 4924 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4925 LD_UB2(dst, dst_stride, dst0, dst1); 4926 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4927 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4928 horiz1, horiz2, horiz3, horiz4, 4929 horiz1, horiz0, horiz0, horiz1, 4930 horiz2, horiz3, horiz4, horiz5, 4931 const20, const6, const3); 4932 res0 = __msa_aver_u_b(avg0, res0); 4933 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4934 res0 = __msa_aver_u_b(avg0, res0); 4935 ST_D2(res0, 0, 1, dst, dst_stride); 4936 dst += (2 * dst_stride); 4937 4938 LD_UB2(src, src_stride, inp2, inp3); 4939 src += (2 * src_stride); 4940 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4941 mask0, mask1, mask2, mask3, 4942 const20, const6, const3); 4943 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4944 LD_UB2(dst, dst_stride, dst0, dst1); 4945 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4946 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4947 horiz3, horiz4, horiz5, horiz6, 4948 horiz3, horiz2, horiz1, horiz0, 4949 horiz4, horiz5, horiz6, horiz7, 4950 const20, const6, const3); 4951 res1 = __msa_aver_u_b(avg1, res1); 4952 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4953 res1 = __msa_aver_u_b(avg1, res1); 4954 ST_D2(res1, 0, 1, dst, dst_stride); 4955 dst += (2 * dst_stride); 4956 4957 inp0 = LD_UB(src); 4958 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 4959 mask0, mask1, mask2, mask3, 4960 const20, const6, const3); 4961 LD_UB2(dst, dst_stride, dst0, dst1); 4962 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4963 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4964 horiz5, horiz6, horiz7, horiz8, 4965 horiz5, horiz4, horiz3, horiz2, 4966 horiz6, horiz7, horiz8, horiz8, 4967 const20, const6, const3); 4968 res0 = __msa_aver_u_b(avg0, res0); 4969 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4970 res0 = __msa_aver_u_b(avg0, res0); 4971 ST_D2(res0, 0, 1, dst, dst_stride); 4972 dst += (2 * dst_stride); 4973 4974 LD_UB2(dst, dst_stride, dst0, dst1); 4975 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4976 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4977 horiz7, horiz8, horiz8, horiz7, 4978 horiz7, horiz6, horiz5, horiz4, 4979 horiz8, horiz8, horiz7, horiz6, 4980 const20, const6, const3); 4981 res1 = __msa_aver_u_b(avg1, res1); 4982 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4983 res1 = __msa_aver_u_b(avg1, res1); 4984 ST_D2(res1, 0, 1, dst, dst_stride); 4985} 4986 4987static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src, 4988 int32_t src_stride, 4989 uint8_t *dst, 4990 int32_t dst_stride) 4991{ 4992 uint8_t buff[272]; 4993 4994 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4995 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4996} 4997 4998static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, 4999 int32_t src_stride, 5000 uint8_t *dst, 5001 int32_t dst_stride) 5002{ 5003 v16u8 inp0, inp1, inp2, inp3; 5004 v16u8 res0, res1, avg0, 
avg1; 5005 v16u8 horiz0, horiz1, horiz2, horiz3; 5006 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5007 v16u8 dst0, dst1; 5008 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5009 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5010 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5011 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5012 v16u8 const20 = (v16u8) __msa_ldi_b(20); 5013 v16u8 const6 = (v16u8) __msa_ldi_b(6); 5014 v16u8 const3 = (v16u8) __msa_ldi_b(3); 5015 5016 LD_UB2(src, src_stride, inp0, inp1); 5017 src += (2 * src_stride); 5018 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5019 const20, const6, const3); 5020 5021 LD_UB2(src, src_stride, inp2, inp3); 5022 src += (2 * src_stride); 5023 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5024 5025 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5026 horiz0 = __msa_aver_u_b(inp0, res0); 5027 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5028 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5029 const20, const6, const3); 5030 LD_UB2(src, src_stride, inp0, inp1); 5031 src += (2 * src_stride); 5032 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5033 5034 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5035 horiz2 = __msa_aver_u_b(inp2, res1); 5036 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5037 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5038 const20, const6, const3); 5039 5040 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5041 5042 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5043 horiz4 = __msa_aver_u_b(inp0, res0); 5044 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5045 LD_UB2(dst, dst_stride, dst0, dst1); 5046 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 5047 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5048 horiz1, horiz2, horiz3, horiz4, 5049 horiz1, horiz0, horiz0, horiz1, 5050 horiz2, horiz3, horiz4, horiz5, 5051 const20, const6, const3); 5052 res0 = __msa_aver_u_b(avg0, res0); 5053 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5054 res0 = __msa_aver_u_b(avg0, res0); 5055 ST_D2(res0, 0, 1, dst, dst_stride); 5056 dst += (2 * dst_stride); 5057 5058 LD_UB2(src, src_stride, inp2, inp3); 5059 src += (2 * src_stride); 5060 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5061 const20, const6, const3); 5062 5063 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5064 5065 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5066 horiz6 = __msa_aver_u_b(inp2, res1); 5067 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5068 LD_UB2(dst, dst_stride, dst0, dst1); 5069 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 5070 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5071 horiz3, horiz4, horiz5, horiz6, 5072 horiz3, horiz2, horiz1, horiz0, 5073 horiz4, horiz5, horiz6, horiz7, 5074 const20, const6, const3); 5075 res1 = __msa_aver_u_b(avg1, res1); 5076 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5077 res1 = __msa_aver_u_b(avg1, res1); 5078 ST_D2(res1, 0, 1, dst, dst_stride); 5079 dst += (2 * dst_stride); 5080 5081 inp0 = LD_UB(src); 5082 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5083 const20, const6, const3); 5084 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 5085 horiz8 = __msa_aver_u_b(inp0, res0); 5086 LD_UB2(dst, dst_stride, dst0, 
dst1); 5087 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 5088 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5089 horiz5, horiz6, horiz7, horiz8, 5090 horiz5, horiz4, horiz3, horiz2, 5091 horiz6, horiz7, horiz8, horiz8, 5092 const20, const6, const3); 5093 res0 = __msa_aver_u_b(avg0, res0); 5094 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5095 res0 = __msa_aver_u_b(avg0, res0); 5096 ST_D2(res0, 0, 1, dst, dst_stride); 5097 dst += (2 * dst_stride); 5098 5099 LD_UB2(dst, dst_stride, dst0, dst1); 5100 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 5101 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5102 horiz7, horiz8, horiz8, horiz7, 5103 horiz7, horiz6, horiz5, horiz4, 5104 horiz8, horiz8, horiz7, horiz6, 5105 const20, const6, const3); 5106 res1 = __msa_aver_u_b(avg1, res1); 5107 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5108 res1 = __msa_aver_u_b(avg1, res1); 5109 ST_D2(res1, 0, 1, dst, dst_stride); 5110} 5111 5112static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src, 5113 int32_t src_stride, 5114 uint8_t *dst, 5115 int32_t dst_stride) 5116{ 5117 uint8_t buff[272]; 5118 5119 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 5120 vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); 5121} 5122 5123static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src, 5124 int32_t src_stride, 5125 uint8_t *dst, 5126 int32_t dst_stride) 5127{ 5128 v16u8 inp0, inp1, inp2, inp3; 5129 v16u8 res0, res1, avg0, avg1; 5130 v16u8 horiz0, horiz1, horiz2, horiz3; 5131 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5132 v16u8 dst0, dst1; 5133 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5134 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5135 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5136 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5137 v16u8 const20 = (v16u8) __msa_ldi_b(20); 5138 v16u8 const6 = (v16u8) __msa_ldi_b(6); 5139 v16u8 const3 = (v16u8) __msa_ldi_b(3); 5140 5141 LD_UB2(src, src_stride, inp0, inp1); 5142 src += (2 * src_stride); 5143 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5144 const20, const6, const3); 5145 LD_UB2(src, src_stride, inp2, inp3); 5146 src += (2 * src_stride); 5147 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5148 horiz0 = __msa_aver_u_b(inp0, res0); 5149 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5150 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5151 const20, const6, const3); 5152 LD_UB2(src, src_stride, inp0, inp1); 5153 src += (2 * src_stride); 5154 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5155 horiz2 = __msa_aver_u_b(inp2, res1); 5156 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5157 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5158 const20, const6, const3); 5159 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5160 horiz4 = __msa_aver_u_b(inp0, res0); 5161 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5162 LD_UB2(dst, dst_stride, dst0, dst1); 5163 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5164 horiz1, horiz2, horiz3, horiz4, 5165 horiz1, horiz0, horiz0, horiz1, 5166 horiz2, horiz3, horiz4, horiz5, 5167 const20, const6, const3); 5168 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5169 res0 = __msa_aver_u_b(avg0, res0); 5170 ST_D2(res0, 0, 1, dst, dst_stride); 
5171 dst += (2 * dst_stride); 5172 5173 LD_UB2(src, src_stride, inp2, inp3); 5174 src += (2 * src_stride); 5175 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5176 const20, const6, const3); 5177 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5178 horiz6 = __msa_aver_u_b(inp2, res1); 5179 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5180 LD_UB2(dst, dst_stride, dst0, dst1); 5181 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5182 horiz3, horiz4, horiz5, horiz6, 5183 horiz3, horiz2, horiz1, horiz0, 5184 horiz4, horiz5, horiz6, horiz7, 5185 const20, const6, const3); 5186 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5187 res1 = __msa_aver_u_b(avg1, res1); 5188 ST_D2(res1, 0, 1, dst, dst_stride); 5189 dst += (2 * dst_stride); 5190 5191 inp0 = LD_UB(src); 5192 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5193 const20, const6, const3); 5194 horiz8 = __msa_aver_u_b(inp0, res0); 5195 LD_UB2(dst, dst_stride, dst0, dst1); 5196 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5197 horiz5, horiz6, horiz7, horiz8, 5198 horiz5, horiz4, horiz3, horiz2, 5199 horiz6, horiz7, horiz8, horiz8, 5200 const20, const6, const3); 5201 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5202 res0 = __msa_aver_u_b(avg0, res0); 5203 ST_D2(res0, 0, 1, dst, dst_stride); 5204 dst += (2 * dst_stride); 5205 5206 LD_UB2(dst, dst_stride, dst0, dst1); 5207 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5208 horiz7, horiz8, horiz8, horiz7, 5209 horiz7, horiz6, horiz5, horiz4, 5210 horiz8, horiz8, horiz7, horiz6, 5211 const20, const6, const3); 5212 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5213 res1 = __msa_aver_u_b(avg1, res1); 5214 ST_D2(res1, 0, 1, dst, dst_stride); 5215} 5216 5217static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride, 5218 uint8_t *dst, int32_t dst_stride) 5219{ 5220 uint8_t buff[272]; 5221 5222 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 5223 vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); 5224 5225} 5226 5227static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride, 5228 uint8_t *dst, int32_t dst_stride) 5229{ 5230 v16u8 inp0, inp1, inp2, inp3; 5231 v16u8 res0, res1, avg0, avg1; 5232 v16u8 horiz0, horiz1, horiz2, horiz3; 5233 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5234 v16u8 dst0, dst1; 5235 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5236 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5237 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5238 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5239 v16u8 const20 = (v16u8) __msa_ldi_b(20); 5240 v16u8 const6 = (v16u8) __msa_ldi_b(6); 5241 v16u8 const3 = (v16u8) __msa_ldi_b(3); 5242 5243 LD_UB2(src, src_stride, inp0, inp1); 5244 src += (2 * src_stride); 5245 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5246 mask0, mask1, mask2, mask3, 5247 const20, const6, const3); 5248 LD_UB2(src, src_stride, inp2, inp3); 5249 src += (2 * src_stride); 5250 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5251 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5252 mask0, mask1, mask2, mask3, 5253 const20, const6, const3); 5254 LD_UB2(src, src_stride, inp0, inp1); 5255 src += (2 * src_stride); 5256 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5257 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5258 mask0, mask1, mask2, mask3, 5259 
const20, const6, const3); 5260 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5261 LD_UB2(src, src_stride, inp2, inp3); 5262 src += (2 * src_stride); 5263 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5264 mask0, mask1, mask2, mask3, 5265 const20, const6, const3); 5266 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5267 inp0 = LD_UB(src); 5268 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 5269 mask0, mask1, mask2, mask3, 5270 const20, const6, const3); 5271 LD_UB2(dst, dst_stride, dst0, dst1); 5272 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5273 horiz1, horiz2, horiz3, horiz4, 5274 horiz1, horiz0, horiz0, horiz1, 5275 horiz2, horiz3, horiz4, horiz5, 5276 const20, const6, const3); 5277 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5278 res0 = __msa_aver_u_b(avg0, res0); 5279 ST_D2(res0, 0, 1, dst, dst_stride); 5280 dst += (2 * dst_stride); 5281 5282 LD_UB2(dst, dst_stride, dst0, dst1); 5283 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5284 horiz3, horiz4, horiz5, horiz6, 5285 horiz3, horiz2, horiz1, horiz0, 5286 horiz4, horiz5, horiz6, horiz7, 5287 const20, const6, const3); 5288 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5289 res1 = __msa_aver_u_b(avg1, res1); 5290 ST_D2(res1, 0, 1, dst, dst_stride); 5291 dst += (2 * dst_stride); 5292 5293 LD_UB2(dst, dst_stride, dst0, dst1); 5294 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5295 horiz5, horiz6, horiz7, horiz8, 5296 horiz5, horiz4, horiz3, horiz2, 5297 horiz6, horiz7, horiz8, horiz8, 5298 const20, const6, const3); 5299 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5300 res0 = __msa_aver_u_b(avg0, res0); 5301 ST_D2(res0, 0, 1, dst, dst_stride); 5302 dst += (2 * dst_stride); 5303 5304 LD_UB2(dst, dst_stride, dst0, dst1); 5305 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5306 horiz7, horiz8, horiz8, horiz7, 5307 horiz7, horiz6, horiz5, horiz4, 5308 horiz8, horiz8, horiz7, horiz6, 5309 const20, const6, const3); 5310 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5311 res1 = __msa_aver_u_b(avg1, res1); 5312 ST_D2(res1, 0, 1, dst, dst_stride); 5313} 5314 5315static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src, 5316 int32_t src_stride, 5317 uint8_t *dst, 5318 int32_t dst_stride) 5319{ 5320 uint8_t buff[272]; 5321 5322 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 5323 vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); 5324} 5325 5326static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, 5327 int32_t src_stride, 5328 uint8_t *dst, 5329 int32_t dst_stride) 5330{ 5331 v16u8 inp0, inp1, inp2, inp3; 5332 v16u8 res0, res1, avg0, avg1; 5333 v16u8 horiz0, horiz1, horiz2, horiz3; 5334 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5335 v16u8 dst0, dst1; 5336 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5337 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5338 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5339 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5340 v16u8 const20 = (v16u8) __msa_ldi_b(20); 5341 v16u8 const6 = (v16u8) __msa_ldi_b(6); 5342 v16u8 const3 = (v16u8) __msa_ldi_b(3); 5343 5344 LD_UB2(src, src_stride, inp0, inp1); 5345 src += (2 * src_stride); 5346 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5347 const20, const6, const3); 5348 LD_UB2(src, src_stride, inp2, inp3); 5349 src += (2 * src_stride); 5350 
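    /* The SLDI_B2_UB below shifts each loaded row left by one byte so the
     * vectors start at src + 1; averaging that shifted data with the 8-tap
     * filter output gives the "aver_h_src1" behaviour, i.e. the horizontal
     * interpolation rounded towards the right-hand source pixel. */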
SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5351 5352 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5353 horiz0 = __msa_aver_u_b(inp0, res0); 5354 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5355 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5356 const20, const6, const3); 5357 LD_UB2(src, src_stride, inp0, inp1); 5358 src += (2 * src_stride); 5359 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5360 5361 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5362 horiz2 = __msa_aver_u_b(inp2, res1); 5363 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5364 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5365 const20, const6, const3); 5366 5367 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5368 5369 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5370 horiz4 = __msa_aver_u_b(inp0, res0); 5371 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5372 LD_UB2(dst, dst_stride, dst0, dst1); 5373 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5374 horiz1, horiz2, horiz3, horiz4, 5375 horiz1, horiz0, horiz0, horiz1, 5376 horiz2, horiz3, horiz4, horiz5, 5377 const20, const6, const3); 5378 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5379 res0 = __msa_aver_u_b(avg0, res0); 5380 ST_D2(res0, 0, 1, dst, dst_stride); 5381 dst += (2 * dst_stride); 5382 5383 LD_UB2(src, src_stride, inp2, inp3); 5384 src += (2 * src_stride); 5385 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5386 const20, const6, const3); 5387 5388 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5389 5390 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5391 horiz6 = __msa_aver_u_b(inp2, res1); 5392 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5393 LD_UB2(dst, dst_stride, dst0, dst1); 5394 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5395 horiz3, horiz4, horiz5, horiz6, 5396 horiz3, horiz2, horiz1, horiz0, 5397 horiz4, horiz5, horiz6, horiz7, 5398 const20, const6, const3); 5399 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5400 res1 = __msa_aver_u_b(avg1, res1); 5401 ST_D2(res1, 0, 1, dst, dst_stride); 5402 dst += (2 * dst_stride); 5403 5404 inp0 = LD_UB(src); 5405 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5406 const20, const6, const3); 5407 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 5408 horiz8 = __msa_aver_u_b(inp0, res0); 5409 LD_UB2(dst, dst_stride, dst0, dst1); 5410 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5411 horiz5, horiz6, horiz7, horiz8, 5412 horiz5, horiz4, horiz3, horiz2, 5413 horiz6, horiz7, horiz8, horiz8, 5414 const20, const6, const3); 5415 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5416 res0 = __msa_aver_u_b(avg0, res0); 5417 ST_D2(res0, 0, 1, dst, dst_stride); 5418 dst += (2 * dst_stride); 5419 5420 LD_UB2(dst, dst_stride, dst0, dst1); 5421 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5422 horiz7, horiz8, horiz8, horiz7, 5423 horiz7, horiz6, horiz5, horiz4, 5424 horiz8, horiz8, horiz7, horiz6, 5425 const20, const6, const3); 5426 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5427 res1 = __msa_aver_u_b(avg1, res1); 5428 ST_D2(res1, 0, 1, dst, dst_stride); 5429} 5430 5431static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src, 5432 int32_t src_stride, 5433 uint8_t *dst, 5434 int32_t dst_stride) 5435{ 5436 uint8_t buff[272]; 5437 5438 
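    /* Two-pass 16x16 path: the horizontal qpel pass fills the 272-byte
     * scratch buffer (16-byte stride, room for the 16 + 1 rows the vertical
     * filter needs), then the vertical pass reads it back and also averages
     * the result with the bytes already present in dst. */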
hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 5439 vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 5440} 5441 5442static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src, 5443 int32_t src_stride, 5444 uint8_t *dst, 5445 int32_t dst_stride) 5446{ 5447 v16u8 inp0, inp1, inp2, inp3; 5448 v16u8 res0, res1, avg0, avg1; 5449 v16u8 horiz0, horiz1, horiz2, horiz3; 5450 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5451 v16u8 dst0, dst1; 5452 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5453 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5454 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5455 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5456 v16u8 const20 = (v16u8) __msa_ldi_b(20); 5457 v16u8 const6 = (v16u8) __msa_ldi_b(6); 5458 v16u8 const3 = (v16u8) __msa_ldi_b(3); 5459 5460 LD_UB2(src, src_stride, inp0, inp1); 5461 src += (2 * src_stride); 5462 5463 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5464 const20, const6, const3); 5465 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5466 horiz0 = __msa_aver_u_b(inp0, res0); 5467 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5468 LD_UB2(src, src_stride, inp2, inp3); 5469 src += (2 * src_stride); 5470 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5471 const20, const6, const3); 5472 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5473 horiz2 = __msa_aver_u_b(inp2, res1); 5474 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5475 LD_UB2(dst, dst_stride, dst0, dst1); 5476 LD_UB2(src, src_stride, inp0, inp1); 5477 src += (2 * src_stride); 5478 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5479 const20, const6, const3); 5480 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5481 horiz4 = __msa_aver_u_b(inp0, res0); 5482 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5483 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5484 horiz1, horiz2, horiz3, horiz4, 5485 horiz1, horiz0, horiz0, horiz1, 5486 horiz2, horiz3, horiz4, horiz5, 5487 const20, const6, const3); 5488 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 5489 res0 = __msa_aver_u_b(avg0, res0); 5490 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5491 res0 = __msa_aver_u_b(avg0, res0); 5492 ST_D2(res0, 0, 1, dst, dst_stride); 5493 dst += (2 * dst_stride); 5494 5495 LD_UB2(dst, dst_stride, dst0, dst1); 5496 LD_UB2(src, src_stride, inp2, inp3); 5497 src += (2 * src_stride); 5498 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5499 const20, const6, const3); 5500 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5501 horiz6 = __msa_aver_u_b(inp2, res1); 5502 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5503 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5504 horiz3, horiz4, horiz5, horiz6, 5505 horiz3, horiz2, horiz1, horiz0, 5506 horiz4, horiz5, horiz6, horiz7, 5507 const20, const6, const3); 5508 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 5509 res1 = __msa_aver_u_b(avg1, res1); 5510 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5511 res1 = __msa_aver_u_b(avg1, res1); 5512 ST_D2(res1, 0, 1, dst, dst_stride); 5513 dst += (2 * dst_stride); 5514 5515 inp0 = LD_UB(src); 5516 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5517 const20, const6, const3); 5518 horiz8 = 
__msa_aver_u_b(inp0, res0); 5519 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5520 horiz5, horiz6, horiz7, horiz8, 5521 horiz5, horiz4, horiz3, horiz2, 5522 horiz6, horiz7, horiz8, horiz8, 5523 const20, const6, const3); 5524 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5525 horiz7, horiz8, horiz8, horiz7, 5526 horiz7, horiz6, horiz5, horiz4, 5527 horiz8, horiz8, horiz7, horiz6, 5528 const20, const6, const3); 5529 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 5530 res0 = __msa_aver_u_b(avg0, res0); 5531 LD_UB2(dst, dst_stride, dst0, dst1); 5532 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5533 res0 = __msa_aver_u_b(avg0, res0); 5534 ST_D2(res0, 0, 1, dst, dst_stride); 5535 dst += (2 * dst_stride); 5536 5537 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 5538 res1 = __msa_aver_u_b(avg1, res1); 5539 LD_UB2(dst, dst_stride, dst0, dst1); 5540 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5541 res1 = __msa_aver_u_b(avg1, res1); 5542 ST_D2(res1, 0, 1, dst, dst_stride); 5543} 5544 5545static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src, 5546 int32_t src_stride, 5547 uint8_t *dst, 5548 int32_t dst_stride) 5549{ 5550 uint8_t buff[272]; 5551 5552 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 5553 vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 5554} 5555 5556static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src, 5557 int32_t src_stride, 5558 uint8_t *dst, 5559 int32_t dst_stride) 5560{ 5561 v16u8 inp0, inp1, inp2, inp3; 5562 v16u8 res0, res1, avg0, avg1; 5563 v16u8 horiz0, horiz1, horiz2, horiz3; 5564 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5565 v16u8 dst0, dst1; 5566 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5567 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5568 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5569 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5570 v16u8 const20 = (v16u8) __msa_ldi_b(20); 5571 v16u8 const6 = (v16u8) __msa_ldi_b(6); 5572 v16u8 const3 = (v16u8) __msa_ldi_b(3); 5573 5574 LD_UB2(src, src_stride, inp0, inp1); 5575 src += (2 * src_stride); 5576 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5577 mask0, mask1, mask2, mask3, 5578 const20, const6, const3); 5579 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5580 LD_UB2(src, src_stride, inp2, inp3); 5581 src += (2 * src_stride); 5582 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5583 mask0, mask1, mask2, mask3, 5584 const20, const6, const3); 5585 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5586 LD_UB2(dst, dst_stride, dst0, dst1); 5587 LD_UB2(src, src_stride, inp0, inp1); 5588 src += (2 * src_stride); 5589 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5590 mask0, mask1, mask2, mask3, 5591 const20, const6, const3); 5592 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5593 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5594 horiz1, horiz2, horiz3, horiz4, 5595 horiz1, horiz0, horiz0, horiz1, 5596 horiz2, horiz3, horiz4, horiz5, 5597 const20, const6, const3); 5598 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 5599 res0 = __msa_aver_u_b(avg0, res0); 5600 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5601 res0 = __msa_aver_u_b(avg0, res0); 5602 ST_D2(res0, 0, 1, dst, dst_stride); 5603 dst += (2 * dst_stride); 5604 5605 LD_UB2(dst, dst_stride, dst0, dst1); 5606 LD_UB2(src, src_stride, inp2, inp3); 
5607 src += (2 * src_stride); 5608 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5609 mask0, mask1, mask2, mask3, 5610 const20, const6, const3); 5611 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5612 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5613 horiz3, horiz4, horiz5, horiz6, 5614 horiz3, horiz2, horiz1, horiz0, 5615 horiz4, horiz5, horiz6, horiz7, 5616 const20, const6, const3); 5617 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 5618 res1 = __msa_aver_u_b(avg1, res1); 5619 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5620 res1 = __msa_aver_u_b(avg1, res1); 5621 ST_D2(res1, 0, 1, dst, dst_stride); 5622 dst += (2 * dst_stride); 5623 5624 inp0 = LD_UB(src); 5625 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 5626 mask0, mask1, mask2, mask3, 5627 const20, const6, const3); 5628 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5, 5629 horiz6, horiz7, horiz8, horiz5, horiz4, 5630 horiz3, horiz2, horiz6, horiz7, horiz8, 5631 horiz8, const20, const6, const3); 5632 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, 5633 horiz8, horiz8, horiz7, horiz7, horiz6, 5634 horiz5, horiz4, horiz8, horiz8, horiz7, 5635 horiz6, const20, const6, const3); 5636 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 5637 res0 = __msa_aver_u_b(avg0, res0); 5638 LD_UB2(dst, dst_stride, dst0, dst1); 5639 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5640 res0 = __msa_aver_u_b(avg0, res0); 5641 ST_D2(res0, 0, 1, dst, dst_stride); 5642 dst += (2 * dst_stride); 5643 5644 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 5645 res1 = __msa_aver_u_b(avg1, res1); 5646 LD_UB2(dst, dst_stride, dst0, dst1); 5647 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5648 res1 = __msa_aver_u_b(avg1, res1); 5649 ST_D2(res1, 0, 1, dst, dst_stride); 5650} 5651 5652static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src, 5653 int32_t src_stride, 5654 uint8_t *dst, 5655 int32_t dst_stride) 5656{ 5657 uint8_t buff[272]; 5658 5659 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 5660 vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 5661} 5662 5663static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, 5664 int32_t src_stride, 5665 uint8_t *dst, 5666 int32_t dst_stride) 5667{ 5668 v16u8 inp0, inp1, inp2, inp3; 5669 v16u8 res0, res1, avg0, avg1; 5670 v16u8 horiz0, horiz1, horiz2, horiz3; 5671 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5672 v16u8 dst0, dst1; 5673 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5674 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5675 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5676 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5677 v16u8 const20 = (v16u8) __msa_ldi_b(20); 5678 v16u8 const6 = (v16u8) __msa_ldi_b(6); 5679 v16u8 const3 = (v16u8) __msa_ldi_b(3); 5680 5681 LD_UB2(src, src_stride, inp0, inp1); 5682 src += (2 * src_stride); 5683 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5684 const20, const6, const3); 5685 LD_UB2(src, src_stride, inp2, inp3); 5686 src += (2 * src_stride); 5687 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5688 5689 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5690 horiz0 = __msa_aver_u_b(inp0, res0); 5691 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5692 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, 
mask3, 5693 const20, const6, const3); 5694 LD_UB2(src, src_stride, inp0, inp1); 5695 src += (2 * src_stride); 5696 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5697 5698 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5699 horiz2 = __msa_aver_u_b(inp2, res1); 5700 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5701 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5702 const20, const6, const3); 5703 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5704 5705 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5706 horiz4 = __msa_aver_u_b(inp0, res0); 5707 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5708 LD_UB2(dst, dst_stride, dst0, dst1); 5709 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 5710 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1, 5711 horiz2, horiz3, horiz4, horiz1, horiz0, 5712 horiz0, horiz1, horiz2, horiz3, horiz4, 5713 horiz5, const20, const6, const3); 5714 res0 = __msa_aver_u_b(avg0, res0); 5715 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5716 res0 = __msa_aver_u_b(avg0, res0); 5717 ST_D2(res0, 0, 1, dst, dst_stride); 5718 dst += (2 * dst_stride); 5719 5720 LD_UB2(src, src_stride, inp2, inp3); 5721 src += (2 * src_stride); 5722 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5723 const20, const6, const3); 5724 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5725 5726 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5727 horiz6 = __msa_aver_u_b(inp2, res1); 5728 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5729 LD_UB2(dst, dst_stride, dst0, dst1); 5730 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 5731 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3, 5732 horiz4, horiz5, horiz6, horiz3, horiz2, 5733 horiz1, horiz0, horiz4, horiz5, horiz6, 5734 horiz7, const20, const6, const3); 5735 res1 = __msa_aver_u_b(avg1, res1); 5736 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5737 res1 = __msa_aver_u_b(avg1, res1); 5738 ST_D2(res1, 0, 1, dst, dst_stride); 5739 dst += (2 * dst_stride); 5740 5741 inp0 = LD_UB(src); 5742 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5743 const20, const6, const3); 5744 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 5745 horiz8 = __msa_aver_u_b(inp0, res0); 5746 LD_UB2(dst, dst_stride, dst0, dst1); 5747 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 5748 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5, 5749 horiz6, horiz7, horiz8, horiz5, horiz4, 5750 horiz3, horiz2, horiz6, horiz7, horiz8, 5751 horiz8, const20, const6, const3); 5752 res0 = __msa_aver_u_b(avg0, res0); 5753 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5754 res0 = __msa_aver_u_b(avg0, res0); 5755 ST_D2(res0, 0, 1, dst, dst_stride); 5756 dst += (2 * dst_stride); 5757 5758 LD_UB2(dst, dst_stride, dst0, dst1); 5759 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 5760 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, 5761 horiz8, horiz8, horiz7, horiz7, horiz6, 5762 horiz5, horiz4, horiz8, horiz8, horiz7, 5763 horiz6, const20, const6, const3); 5764 res1 = __msa_aver_u_b(avg1, res1); 5765 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5766 res1 = __msa_aver_u_b(avg1, res1); 5767 ST_D2(res1, 0, 1, dst, dst_stride); 5768} 5769 5770static void copy_8x8_msa(const uint8_t *src, int32_t src_stride, 5771 uint8_t *dst, int32_t dst_stride) 5772{ 
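    /* Straight 8x8 block copy: two rows per iteration using 64-bit
     * loads (LD) and stores (SD), no vector registers involved. */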
    uint64_t src0, src1;
    int32_t loop_cnt;

    for (loop_cnt = 4; loop_cnt--;) {
        src0 = LD(src);
        src += src_stride;
        src1 = LD(src);
        src += src_stride;

        SD(src0, dst);
        dst += dst_stride;
        SD(src1, dst);
        dst += dst_stride;
    }
}

static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    LD_UB8(src, src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
           dst, dst_stride);
}

static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_16x16_msa(src, stride, dest, stride);
}

void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_8x8_msa(src, stride, dest, stride);
}

void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
                                 ptrdiff_t stride)
{
    horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t
                                  stride)
{
    horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
                                        const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
}

void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width8_msa(src, stride, dest, stride, 8);
}

void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width16_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
                                          const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
}


void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
6003{ 6004 vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride); 6005} 6006 6007void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src, 6008 ptrdiff_t stride) 6009{ 6010 vert_mc_qpel_8x8_msa(src, stride, dest, stride); 6011} 6012 6013void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src, 6014 ptrdiff_t stride) 6015{ 6016 vert_mc_qpel_16x16_msa(src, stride, dest, stride); 6017} 6018 6019void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest, 6020 const uint8_t *src, ptrdiff_t stride) 6021{ 6022 vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride); 6023} 6024 6025void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest, 6026 const uint8_t *src, ptrdiff_t stride) 6027{ 6028 vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride); 6029} 6030 6031void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest, 6032 const uint8_t *src, 6033 ptrdiff_t stride) 6034{ 6035 vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride); 6036} 6037 6038void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest, 6039 const uint8_t *src, 6040 ptrdiff_t stride) 6041{ 6042 vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride); 6043} 6044 6045void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest, 6046 const uint8_t *src, ptrdiff_t stride) 6047{ 6048 vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride); 6049} 6050 6051void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest, 6052 const uint8_t *src, ptrdiff_t stride) 6053{ 6054 vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride); 6055} 6056 6057void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest, 6058 const uint8_t *src, 6059 ptrdiff_t stride) 6060{ 6061 vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride); 6062} 6063 6064void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest, 6065 const uint8_t *src, 6066 ptrdiff_t stride) 6067{ 6068 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride); 6069} 6070 6071void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest, 6072 const uint8_t *src, 6073 ptrdiff_t stride) 6074{ 6075 vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride); 6076} 6077 6078void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest, 6079 const uint8_t *src, 6080 ptrdiff_t stride) 6081{ 6082 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride); 6083} 6084 6085void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest, 6086 const uint8_t *src, ptrdiff_t stride) 6087{ 6088 vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride); 6089} 6090 6091void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest, 6092 const uint8_t *src, ptrdiff_t stride) 6093{ 6094 vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride); 6095} 6096 6097void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest, 6098 const uint8_t *src, 6099 ptrdiff_t stride) 6100{ 6101 vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride); 6102} 6103 6104void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest, 6105 const uint8_t *src, 6106 ptrdiff_t stride) 6107{ 6108 vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride); 6109} 6110 6111/* HV cases */ 6112void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest, 6113 const uint8_t *src, 6114 ptrdiff_t stride) 6115{ 6116 hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride); 6117} 6118 6119void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest, 6120 const uint8_t *src, ptrdiff_t stride) 6121{ 6122 hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride); 6123} 6124 6125void 
ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest, 6126 const uint8_t *src, ptrdiff_t stride) 6127{ 6128 hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride); 6129} 6130 6131void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest, 6132 const uint8_t *src, ptrdiff_t stride) 6133{ 6134 hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride); 6135} 6136 6137void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest, 6138 const uint8_t *src, 6139 ptrdiff_t stride) 6140{ 6141 hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride); 6142} 6143 6144void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest, 6145 const uint8_t *src, ptrdiff_t stride) 6146{ 6147 hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride); 6148} 6149 6150void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest, 6151 const uint8_t *src, ptrdiff_t stride) 6152{ 6153 hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride); 6154} 6155 6156void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest, 6157 const uint8_t *src, ptrdiff_t stride) 6158{ 6159 hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride); 6160} 6161 6162void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src, 6163 ptrdiff_t stride) 6164{ 6165 hv_mc_qpel_16x16_msa(src, stride, dest, stride); 6166} 6167 6168void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src, 6169 ptrdiff_t stride) 6170{ 6171 hv_mc_qpel_8x8_msa(src, stride, dest, stride); 6172} 6173 6174void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest, 6175 const uint8_t *src, ptrdiff_t stride) 6176{ 6177 hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride); 6178} 6179 6180void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest, 6181 const uint8_t *src, ptrdiff_t stride) 6182{ 6183 hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride); 6184} 6185 6186void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest, 6187 const uint8_t *src, 6188 ptrdiff_t stride) 6189{ 6190 hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride); 6191} 6192 6193void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest, 6194 const uint8_t *src, ptrdiff_t stride) 6195{ 6196 hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride); 6197} 6198 6199void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest, 6200 const uint8_t *src, ptrdiff_t stride) 6201{ 6202 hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride); 6203} 6204 6205void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest, 6206 const uint8_t *src, ptrdiff_t stride) 6207{ 6208 hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride); 6209} 6210 6211void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest, 6212 const uint8_t *src, 6213 ptrdiff_t stride) 6214{ 6215 hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride); 6216} 6217 6218void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest, 6219 const uint8_t *src, ptrdiff_t stride) 6220{ 6221 hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride); 6222} 6223 6224void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest, 6225 const uint8_t *src, 6226 ptrdiff_t stride) 6227{ 6228 hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride); 6229} 6230 6231void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest, 6232 const uint8_t *src, 6233 ptrdiff_t stride) 6234{ 6235 hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride); 6236} 6237 6238void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest, 6239 const uint8_t *src, 6240 ptrdiff_t stride) 6241{ 6242 hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride); 6243} 6244 
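/* Like the wrappers above, the remaining ff_* entry points only adapt the
 * qpeldsp calling convention (dest, src, single stride) to the internal
 * helpers, which take separate source and destination strides. */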
void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
                                     const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
                                   const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
                                    const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}