1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "qpeldsp_mips.h" 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \ 25cabdff1aSopenharmony_ci( { \ 26cabdff1aSopenharmony_ci v16u8 out, tmp0, tmp1; \ 27cabdff1aSopenharmony_ci v16u8 data0, data1, data2, data3, data4, data5; \ 28cabdff1aSopenharmony_ci v8i16 res_r, res_l; \ 29cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 30cabdff1aSopenharmony_ci v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ 31cabdff1aSopenharmony_ci \ 32cabdff1aSopenharmony_ci VSHF_B2_UB(inp0, inp0, inp1, inp1, 
mask, mask, tmp0, tmp1); \ 33cabdff1aSopenharmony_ci ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \ 34cabdff1aSopenharmony_ci data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \ 35cabdff1aSopenharmony_ci data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \ 36cabdff1aSopenharmony_ci HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \ 37cabdff1aSopenharmony_ci ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \ 38cabdff1aSopenharmony_ci data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \ 39cabdff1aSopenharmony_ci data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \ 40cabdff1aSopenharmony_ci sum0_r *= (v8u16) (coef0); \ 41cabdff1aSopenharmony_ci sum0_l *= (v8u16) (coef0); \ 42cabdff1aSopenharmony_ci ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \ 43cabdff1aSopenharmony_ci data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \ 44cabdff1aSopenharmony_ci data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \ 45cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ 46cabdff1aSopenharmony_ci ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \ 47cabdff1aSopenharmony_ci HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ 48cabdff1aSopenharmony_ci DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ 49cabdff1aSopenharmony_ci res_r = (v8i16) (sum0_r - sum3_r); \ 50cabdff1aSopenharmony_ci res_l = (v8i16) (sum0_l - sum3_l); \ 51cabdff1aSopenharmony_ci SRARI_H2_SH(res_r, res_l, 5); \ 52cabdff1aSopenharmony_ci CLIP_SH2_0_255(res_r, res_l); \ 53cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ 54cabdff1aSopenharmony_ci \ 55cabdff1aSopenharmony_ci out; \ 56cabdff1aSopenharmony_ci} ) 57cabdff1aSopenharmony_ci 58cabdff1aSopenharmony_ci#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \ 59cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, \ 60cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 61cabdff1aSopenharmony_ci( { \ 62cabdff1aSopenharmony_ci v16u8 out; \ 
63cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 64cabdff1aSopenharmony_ci v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \ 65cabdff1aSopenharmony_ci v8i16 res0_r, res1_r; \ 66cabdff1aSopenharmony_ci \ 67cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \ 68cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \ 69cabdff1aSopenharmony_ci HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \ 70cabdff1aSopenharmony_ci DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \ 71cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \ 72cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \ 73cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \ 74cabdff1aSopenharmony_ci DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \ 75cabdff1aSopenharmony_ci res0_r = (v8i16) (sum0_r - sum3_r); \ 76cabdff1aSopenharmony_ci res1_r = (v8i16) (sum4_r - sum7_r); \ 77cabdff1aSopenharmony_ci SRARI_H2_SH(res0_r, res1_r, 5); \ 78cabdff1aSopenharmony_ci CLIP_SH2_0_255(res0_r, res1_r); \ 79cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \ 80cabdff1aSopenharmony_ci \ 81cabdff1aSopenharmony_ci out; \ 82cabdff1aSopenharmony_ci} ) 83cabdff1aSopenharmony_ci 84cabdff1aSopenharmony_ci#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \ 85cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, \ 86cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 87cabdff1aSopenharmony_ci( { \ 88cabdff1aSopenharmony_ci v16u8 out; \ 89cabdff1aSopenharmony_ci v8i16 res0_r; \ 90cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 91cabdff1aSopenharmony_ci \ 92cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \ 93cabdff1aSopenharmony_ci sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \ 94cabdff1aSopenharmony_ci sum0_r = __msa_dotp_u_h((v16u8) 
sum0_r, (v16u8) coef0); \ 95cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \ 96cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \ 97cabdff1aSopenharmony_ci res0_r = (v8i16) (sum0_r - sum3_r); \ 98cabdff1aSopenharmony_ci res0_r = __msa_srari_h(res0_r, 5); \ 99cabdff1aSopenharmony_ci CLIP_SH_0_255(res0_r); \ 100cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ 101cabdff1aSopenharmony_ci \ 102cabdff1aSopenharmony_ci out; \ 103cabdff1aSopenharmony_ci} ) 104cabdff1aSopenharmony_ci 105cabdff1aSopenharmony_ci#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \ 106cabdff1aSopenharmony_ci mask2, mask3, coef0, \ 107cabdff1aSopenharmony_ci coef1, coef2) \ 108cabdff1aSopenharmony_ci( { \ 109cabdff1aSopenharmony_ci v16u8 out; \ 110cabdff1aSopenharmony_ci v8i16 res0_r; \ 111cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 112cabdff1aSopenharmony_ci \ 113cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \ 114cabdff1aSopenharmony_ci sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \ 115cabdff1aSopenharmony_ci sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \ 116cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \ 117cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \ 118cabdff1aSopenharmony_ci res0_r = (v8i16) (sum0_r - sum3_r); \ 119cabdff1aSopenharmony_ci res0_r += 15; \ 120cabdff1aSopenharmony_ci res0_r >>= 5; \ 121cabdff1aSopenharmony_ci CLIP_SH_0_255(res0_r); \ 122cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ 123cabdff1aSopenharmony_ci \ 124cabdff1aSopenharmony_ci out; \ 125cabdff1aSopenharmony_ci} ) 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \ 128cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 
129cabdff1aSopenharmony_ci( { \ 130cabdff1aSopenharmony_ci v16u8 out, tmp0, tmp1; \ 131cabdff1aSopenharmony_ci v16u8 data0, data1, data2, data3, data4, data5; \ 132cabdff1aSopenharmony_ci v8i16 res_r, res_l; \ 133cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 134cabdff1aSopenharmony_ci v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ 135cabdff1aSopenharmony_ci \ 136cabdff1aSopenharmony_ci VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \ 137cabdff1aSopenharmony_ci ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \ 138cabdff1aSopenharmony_ci data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \ 139cabdff1aSopenharmony_ci data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \ 140cabdff1aSopenharmony_ci HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \ 141cabdff1aSopenharmony_ci ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \ 142cabdff1aSopenharmony_ci data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \ 143cabdff1aSopenharmony_ci data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \ 144cabdff1aSopenharmony_ci sum0_r *= (v8u16) (coef0); \ 145cabdff1aSopenharmony_ci sum0_l *= (v8u16) (coef0); \ 146cabdff1aSopenharmony_ci ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \ 147cabdff1aSopenharmony_ci data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \ 148cabdff1aSopenharmony_ci data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \ 149cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ 150cabdff1aSopenharmony_ci ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \ 151cabdff1aSopenharmony_ci HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ 152cabdff1aSopenharmony_ci DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ 153cabdff1aSopenharmony_ci res_r = (v8i16) (sum0_r - sum3_r); \ 154cabdff1aSopenharmony_ci res_l = (v8i16) (sum0_l - sum3_l); \ 155cabdff1aSopenharmony_ci res_r += 15; \ 156cabdff1aSopenharmony_ci res_l += 15; \ 157cabdff1aSopenharmony_ci res_r >>= 5; \ 
158cabdff1aSopenharmony_ci res_l >>= 5; \ 159cabdff1aSopenharmony_ci CLIP_SH2_0_255(res_r, res_l); \ 160cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ 161cabdff1aSopenharmony_ci \ 162cabdff1aSopenharmony_ci out; \ 163cabdff1aSopenharmony_ci} ) 164cabdff1aSopenharmony_ci 165cabdff1aSopenharmony_ci#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \ 166cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, \ 167cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 168cabdff1aSopenharmony_ci( { \ 169cabdff1aSopenharmony_ci v16u8 out; \ 170cabdff1aSopenharmony_ci v8i16 res0_r, res1_r; \ 171cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 172cabdff1aSopenharmony_ci v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \ 173cabdff1aSopenharmony_ci \ 174cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \ 175cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \ 176cabdff1aSopenharmony_ci HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \ 177cabdff1aSopenharmony_ci DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \ 178cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \ 179cabdff1aSopenharmony_ci VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \ 180cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \ 181cabdff1aSopenharmony_ci DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \ 182cabdff1aSopenharmony_ci res0_r = (v8i16) (sum0_r - sum3_r); \ 183cabdff1aSopenharmony_ci res1_r = (v8i16) (sum4_r - sum7_r); \ 184cabdff1aSopenharmony_ci res0_r += 15; \ 185cabdff1aSopenharmony_ci res1_r += 15; \ 186cabdff1aSopenharmony_ci res0_r >>= 5; \ 187cabdff1aSopenharmony_ci res1_r >>= 5; \ 188cabdff1aSopenharmony_ci CLIP_SH2_0_255(res0_r, res1_r); \ 189cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \ 190cabdff1aSopenharmony_ci \ 
191cabdff1aSopenharmony_ci out; \ 192cabdff1aSopenharmony_ci} ) 193cabdff1aSopenharmony_ci 194cabdff1aSopenharmony_ci#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \ 195cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, \ 196cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 197cabdff1aSopenharmony_ci( { \ 198cabdff1aSopenharmony_ci v16u8 res; \ 199cabdff1aSopenharmony_ci v8i16 res_r, res_l; \ 200cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 201cabdff1aSopenharmony_ci v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ 202cabdff1aSopenharmony_ci \ 203cabdff1aSopenharmony_ci ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \ 204cabdff1aSopenharmony_ci ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \ 205cabdff1aSopenharmony_ci DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \ 206cabdff1aSopenharmony_ci HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ 207cabdff1aSopenharmony_ci ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \ 208cabdff1aSopenharmony_ci ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \ 209cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ 210cabdff1aSopenharmony_ci DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ 211cabdff1aSopenharmony_ci res_r = (v8i16) (sum0_r - sum3_r); \ 212cabdff1aSopenharmony_ci res_l = (v8i16) (sum0_l - sum3_l); \ 213cabdff1aSopenharmony_ci SRARI_H2_SH(res_r, res_l, 5); \ 214cabdff1aSopenharmony_ci CLIP_SH2_0_255(res_r, res_l); \ 215cabdff1aSopenharmony_ci res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ 216cabdff1aSopenharmony_ci \ 217cabdff1aSopenharmony_ci res; \ 218cabdff1aSopenharmony_ci} ) 219cabdff1aSopenharmony_ci 220cabdff1aSopenharmony_ci#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \ 221cabdff1aSopenharmony_ci inp04, inp05, inp06, inp07, \ 222cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, \ 223cabdff1aSopenharmony_ci inp14, inp15, inp16, inp17, \ 224cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 225cabdff1aSopenharmony_ci( { \ 
226cabdff1aSopenharmony_ci v16u8 res; \ 227cabdff1aSopenharmony_ci v8i16 val0, val1; \ 228cabdff1aSopenharmony_ci v8u16 sum00, sum01, sum02, sum03; \ 229cabdff1aSopenharmony_ci v8u16 sum10, sum11, sum12, sum13; \ 230cabdff1aSopenharmony_ci \ 231cabdff1aSopenharmony_ci ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \ 232cabdff1aSopenharmony_ci sum00, sum10, sum03, sum13); \ 233cabdff1aSopenharmony_ci DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \ 234cabdff1aSopenharmony_ci HADD_UB2_UH(sum03, sum13, sum03, sum13); \ 235cabdff1aSopenharmony_ci ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \ 236cabdff1aSopenharmony_ci sum02, sum12, sum01, sum11); \ 237cabdff1aSopenharmony_ci DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \ 238cabdff1aSopenharmony_ci DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \ 239cabdff1aSopenharmony_ci val0 = (v8i16) (sum00 - sum03); \ 240cabdff1aSopenharmony_ci val1 = (v8i16) (sum10 - sum13); \ 241cabdff1aSopenharmony_ci SRARI_H2_SH(val0, val1, 5); \ 242cabdff1aSopenharmony_ci CLIP_SH2_0_255(val0, val1); \ 243cabdff1aSopenharmony_ci res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \ 244cabdff1aSopenharmony_ci \ 245cabdff1aSopenharmony_ci res; \ 246cabdff1aSopenharmony_ci} ) 247cabdff1aSopenharmony_ci 248cabdff1aSopenharmony_ci#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \ 249cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, \ 250cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 251cabdff1aSopenharmony_ci( { \ 252cabdff1aSopenharmony_ci v16u8 res; \ 253cabdff1aSopenharmony_ci v8i16 res_r, res_l; \ 254cabdff1aSopenharmony_ci v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \ 255cabdff1aSopenharmony_ci v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \ 256cabdff1aSopenharmony_ci \ 257cabdff1aSopenharmony_ci ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \ 258cabdff1aSopenharmony_ci ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \ 259cabdff1aSopenharmony_ci DOTP_UB2_UH(sum0_r, sum0_l, coef0, 
coef0, sum0_r, sum0_l); \ 260cabdff1aSopenharmony_ci HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \ 261cabdff1aSopenharmony_ci ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \ 262cabdff1aSopenharmony_ci ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \ 263cabdff1aSopenharmony_ci DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \ 264cabdff1aSopenharmony_ci DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \ 265cabdff1aSopenharmony_ci res_r = (v8i16) (sum0_r - sum3_r); \ 266cabdff1aSopenharmony_ci res_l = (v8i16) (sum0_l - sum3_l); \ 267cabdff1aSopenharmony_ci res_r += 15; \ 268cabdff1aSopenharmony_ci res_l += 15; \ 269cabdff1aSopenharmony_ci res_r >>= 5; \ 270cabdff1aSopenharmony_ci res_l >>= 5; \ 271cabdff1aSopenharmony_ci CLIP_SH2_0_255(res_r, res_l); \ 272cabdff1aSopenharmony_ci res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \ 273cabdff1aSopenharmony_ci \ 274cabdff1aSopenharmony_ci res; \ 275cabdff1aSopenharmony_ci} ) 276cabdff1aSopenharmony_ci 277cabdff1aSopenharmony_ci#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \ 278cabdff1aSopenharmony_ci inp04, inp05, inp06, inp07, \ 279cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, \ 280cabdff1aSopenharmony_ci inp14, inp15, inp16, inp17, \ 281cabdff1aSopenharmony_ci coef0, coef1, coef2) \ 282cabdff1aSopenharmony_ci( { \ 283cabdff1aSopenharmony_ci v16u8 res; \ 284cabdff1aSopenharmony_ci v8i16 val0, val1; \ 285cabdff1aSopenharmony_ci v8u16 sum00, sum01, sum02, sum03; \ 286cabdff1aSopenharmony_ci v8u16 sum10, sum11, sum12, sum13; \ 287cabdff1aSopenharmony_ci \ 288cabdff1aSopenharmony_ci ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \ 289cabdff1aSopenharmony_ci sum00, sum10, sum03, sum13); \ 290cabdff1aSopenharmony_ci DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \ 291cabdff1aSopenharmony_ci HADD_UB2_UH(sum03, sum13, sum03, sum13); \ 292cabdff1aSopenharmony_ci ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \ 
293cabdff1aSopenharmony_ci sum02, sum12, sum01, sum11); \ 294cabdff1aSopenharmony_ci DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \ 295cabdff1aSopenharmony_ci DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \ 296cabdff1aSopenharmony_ci val0 = (v8i16) (sum00 - sum03); \ 297cabdff1aSopenharmony_ci val1 = (v8i16) (sum10 - sum13); \ 298cabdff1aSopenharmony_ci val0 += 15; \ 299cabdff1aSopenharmony_ci val1 += 15; \ 300cabdff1aSopenharmony_ci val0 >>= 5; \ 301cabdff1aSopenharmony_ci val1 >>= 5; \ 302cabdff1aSopenharmony_ci CLIP_SH2_0_255(val0, val1); \ 303cabdff1aSopenharmony_ci res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \ 304cabdff1aSopenharmony_ci \ 305cabdff1aSopenharmony_ci res; \ 306cabdff1aSopenharmony_ci} ) 307cabdff1aSopenharmony_ci 308cabdff1aSopenharmony_cistatic void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src, 309cabdff1aSopenharmony_ci int32_t src_stride, 310cabdff1aSopenharmony_ci uint8_t *dst, 311cabdff1aSopenharmony_ci int32_t dst_stride, 312cabdff1aSopenharmony_ci int32_t height) 313cabdff1aSopenharmony_ci{ 314cabdff1aSopenharmony_ci uint8_t loop_count; 315cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 316cabdff1aSopenharmony_ci v16u8 res0, res1; 317cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 318cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 319cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 320cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 321cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 322cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 323cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 324cabdff1aSopenharmony_ci 325cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 326cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 
327cabdff1aSopenharmony_ci src += (4 * src_stride); 328cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 329cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 330cabdff1aSopenharmony_ci const20, const6, const3); 331cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 332cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 333cabdff1aSopenharmony_ci const20, const6, const3); 334cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 335cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 336cabdff1aSopenharmony_ci AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); 337cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 338cabdff1aSopenharmony_ci dst += (4 * dst_stride); 339cabdff1aSopenharmony_ci } 340cabdff1aSopenharmony_ci} 341cabdff1aSopenharmony_ci 342cabdff1aSopenharmony_cistatic void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src, 343cabdff1aSopenharmony_ci int32_t src_stride, 344cabdff1aSopenharmony_ci uint8_t *dst, 345cabdff1aSopenharmony_ci int32_t dst_stride, 346cabdff1aSopenharmony_ci int32_t height) 347cabdff1aSopenharmony_ci{ 348cabdff1aSopenharmony_ci uint8_t loop_count; 349cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 350cabdff1aSopenharmony_ci v16u8 res; 351cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 352cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 353cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 354cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 355cabdff1aSopenharmony_ci 356cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 357cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 358cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 359cabdff1aSopenharmony_ci src += (4 * src_stride); 360cabdff1aSopenharmony_ci res = 
APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 361cabdff1aSopenharmony_ci const20, const6, const3); 362cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp0, res); 363cabdff1aSopenharmony_ci ST_UB(res, dst); 364cabdff1aSopenharmony_ci dst += dst_stride; 365cabdff1aSopenharmony_ci 366cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 367cabdff1aSopenharmony_ci const20, const6, const3); 368cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp2, res); 369cabdff1aSopenharmony_ci ST_UB(res, dst); 370cabdff1aSopenharmony_ci dst += dst_stride; 371cabdff1aSopenharmony_ci 372cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 373cabdff1aSopenharmony_ci const20, const6, const3); 374cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp4, res); 375cabdff1aSopenharmony_ci ST_UB(res, dst); 376cabdff1aSopenharmony_ci dst += dst_stride; 377cabdff1aSopenharmony_ci 378cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 379cabdff1aSopenharmony_ci const20, const6, const3); 380cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp6, res); 381cabdff1aSopenharmony_ci ST_UB(res, dst); 382cabdff1aSopenharmony_ci dst += dst_stride; 383cabdff1aSopenharmony_ci } 384cabdff1aSopenharmony_ci} 385cabdff1aSopenharmony_ci 386cabdff1aSopenharmony_cistatic void horiz_mc_qpel_8width_msa(const uint8_t *src, 387cabdff1aSopenharmony_ci int32_t src_stride, 388cabdff1aSopenharmony_ci uint8_t *dst, 389cabdff1aSopenharmony_ci int32_t dst_stride, 390cabdff1aSopenharmony_ci int32_t height) 391cabdff1aSopenharmony_ci{ 392cabdff1aSopenharmony_ci uint8_t loop_count; 393cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 394cabdff1aSopenharmony_ci v16u8 res0, res1; 395cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 396cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 397cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 398cabdff1aSopenharmony_ci v16u8 mask3 
= { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 399cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 400cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 401cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 402cabdff1aSopenharmony_ci 403cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 404cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 405cabdff1aSopenharmony_ci src += (4 * src_stride); 406cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 407cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 408cabdff1aSopenharmony_ci const20, const6, const3); 409cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 410cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 411cabdff1aSopenharmony_ci const20, const6, const3); 412cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 413cabdff1aSopenharmony_ci dst += (4 * dst_stride); 414cabdff1aSopenharmony_ci } 415cabdff1aSopenharmony_ci} 416cabdff1aSopenharmony_ci 417cabdff1aSopenharmony_cistatic void horiz_mc_qpel_16width_msa(const uint8_t *src, 418cabdff1aSopenharmony_ci int32_t src_stride, 419cabdff1aSopenharmony_ci uint8_t *dst, 420cabdff1aSopenharmony_ci int32_t dst_stride, 421cabdff1aSopenharmony_ci int32_t height) 422cabdff1aSopenharmony_ci{ 423cabdff1aSopenharmony_ci uint8_t loop_count; 424cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 425cabdff1aSopenharmony_ci v16u8 res; 426cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 427cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 428cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 429cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 430cabdff1aSopenharmony_ci 431cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 432cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 
433cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 434cabdff1aSopenharmony_ci src += (4 * src_stride); 435cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 436cabdff1aSopenharmony_ci const20, const6, const3); 437cabdff1aSopenharmony_ci ST_UB(res, dst); 438cabdff1aSopenharmony_ci dst += dst_stride; 439cabdff1aSopenharmony_ci 440cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 441cabdff1aSopenharmony_ci const20, const6, const3); 442cabdff1aSopenharmony_ci ST_UB(res, dst); 443cabdff1aSopenharmony_ci dst += dst_stride; 444cabdff1aSopenharmony_ci 445cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 446cabdff1aSopenharmony_ci const20, const6, const3); 447cabdff1aSopenharmony_ci ST_UB(res, dst); 448cabdff1aSopenharmony_ci dst += dst_stride; 449cabdff1aSopenharmony_ci 450cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 451cabdff1aSopenharmony_ci const20, const6, const3); 452cabdff1aSopenharmony_ci ST_UB(res, dst); 453cabdff1aSopenharmony_ci dst += dst_stride; 454cabdff1aSopenharmony_ci } 455cabdff1aSopenharmony_ci} 456cabdff1aSopenharmony_ci 457cabdff1aSopenharmony_cistatic void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src, 458cabdff1aSopenharmony_ci int32_t src_stride, 459cabdff1aSopenharmony_ci uint8_t *dst, 460cabdff1aSopenharmony_ci int32_t dst_stride, 461cabdff1aSopenharmony_ci int32_t height) 462cabdff1aSopenharmony_ci{ 463cabdff1aSopenharmony_ci uint8_t loop_count; 464cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 465cabdff1aSopenharmony_ci v16u8 res0, res1; 466cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 467cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 468cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 469cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 
470cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 471cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 472cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 473cabdff1aSopenharmony_ci 474cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 475cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 476cabdff1aSopenharmony_ci src += (4 * src_stride); 477cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 478cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 479cabdff1aSopenharmony_ci const20, const6, const3); 480cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 481cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 482cabdff1aSopenharmony_ci const20, const6, const3); 483cabdff1aSopenharmony_ci SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1, 484cabdff1aSopenharmony_ci inp0, inp1, inp2, inp3); 485cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 486cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 487cabdff1aSopenharmony_ci AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); 488cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 489cabdff1aSopenharmony_ci dst += (4 * dst_stride); 490cabdff1aSopenharmony_ci } 491cabdff1aSopenharmony_ci} 492cabdff1aSopenharmony_ci 493cabdff1aSopenharmony_cistatic void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src, 494cabdff1aSopenharmony_ci int32_t src_stride, 495cabdff1aSopenharmony_ci uint8_t *dst, 496cabdff1aSopenharmony_ci int32_t dst_stride, 497cabdff1aSopenharmony_ci int32_t height) 498cabdff1aSopenharmony_ci{ 499cabdff1aSopenharmony_ci uint8_t loop_count; 500cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 501cabdff1aSopenharmony_ci v16u8 res; 502cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 503cabdff1aSopenharmony_ci 
v8u16 const20 = (v8u16) __msa_ldi_h(20); 504cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 505cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 508cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 509cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 510cabdff1aSopenharmony_ci src += (4 * src_stride); 511cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 512cabdff1aSopenharmony_ci const20, const6, const3); 513cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp1); 514cabdff1aSopenharmony_ci ST_UB(res, dst); 515cabdff1aSopenharmony_ci dst += dst_stride; 516cabdff1aSopenharmony_ci 517cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 518cabdff1aSopenharmony_ci const20, const6, const3); 519cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp3); 520cabdff1aSopenharmony_ci ST_UB(res, dst); 521cabdff1aSopenharmony_ci dst += dst_stride; 522cabdff1aSopenharmony_ci 523cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 524cabdff1aSopenharmony_ci const20, const6, const3); 525cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp5); 526cabdff1aSopenharmony_ci ST_UB(res, dst); 527cabdff1aSopenharmony_ci dst += dst_stride; 528cabdff1aSopenharmony_ci 529cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 530cabdff1aSopenharmony_ci const20, const6, const3); 531cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp7); 532cabdff1aSopenharmony_ci ST_UB(res, dst); 533cabdff1aSopenharmony_ci dst += dst_stride; 534cabdff1aSopenharmony_ci } 535cabdff1aSopenharmony_ci} 536cabdff1aSopenharmony_ci 537cabdff1aSopenharmony_cistatic void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src, 538cabdff1aSopenharmony_ci int32_t src_stride, 539cabdff1aSopenharmony_ci uint8_t *dst, 540cabdff1aSopenharmony_ci int32_t dst_stride, 
541cabdff1aSopenharmony_ci int32_t height) 542cabdff1aSopenharmony_ci{ 543cabdff1aSopenharmony_ci uint8_t loop_count; 544cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 545cabdff1aSopenharmony_ci v16u8 res0, res1; 546cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 547cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 548cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 549cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 550cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 551cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 552cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 553cabdff1aSopenharmony_ci 554cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 555cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 556cabdff1aSopenharmony_ci src += (4 * src_stride); 557cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 558cabdff1aSopenharmony_ci mask2, mask3, const20, 559cabdff1aSopenharmony_ci const6, const3); 560cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 561cabdff1aSopenharmony_ci mask2, mask3, const20, 562cabdff1aSopenharmony_ci const6, const3); 563cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 564cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 565cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(inp0, res0); 566cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(inp2, res1); 567cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 568cabdff1aSopenharmony_ci dst += (4 * dst_stride); 569cabdff1aSopenharmony_ci } 570cabdff1aSopenharmony_ci} 571cabdff1aSopenharmony_ci 572cabdff1aSopenharmony_cistatic void 
horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src, 573cabdff1aSopenharmony_ci int32_t src_stride, 574cabdff1aSopenharmony_ci uint8_t *dst, 575cabdff1aSopenharmony_ci int32_t dst_stride, 576cabdff1aSopenharmony_ci int32_t height) 577cabdff1aSopenharmony_ci{ 578cabdff1aSopenharmony_ci uint8_t loop_count; 579cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 580cabdff1aSopenharmony_ci v16u8 res; 581cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 582cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 583cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 584cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 585cabdff1aSopenharmony_ci 586cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 587cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 588cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 589cabdff1aSopenharmony_ci src += (4 * src_stride); 590cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 591cabdff1aSopenharmony_ci const20, const6, const3); 592cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp0, res); 593cabdff1aSopenharmony_ci ST_UB(res, dst); 594cabdff1aSopenharmony_ci dst += dst_stride; 595cabdff1aSopenharmony_ci 596cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 597cabdff1aSopenharmony_ci const20, const6, const3); 598cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp2, res); 599cabdff1aSopenharmony_ci ST_UB(res, dst); 600cabdff1aSopenharmony_ci dst += dst_stride; 601cabdff1aSopenharmony_ci 602cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 603cabdff1aSopenharmony_ci const20, const6, const3); 604cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp4, res); 605cabdff1aSopenharmony_ci ST_UB(res, dst); 606cabdff1aSopenharmony_ci dst += dst_stride; 607cabdff1aSopenharmony_ci 
608cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 609cabdff1aSopenharmony_ci const20, const6, const3); 610cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp6, res); 611cabdff1aSopenharmony_ci ST_UB(res, dst); 612cabdff1aSopenharmony_ci dst += dst_stride; 613cabdff1aSopenharmony_ci } 614cabdff1aSopenharmony_ci} 615cabdff1aSopenharmony_ci 616cabdff1aSopenharmony_cistatic void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src, 617cabdff1aSopenharmony_ci int32_t src_stride, 618cabdff1aSopenharmony_ci uint8_t *dst, 619cabdff1aSopenharmony_ci int32_t dst_stride, 620cabdff1aSopenharmony_ci int32_t height) 621cabdff1aSopenharmony_ci{ 622cabdff1aSopenharmony_ci uint8_t loop_count; 623cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 624cabdff1aSopenharmony_ci v16u8 res0, res1; 625cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 626cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 627cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 628cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 629cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 630cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 631cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 632cabdff1aSopenharmony_ci 633cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 634cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 635cabdff1aSopenharmony_ci src += (4 * src_stride); 636cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 637cabdff1aSopenharmony_ci mask2, mask3, const20, 638cabdff1aSopenharmony_ci const6, const3); 639cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 640cabdff1aSopenharmony_ci mask2, mask3, const20, 641cabdff1aSopenharmony_ci const6, 
const3); 642cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 643cabdff1aSopenharmony_ci dst += (4 * dst_stride); 644cabdff1aSopenharmony_ci } 645cabdff1aSopenharmony_ci} 646cabdff1aSopenharmony_ci 647cabdff1aSopenharmony_cistatic void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src, 648cabdff1aSopenharmony_ci int32_t src_stride, 649cabdff1aSopenharmony_ci uint8_t *dst, 650cabdff1aSopenharmony_ci int32_t dst_stride, 651cabdff1aSopenharmony_ci int32_t height) 652cabdff1aSopenharmony_ci{ 653cabdff1aSopenharmony_ci uint8_t loop_count; 654cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 655cabdff1aSopenharmony_ci v16u8 res; 656cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 657cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 658cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 659cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 660cabdff1aSopenharmony_ci 661cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 662cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 663cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 664cabdff1aSopenharmony_ci src += (4 * src_stride); 665cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 666cabdff1aSopenharmony_ci const20, const6, const3); 667cabdff1aSopenharmony_ci ST_UB(res, dst); 668cabdff1aSopenharmony_ci dst += dst_stride; 669cabdff1aSopenharmony_ci 670cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 671cabdff1aSopenharmony_ci const20, const6, const3); 672cabdff1aSopenharmony_ci ST_UB(res, dst); 673cabdff1aSopenharmony_ci dst += dst_stride; 674cabdff1aSopenharmony_ci 675cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 676cabdff1aSopenharmony_ci const20, const6, const3); 677cabdff1aSopenharmony_ci ST_UB(res, dst); 
678cabdff1aSopenharmony_ci dst += dst_stride; 679cabdff1aSopenharmony_ci 680cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 681cabdff1aSopenharmony_ci const20, const6, const3); 682cabdff1aSopenharmony_ci ST_UB(res, dst); 683cabdff1aSopenharmony_ci dst += dst_stride; 684cabdff1aSopenharmony_ci } 685cabdff1aSopenharmony_ci} 686cabdff1aSopenharmony_ci 687cabdff1aSopenharmony_cistatic void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src, 688cabdff1aSopenharmony_ci int32_t src_stride, 689cabdff1aSopenharmony_ci uint8_t *dst, 690cabdff1aSopenharmony_ci int32_t dst_stride, 691cabdff1aSopenharmony_ci int32_t height) 692cabdff1aSopenharmony_ci{ 693cabdff1aSopenharmony_ci uint8_t loop_count; 694cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 695cabdff1aSopenharmony_ci v16u8 res0, res1; 696cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 697cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 698cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 699cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 700cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 701cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 702cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 703cabdff1aSopenharmony_ci 704cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 705cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 706cabdff1aSopenharmony_ci src += (4 * src_stride); 707cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 708cabdff1aSopenharmony_ci mask2, mask3, const20, 709cabdff1aSopenharmony_ci const6, const3); 710cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 711cabdff1aSopenharmony_ci mask2, mask3, const20, 
712cabdff1aSopenharmony_ci const6, const3); 713cabdff1aSopenharmony_ci SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1, 714cabdff1aSopenharmony_ci inp0, inp1, inp2, inp3); 715cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 716cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 717cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(inp0, res0); 718cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(inp2, res1); 719cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 720cabdff1aSopenharmony_ci dst += (4 * dst_stride); 721cabdff1aSopenharmony_ci } 722cabdff1aSopenharmony_ci} 723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_cistatic void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src, 725cabdff1aSopenharmony_ci int32_t src_stride, 726cabdff1aSopenharmony_ci uint8_t *dst, 727cabdff1aSopenharmony_ci int32_t dst_stride, 728cabdff1aSopenharmony_ci int32_t height) 729cabdff1aSopenharmony_ci{ 730cabdff1aSopenharmony_ci uint8_t loop_count; 731cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 732cabdff1aSopenharmony_ci v16u8 res; 733cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 734cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 735cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 736cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 737cabdff1aSopenharmony_ci 738cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 739cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 740cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 741cabdff1aSopenharmony_ci src += (4 * src_stride); 742cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 743cabdff1aSopenharmony_ci const20, const6, const3); 744cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp1); 
745cabdff1aSopenharmony_ci ST_UB(res, dst); 746cabdff1aSopenharmony_ci dst += dst_stride; 747cabdff1aSopenharmony_ci 748cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 749cabdff1aSopenharmony_ci const20, const6, const3); 750cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp3); 751cabdff1aSopenharmony_ci ST_UB(res, dst); 752cabdff1aSopenharmony_ci dst += dst_stride; 753cabdff1aSopenharmony_ci 754cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 755cabdff1aSopenharmony_ci const20, const6, const3); 756cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp5); 757cabdff1aSopenharmony_ci ST_UB(res, dst); 758cabdff1aSopenharmony_ci dst += dst_stride; 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 761cabdff1aSopenharmony_ci const20, const6, const3); 762cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp7); 763cabdff1aSopenharmony_ci ST_UB(res, dst); 764cabdff1aSopenharmony_ci dst += dst_stride; 765cabdff1aSopenharmony_ci } 766cabdff1aSopenharmony_ci} 767cabdff1aSopenharmony_ci 768cabdff1aSopenharmony_cistatic void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src, 769cabdff1aSopenharmony_ci int32_t src_stride, 770cabdff1aSopenharmony_ci uint8_t *dst, 771cabdff1aSopenharmony_ci int32_t dst_stride, 772cabdff1aSopenharmony_ci int32_t height) 773cabdff1aSopenharmony_ci{ 774cabdff1aSopenharmony_ci uint8_t loop_count; 775cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 776cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 777cabdff1aSopenharmony_ci v16u8 res0, res1; 778cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 779cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 780cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 781cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 
782cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 783cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 784cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 787cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 788cabdff1aSopenharmony_ci src += (4 * src_stride); 789cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 790cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 791cabdff1aSopenharmony_ci const20, const6, const3); 792cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 793cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 794cabdff1aSopenharmony_ci const20, const6, const3); 795cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 796cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 797cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 798cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 799cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 800cabdff1aSopenharmony_ci AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); 801cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 802cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 803cabdff1aSopenharmony_ci dst += (4 * dst_stride); 804cabdff1aSopenharmony_ci } 805cabdff1aSopenharmony_ci} 806cabdff1aSopenharmony_ci 807cabdff1aSopenharmony_cistatic void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src, 808cabdff1aSopenharmony_ci int32_t src_stride, 809cabdff1aSopenharmony_ci uint8_t *dst, 810cabdff1aSopenharmony_ci int32_t dst_stride, 811cabdff1aSopenharmony_ci int32_t height) 812cabdff1aSopenharmony_ci{ 813cabdff1aSopenharmony_ci uint8_t loop_count; 814cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, 
inp3, inp4, inp5, inp6, inp7; 815cabdff1aSopenharmony_ci v16u8 res0, res1; 816cabdff1aSopenharmony_ci v16u8 dst0, dst1; 817cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 818cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 819cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 820cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 821cabdff1aSopenharmony_ci 822cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 823cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 824cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 825cabdff1aSopenharmony_ci src += (4 * src_stride); 826cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 827cabdff1aSopenharmony_ci const20, const6, const3); 828cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 829cabdff1aSopenharmony_ci const20, const6, const3); 830cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 831cabdff1aSopenharmony_ci AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); 832cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); 833cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 834cabdff1aSopenharmony_ci dst += (2 * dst_stride); 835cabdff1aSopenharmony_ci 836cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 837cabdff1aSopenharmony_ci const20, const6, const3); 838cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 839cabdff1aSopenharmony_ci const20, const6, const3); 840cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 841cabdff1aSopenharmony_ci AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1); 842cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); 843cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 844cabdff1aSopenharmony_ci dst += (2 * dst_stride); 845cabdff1aSopenharmony_ci } 846cabdff1aSopenharmony_ci} 
847cabdff1aSopenharmony_ci 848cabdff1aSopenharmony_cistatic void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src, 849cabdff1aSopenharmony_ci int32_t src_stride, 850cabdff1aSopenharmony_ci uint8_t *dst, 851cabdff1aSopenharmony_ci int32_t dst_stride, 852cabdff1aSopenharmony_ci int32_t height) 853cabdff1aSopenharmony_ci{ 854cabdff1aSopenharmony_ci uint8_t loop_count; 855cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 856cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 857cabdff1aSopenharmony_ci v16u8 res0, res1; 858cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 859cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 860cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 861cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 862cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 863cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 864cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 865cabdff1aSopenharmony_ci 866cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 867cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 868cabdff1aSopenharmony_ci src += (4 * src_stride); 869cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 870cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 871cabdff1aSopenharmony_ci const20, const6, const3); 872cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 873cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 874cabdff1aSopenharmony_ci const20, const6, const3); 875cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 876cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 877cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 878cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, 
res0, dst2, res1, res0, res1); 879cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 880cabdff1aSopenharmony_ci dst += (4 * dst_stride); 881cabdff1aSopenharmony_ci } 882cabdff1aSopenharmony_ci} 883cabdff1aSopenharmony_ci 884cabdff1aSopenharmony_cistatic void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src, 885cabdff1aSopenharmony_ci int32_t src_stride, 886cabdff1aSopenharmony_ci uint8_t *dst, 887cabdff1aSopenharmony_ci int32_t dst_stride, 888cabdff1aSopenharmony_ci int32_t height) 889cabdff1aSopenharmony_ci{ 890cabdff1aSopenharmony_ci uint8_t loop_count; 891cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 892cabdff1aSopenharmony_ci v16u8 res0, res1; 893cabdff1aSopenharmony_ci v16u8 dst0, dst1; 894cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 895cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 896cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 897cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 898cabdff1aSopenharmony_ci 899cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 900cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 901cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 902cabdff1aSopenharmony_ci src += (4 * src_stride); 903cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 904cabdff1aSopenharmony_ci const20, const6, const3); 905cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 906cabdff1aSopenharmony_ci const20, const6, const3); 907cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 908cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); 909cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 910cabdff1aSopenharmony_ci dst += (2 * dst_stride); 911cabdff1aSopenharmony_ci 912cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 
913cabdff1aSopenharmony_ci const20, const6, const3); 914cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 915cabdff1aSopenharmony_ci const20, const6, const3); 916cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 917cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); 918cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 919cabdff1aSopenharmony_ci dst += (2 * dst_stride); 920cabdff1aSopenharmony_ci } 921cabdff1aSopenharmony_ci} 922cabdff1aSopenharmony_ci 923cabdff1aSopenharmony_cistatic void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src, 924cabdff1aSopenharmony_ci int32_t src_stride, 925cabdff1aSopenharmony_ci uint8_t *dst, 926cabdff1aSopenharmony_ci int32_t dst_stride, 927cabdff1aSopenharmony_ci int32_t height) 928cabdff1aSopenharmony_ci{ 929cabdff1aSopenharmony_ci uint8_t loop_count; 930cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 931cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 932cabdff1aSopenharmony_ci v16u8 res0, res1; 933cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 934cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 935cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 936cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 937cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 938cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 939cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 940cabdff1aSopenharmony_ci 941cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 942cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 943cabdff1aSopenharmony_ci src += (4 * src_stride); 944cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 945cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 946cabdff1aSopenharmony_ci 
const20, const6, const3); 947cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 948cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 949cabdff1aSopenharmony_ci const20, const6, const3); 950cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 951cabdff1aSopenharmony_ci SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1, 952cabdff1aSopenharmony_ci inp0, inp1, inp2, inp3); 953cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 954cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 955cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 956cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 957cabdff1aSopenharmony_ci AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); 958cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 959cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 960cabdff1aSopenharmony_ci dst += (4 * dst_stride); 961cabdff1aSopenharmony_ci } 962cabdff1aSopenharmony_ci} 963cabdff1aSopenharmony_ci 964cabdff1aSopenharmony_cistatic void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src, 965cabdff1aSopenharmony_ci int32_t src_stride, 966cabdff1aSopenharmony_ci uint8_t *dst, 967cabdff1aSopenharmony_ci int32_t dst_stride, 968cabdff1aSopenharmony_ci int32_t height) 969cabdff1aSopenharmony_ci{ 970cabdff1aSopenharmony_ci uint8_t loop_count; 971cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 972cabdff1aSopenharmony_ci v16u8 res0, res1, dst0, dst1; 973cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 974cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 975cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 976cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 977cabdff1aSopenharmony_ci 978cabdff1aSopenharmony_ci for 
(loop_count = (height >> 2); loop_count--;) { 979cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 980cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 981cabdff1aSopenharmony_ci src += (4 * src_stride); 982cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 983cabdff1aSopenharmony_ci const20, const6, const3); 984cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 985cabdff1aSopenharmony_ci const20, const6, const3); 986cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 987cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1); 988cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); 989cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 990cabdff1aSopenharmony_ci dst += (2 * dst_stride); 991cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 992cabdff1aSopenharmony_ci const20, const6, const3); 993cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 994cabdff1aSopenharmony_ci const20, const6, const3); 995cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 996cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1); 997cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1); 998cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 999cabdff1aSopenharmony_ci dst += (2 * dst_stride); 1000cabdff1aSopenharmony_ci } 1001cabdff1aSopenharmony_ci} 1002cabdff1aSopenharmony_ci 1003cabdff1aSopenharmony_ci 1004cabdff1aSopenharmony_cistatic void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src, 1005cabdff1aSopenharmony_ci int32_t src_stride, 1006cabdff1aSopenharmony_ci uint8_t *dst, 1007cabdff1aSopenharmony_ci int32_t dst_stride) 1008cabdff1aSopenharmony_ci{ 1009cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1010cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, res0, res1; 1011cabdff1aSopenharmony_ci v16u8 const20 = 
(v16u8) __msa_ldi_b(20); 1012cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1013cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1014cabdff1aSopenharmony_ci 1015cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1016cabdff1aSopenharmony_ci src += (4 * src_stride); 1017cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 1018cabdff1aSopenharmony_ci src += (2 * src_stride); 1019cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1020cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1021cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 1022cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1023cabdff1aSopenharmony_ci const20, const6, const3); 1024cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 1025cabdff1aSopenharmony_ci src += (2 * src_stride); 1026cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1027cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1028cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 1029cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1030cabdff1aSopenharmony_ci const20, const6, const3); 1031cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 1032cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 1033cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 1034cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1035cabdff1aSopenharmony_ci 1036cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1037cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 1038cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1039cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 1040cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 1041cabdff1aSopenharmony_ci const20, const6, const3); 1042cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1043cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 
1044cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 1045cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 1046cabdff1aSopenharmony_ci const20, const6, const3); 1047cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); 1048cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); 1049cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 1050cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1051cabdff1aSopenharmony_ci} 1052cabdff1aSopenharmony_ci 1053cabdff1aSopenharmony_cistatic void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src, 1054cabdff1aSopenharmony_ci int32_t src_stride, 1055cabdff1aSopenharmony_ci uint8_t *dst, 1056cabdff1aSopenharmony_ci int32_t dst_stride) 1057cabdff1aSopenharmony_ci{ 1058cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1059cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 1060cabdff1aSopenharmony_ci v16u8 res0; 1061cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1062cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1063cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1064cabdff1aSopenharmony_ci 1065cabdff1aSopenharmony_ci LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 1066cabdff1aSopenharmony_ci src += (5 * src_stride); 1067cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 1068cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1069cabdff1aSopenharmony_ci const20, const6, const3); 1070cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp0); 1071cabdff1aSopenharmony_ci ST_UB(res0, dst); 1072cabdff1aSopenharmony_ci dst += dst_stride; 1073cabdff1aSopenharmony_ci 1074cabdff1aSopenharmony_ci inp5 = LD_UB(src); 1075cabdff1aSopenharmony_ci src += src_stride; 1076cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 1077cabdff1aSopenharmony_ci inp2, inp3, inp4, 
inp5, 1078cabdff1aSopenharmony_ci const20, const6, const3); 1079cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp1); 1080cabdff1aSopenharmony_ci ST_UB(res0, dst); 1081cabdff1aSopenharmony_ci dst += dst_stride; 1082cabdff1aSopenharmony_ci 1083cabdff1aSopenharmony_ci inp6 = LD_UB(src); 1084cabdff1aSopenharmony_ci src += src_stride; 1085cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 1086cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1087cabdff1aSopenharmony_ci const20, const6, const3); 1088cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp2); 1089cabdff1aSopenharmony_ci ST_UB(res0, dst); 1090cabdff1aSopenharmony_ci dst += dst_stride; 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ci inp7 = LD_UB(src); 1093cabdff1aSopenharmony_ci src += src_stride; 1094cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 1095cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1096cabdff1aSopenharmony_ci const20, const6, const3); 1097cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp3); 1098cabdff1aSopenharmony_ci ST_UB(res0, dst); 1099cabdff1aSopenharmony_ci dst += dst_stride; 1100cabdff1aSopenharmony_ci 1101cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp8, inp9); 1102cabdff1aSopenharmony_ci src += (2 * src_stride); 1103cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 1104cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1105cabdff1aSopenharmony_ci const20, const6, const3); 1106cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp4); 1107cabdff1aSopenharmony_ci ST_UB(res0, dst); 1108cabdff1aSopenharmony_ci dst += dst_stride; 1109cabdff1aSopenharmony_ci 1110cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 1111cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 1112cabdff1aSopenharmony_ci const20, const6, const3); 1113cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp5); 1114cabdff1aSopenharmony_ci ST_UB(res0, dst); 1115cabdff1aSopenharmony_ci dst += 
dst_stride; 1116cabdff1aSopenharmony_ci 1117cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp10, inp11); 1118cabdff1aSopenharmony_ci src += (2 * src_stride); 1119cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 1120cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 1121cabdff1aSopenharmony_ci const20, const6, const3); 1122cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp6); 1123cabdff1aSopenharmony_ci ST_UB(res0, dst); 1124cabdff1aSopenharmony_ci dst += dst_stride; 1125cabdff1aSopenharmony_ci 1126cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 1127cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 1128cabdff1aSopenharmony_ci const20, const6, const3); 1129cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp7); 1130cabdff1aSopenharmony_ci ST_UB(res0, dst); 1131cabdff1aSopenharmony_ci dst += dst_stride; 1132cabdff1aSopenharmony_ci 1133cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp12, inp13); 1134cabdff1aSopenharmony_ci src += (2 * src_stride); 1135cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 1136cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 1137cabdff1aSopenharmony_ci const20, const6, const3); 1138cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp8); 1139cabdff1aSopenharmony_ci ST_UB(res0, dst); 1140cabdff1aSopenharmony_ci dst += dst_stride; 1141cabdff1aSopenharmony_ci 1142cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 1143cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 1144cabdff1aSopenharmony_ci const20, const6, const3); 1145cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp9); 1146cabdff1aSopenharmony_ci ST_UB(res0, dst); 1147cabdff1aSopenharmony_ci dst += dst_stride; 1148cabdff1aSopenharmony_ci 1149cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp14, inp15); 1150cabdff1aSopenharmony_ci src += (2 * src_stride); 1151cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 
1152cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 1153cabdff1aSopenharmony_ci const20, const6, const3); 1154cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp10); 1155cabdff1aSopenharmony_ci ST_UB(res0, dst); 1156cabdff1aSopenharmony_ci dst += dst_stride; 1157cabdff1aSopenharmony_ci 1158cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 1159cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 1160cabdff1aSopenharmony_ci const20, const6, const3); 1161cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp11); 1162cabdff1aSopenharmony_ci ST_UB(res0, dst); 1163cabdff1aSopenharmony_ci dst += dst_stride; 1164cabdff1aSopenharmony_ci 1165cabdff1aSopenharmony_ci inp16 = LD_UB(src); 1166cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 1167cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 1168cabdff1aSopenharmony_ci const20, const6, const3); 1169cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp12); 1170cabdff1aSopenharmony_ci ST_UB(res0, dst); 1171cabdff1aSopenharmony_ci dst += dst_stride; 1172cabdff1aSopenharmony_ci 1173cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 1174cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 1175cabdff1aSopenharmony_ci const20, const6, const3); 1176cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp13); 1177cabdff1aSopenharmony_ci ST_UB(res0, dst); 1178cabdff1aSopenharmony_ci dst += dst_stride; 1179cabdff1aSopenharmony_ci 1180cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 1181cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 1182cabdff1aSopenharmony_ci const20, const6, const3); 1183cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp14); 1184cabdff1aSopenharmony_ci ST_UB(res0, dst); 1185cabdff1aSopenharmony_ci dst += dst_stride; 1186cabdff1aSopenharmony_ci 1187cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 1188cabdff1aSopenharmony_ci inp16, inp16, inp15, 
inp14, 1189cabdff1aSopenharmony_ci const20, const6, const3); 1190cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp15); 1191cabdff1aSopenharmony_ci ST_UB(res0, dst); 1192cabdff1aSopenharmony_ci} 1193cabdff1aSopenharmony_ci 1194cabdff1aSopenharmony_cistatic void vert_mc_qpel_8x8_msa(const uint8_t *src, 1195cabdff1aSopenharmony_ci int32_t src_stride, 1196cabdff1aSopenharmony_ci uint8_t *dst, 1197cabdff1aSopenharmony_ci int32_t dst_stride) 1198cabdff1aSopenharmony_ci{ 1199cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1200cabdff1aSopenharmony_ci v16u8 res0, res1; 1201cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1202cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1203cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1204cabdff1aSopenharmony_ci 1205cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1206cabdff1aSopenharmony_ci src += (4 * src_stride); 1207cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 1208cabdff1aSopenharmony_ci src += (2 * src_stride); 1209cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1210cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1211cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 1212cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1213cabdff1aSopenharmony_ci const20, const6, const3); 1214cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 1215cabdff1aSopenharmony_ci src += (2 * src_stride); 1216cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1217cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1218cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 1219cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1220cabdff1aSopenharmony_ci const20, const6, const3); 1221cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1222cabdff1aSopenharmony_ci 1223cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1224cabdff1aSopenharmony_ci res0 = 
APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 1225cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1226cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 1227cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 1228cabdff1aSopenharmony_ci const20, const6, const3); 1229cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1230cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 1231cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 1232cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 1233cabdff1aSopenharmony_ci const20, const6, const3); 1234cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1235cabdff1aSopenharmony_ci} 1236cabdff1aSopenharmony_ci 1237cabdff1aSopenharmony_cistatic void vert_mc_qpel_16x16_msa(const uint8_t *src, 1238cabdff1aSopenharmony_ci int32_t src_stride, 1239cabdff1aSopenharmony_ci uint8_t *dst, 1240cabdff1aSopenharmony_ci int32_t dst_stride) 1241cabdff1aSopenharmony_ci{ 1242cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1243cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 1244cabdff1aSopenharmony_ci v16u8 res0; 1245cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1246cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1247cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1248cabdff1aSopenharmony_ci 1249cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1250cabdff1aSopenharmony_ci src += (4 * src_stride); 1251cabdff1aSopenharmony_ci inp4 = LD_UB(src); 1252cabdff1aSopenharmony_ci src += src_stride; 1253cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 1254cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1255cabdff1aSopenharmony_ci const20, const6, const3); 1256cabdff1aSopenharmony_ci ST_UB(res0, dst); 1257cabdff1aSopenharmony_ci dst += dst_stride; 1258cabdff1aSopenharmony_ci 1259cabdff1aSopenharmony_ci inp5 = LD_UB(src); 
1260cabdff1aSopenharmony_ci src += src_stride; 1261cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 1262cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1263cabdff1aSopenharmony_ci const20, const6, const3); 1264cabdff1aSopenharmony_ci ST_UB(res0, dst); 1265cabdff1aSopenharmony_ci dst += dst_stride; 1266cabdff1aSopenharmony_ci 1267cabdff1aSopenharmony_ci inp6 = LD_UB(src); 1268cabdff1aSopenharmony_ci src += src_stride; 1269cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 1270cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1271cabdff1aSopenharmony_ci const20, const6, const3); 1272cabdff1aSopenharmony_ci ST_UB(res0, dst); 1273cabdff1aSopenharmony_ci dst += dst_stride; 1274cabdff1aSopenharmony_ci 1275cabdff1aSopenharmony_ci inp7 = LD_UB(src); 1276cabdff1aSopenharmony_ci src += src_stride; 1277cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 1278cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1279cabdff1aSopenharmony_ci const20, const6, const3); 1280cabdff1aSopenharmony_ci ST_UB(res0, dst); 1281cabdff1aSopenharmony_ci dst += dst_stride; 1282cabdff1aSopenharmony_ci 1283cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1284cabdff1aSopenharmony_ci src += src_stride; 1285cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 1286cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1287cabdff1aSopenharmony_ci const20, const6, const3); 1288cabdff1aSopenharmony_ci ST_UB(res0, dst); 1289cabdff1aSopenharmony_ci dst += dst_stride; 1290cabdff1aSopenharmony_ci 1291cabdff1aSopenharmony_ci inp9 = LD_UB(src); 1292cabdff1aSopenharmony_ci src += src_stride; 1293cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 1294cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 1295cabdff1aSopenharmony_ci const20, const6, const3); 1296cabdff1aSopenharmony_ci ST_UB(res0, dst); 1297cabdff1aSopenharmony_ci dst += dst_stride; 1298cabdff1aSopenharmony_ci 1299cabdff1aSopenharmony_ci 
inp10 = LD_UB(src); 1300cabdff1aSopenharmony_ci src += src_stride; 1301cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 1302cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 1303cabdff1aSopenharmony_ci const20, const6, const3); 1304cabdff1aSopenharmony_ci ST_UB(res0, dst); 1305cabdff1aSopenharmony_ci dst += dst_stride; 1306cabdff1aSopenharmony_ci 1307cabdff1aSopenharmony_ci inp11 = LD_UB(src); 1308cabdff1aSopenharmony_ci src += src_stride; 1309cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 1310cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 1311cabdff1aSopenharmony_ci const20, const6, const3); 1312cabdff1aSopenharmony_ci ST_UB(res0, dst); 1313cabdff1aSopenharmony_ci dst += dst_stride; 1314cabdff1aSopenharmony_ci 1315cabdff1aSopenharmony_ci inp12 = LD_UB(src); 1316cabdff1aSopenharmony_ci src += src_stride; 1317cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 1318cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 1319cabdff1aSopenharmony_ci const20, const6, const3); 1320cabdff1aSopenharmony_ci ST_UB(res0, dst); 1321cabdff1aSopenharmony_ci dst += dst_stride; 1322cabdff1aSopenharmony_ci 1323cabdff1aSopenharmony_ci inp13 = LD_UB(src); 1324cabdff1aSopenharmony_ci src += src_stride; 1325cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 1326cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 1327cabdff1aSopenharmony_ci const20, const6, const3); 1328cabdff1aSopenharmony_ci ST_UB(res0, dst); 1329cabdff1aSopenharmony_ci dst += dst_stride; 1330cabdff1aSopenharmony_ci 1331cabdff1aSopenharmony_ci inp14 = LD_UB(src); 1332cabdff1aSopenharmony_ci src += src_stride; 1333cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 1334cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 1335cabdff1aSopenharmony_ci const20, const6, const3); 1336cabdff1aSopenharmony_ci ST_UB(res0, dst); 1337cabdff1aSopenharmony_ci dst += dst_stride; 
1338cabdff1aSopenharmony_ci 1339cabdff1aSopenharmony_ci inp15 = LD_UB(src); 1340cabdff1aSopenharmony_ci src += src_stride; 1341cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 1342cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 1343cabdff1aSopenharmony_ci const20, const6, const3); 1344cabdff1aSopenharmony_ci ST_UB(res0, dst); 1345cabdff1aSopenharmony_ci dst += dst_stride; 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_ci inp16 = LD_UB(src); 1348cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 1349cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 1350cabdff1aSopenharmony_ci const20, const6, const3); 1351cabdff1aSopenharmony_ci ST_UB(res0, dst); 1352cabdff1aSopenharmony_ci dst += dst_stride; 1353cabdff1aSopenharmony_ci 1354cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 1355cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 1356cabdff1aSopenharmony_ci const20, const6, const3); 1357cabdff1aSopenharmony_ci ST_UB(res0, dst); 1358cabdff1aSopenharmony_ci dst += dst_stride; 1359cabdff1aSopenharmony_ci 1360cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 1361cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 1362cabdff1aSopenharmony_ci const20, const6, const3); 1363cabdff1aSopenharmony_ci ST_UB(res0, dst); 1364cabdff1aSopenharmony_ci dst += dst_stride; 1365cabdff1aSopenharmony_ci 1366cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 1367cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 1368cabdff1aSopenharmony_ci const20, const6, const3); 1369cabdff1aSopenharmony_ci ST_UB(res0, dst); 1370cabdff1aSopenharmony_ci dst += dst_stride; 1371cabdff1aSopenharmony_ci} 1372cabdff1aSopenharmony_ci 1373cabdff1aSopenharmony_cistatic void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src, 1374cabdff1aSopenharmony_ci int32_t src_stride, 1375cabdff1aSopenharmony_ci uint8_t *dst, 1376cabdff1aSopenharmony_ci int32_t 
dst_stride) 1377cabdff1aSopenharmony_ci{ 1378cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1379cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, res0, res1; 1380cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1381cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1382cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1383cabdff1aSopenharmony_ci 1384cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1385cabdff1aSopenharmony_ci src += (4 * src_stride); 1386cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 1387cabdff1aSopenharmony_ci src += (2 * src_stride); 1388cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1389cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1390cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 1391cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1392cabdff1aSopenharmony_ci const20, const6, const3); 1393cabdff1aSopenharmony_ci 1394cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 1395cabdff1aSopenharmony_ci src += (2 * src_stride); 1396cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1397cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1398cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 1399cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1400cabdff1aSopenharmony_ci const20, const6, const3); 1401cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); 1402cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); 1403cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 1404cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1405cabdff1aSopenharmony_ci 1406cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1407cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 1408cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1409cabdff1aSopenharmony_ci inp5, inp4, inp3, 
inp2, 1410cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 1411cabdff1aSopenharmony_ci const20, const6, const3); 1412cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1413cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 1414cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 1415cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 1416cabdff1aSopenharmony_ci const20, const6, const3); 1417cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); 1418cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); 1419cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 1420cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1421cabdff1aSopenharmony_ci} 1422cabdff1aSopenharmony_ci 1423cabdff1aSopenharmony_cistatic void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src, 1424cabdff1aSopenharmony_ci int32_t src_stride, 1425cabdff1aSopenharmony_ci uint8_t *dst, 1426cabdff1aSopenharmony_ci int32_t dst_stride) 1427cabdff1aSopenharmony_ci{ 1428cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1429cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 1430cabdff1aSopenharmony_ci v16u8 res0; 1431cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1432cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1433cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1434cabdff1aSopenharmony_ci 1435cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1436cabdff1aSopenharmony_ci src += (4 * src_stride); 1437cabdff1aSopenharmony_ci inp4 = LD_UB(src); 1438cabdff1aSopenharmony_ci src += src_stride; 1439cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 1440cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1441cabdff1aSopenharmony_ci const20, const6, const3); 1442cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp1); 
1443cabdff1aSopenharmony_ci ST_UB(res0, dst); 1444cabdff1aSopenharmony_ci dst += dst_stride; 1445cabdff1aSopenharmony_ci 1446cabdff1aSopenharmony_ci inp5 = LD_UB(src); 1447cabdff1aSopenharmony_ci src += src_stride; 1448cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 1449cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1450cabdff1aSopenharmony_ci const20, const6, const3); 1451cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp2); 1452cabdff1aSopenharmony_ci ST_UB(res0, dst); 1453cabdff1aSopenharmony_ci dst += dst_stride; 1454cabdff1aSopenharmony_ci 1455cabdff1aSopenharmony_ci inp6 = LD_UB(src); 1456cabdff1aSopenharmony_ci src += src_stride; 1457cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 1458cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1459cabdff1aSopenharmony_ci const20, const6, const3); 1460cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp3); 1461cabdff1aSopenharmony_ci ST_UB(res0, dst); 1462cabdff1aSopenharmony_ci dst += dst_stride; 1463cabdff1aSopenharmony_ci 1464cabdff1aSopenharmony_ci inp7 = LD_UB(src); 1465cabdff1aSopenharmony_ci src += src_stride; 1466cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 1467cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1468cabdff1aSopenharmony_ci const20, const6, const3); 1469cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp4); 1470cabdff1aSopenharmony_ci ST_UB(res0, dst); 1471cabdff1aSopenharmony_ci dst += dst_stride; 1472cabdff1aSopenharmony_ci 1473cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1474cabdff1aSopenharmony_ci src += src_stride; 1475cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 1476cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1477cabdff1aSopenharmony_ci const20, const6, const3); 1478cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp5); 1479cabdff1aSopenharmony_ci ST_UB(res0, dst); 1480cabdff1aSopenharmony_ci dst += dst_stride; 1481cabdff1aSopenharmony_ci 
1482cabdff1aSopenharmony_ci inp9 = LD_UB(src); 1483cabdff1aSopenharmony_ci src += src_stride; 1484cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 1485cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 1486cabdff1aSopenharmony_ci const20, const6, const3); 1487cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp6); 1488cabdff1aSopenharmony_ci ST_UB(res0, dst); 1489cabdff1aSopenharmony_ci dst += dst_stride; 1490cabdff1aSopenharmony_ci 1491cabdff1aSopenharmony_ci inp10 = LD_UB(src); 1492cabdff1aSopenharmony_ci src += src_stride; 1493cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 1494cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 1495cabdff1aSopenharmony_ci const20, const6, const3); 1496cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp7); 1497cabdff1aSopenharmony_ci ST_UB(res0, dst); 1498cabdff1aSopenharmony_ci dst += dst_stride; 1499cabdff1aSopenharmony_ci 1500cabdff1aSopenharmony_ci inp11 = LD_UB(src); 1501cabdff1aSopenharmony_ci src += src_stride; 1502cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 1503cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 1504cabdff1aSopenharmony_ci const20, const6, const3); 1505cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp8); 1506cabdff1aSopenharmony_ci ST_UB(res0, dst); 1507cabdff1aSopenharmony_ci dst += dst_stride; 1508cabdff1aSopenharmony_ci 1509cabdff1aSopenharmony_ci inp12 = LD_UB(src); 1510cabdff1aSopenharmony_ci src += src_stride; 1511cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 1512cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 1513cabdff1aSopenharmony_ci const20, const6, const3); 1514cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp9); 1515cabdff1aSopenharmony_ci ST_UB(res0, dst); 1516cabdff1aSopenharmony_ci dst += dst_stride; 1517cabdff1aSopenharmony_ci 1518cabdff1aSopenharmony_ci inp13 = LD_UB(src); 1519cabdff1aSopenharmony_ci src += src_stride; 1520cabdff1aSopenharmony_ci 
res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 1521cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 1522cabdff1aSopenharmony_ci const20, const6, const3); 1523cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp10); 1524cabdff1aSopenharmony_ci ST_UB(res0, dst); 1525cabdff1aSopenharmony_ci dst += dst_stride; 1526cabdff1aSopenharmony_ci 1527cabdff1aSopenharmony_ci inp14 = LD_UB(src); 1528cabdff1aSopenharmony_ci src += src_stride; 1529cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 1530cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 1531cabdff1aSopenharmony_ci const20, const6, const3); 1532cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp11); 1533cabdff1aSopenharmony_ci ST_UB(res0, dst); 1534cabdff1aSopenharmony_ci dst += dst_stride; 1535cabdff1aSopenharmony_ci 1536cabdff1aSopenharmony_ci inp15 = LD_UB(src); 1537cabdff1aSopenharmony_ci src += src_stride; 1538cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 1539cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 1540cabdff1aSopenharmony_ci const20, const6, const3); 1541cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp12); 1542cabdff1aSopenharmony_ci ST_UB(res0, dst); 1543cabdff1aSopenharmony_ci dst += dst_stride; 1544cabdff1aSopenharmony_ci 1545cabdff1aSopenharmony_ci inp16 = LD_UB(src); 1546cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 1547cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 1548cabdff1aSopenharmony_ci const20, const6, const3); 1549cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp13); 1550cabdff1aSopenharmony_ci ST_UB(res0, dst); 1551cabdff1aSopenharmony_ci dst += dst_stride; 1552cabdff1aSopenharmony_ci 1553cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 1554cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 1555cabdff1aSopenharmony_ci const20, const6, const3); 1556cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp14); 
1557cabdff1aSopenharmony_ci ST_UB(res0, dst); 1558cabdff1aSopenharmony_ci dst += dst_stride; 1559cabdff1aSopenharmony_ci 1560cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 1561cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 1562cabdff1aSopenharmony_ci const20, const6, const3); 1563cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp15); 1564cabdff1aSopenharmony_ci ST_UB(res0, dst); 1565cabdff1aSopenharmony_ci dst += dst_stride; 1566cabdff1aSopenharmony_ci 1567cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 1568cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 1569cabdff1aSopenharmony_ci const20, const6, const3); 1570cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(res0, inp16); 1571cabdff1aSopenharmony_ci ST_UB(res0, dst); 1572cabdff1aSopenharmony_ci} 1573cabdff1aSopenharmony_ci 1574cabdff1aSopenharmony_cistatic void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src, 1575cabdff1aSopenharmony_ci int32_t src_stride, 1576cabdff1aSopenharmony_ci uint8_t *dst, 1577cabdff1aSopenharmony_ci int32_t dst_stride) 1578cabdff1aSopenharmony_ci{ 1579cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1580cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, res0, res1; 1581cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1582cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1583cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1584cabdff1aSopenharmony_ci 1585cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1586cabdff1aSopenharmony_ci src += (4 * src_stride); 1587cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 1588cabdff1aSopenharmony_ci src += (2 * src_stride); 1589cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1590cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1591cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 1592cabdff1aSopenharmony_ci inp2, inp3, 
inp4, inp5, 1593cabdff1aSopenharmony_ci const20, const6, const3); 1594cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 1595cabdff1aSopenharmony_ci src += (2 * src_stride); 1596cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1597cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1598cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 1599cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1600cabdff1aSopenharmony_ci const20, const6, const3); 1601cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 1602cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 1603cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, tmp0); 1604cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(res1, tmp1); 1605cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1606cabdff1aSopenharmony_ci 1607cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1608cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, 1609cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1610cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 1611cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 1612cabdff1aSopenharmony_ci const20, const6, const3); 1613cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1614cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 1615cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 1616cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 1617cabdff1aSopenharmony_ci const20, const6, const3); 1618cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); 1619cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); 1620cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, tmp0); 1621cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(res1, tmp1); 1622cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1623cabdff1aSopenharmony_ci} 
1624cabdff1aSopenharmony_ci 1625cabdff1aSopenharmony_cistatic void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src, 1626cabdff1aSopenharmony_ci int32_t src_stride, 1627cabdff1aSopenharmony_ci uint8_t *dst, 1628cabdff1aSopenharmony_ci int32_t dst_stride) 1629cabdff1aSopenharmony_ci{ 1630cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1631cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 1632cabdff1aSopenharmony_ci v16u8 res0; 1633cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1634cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1635cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1636cabdff1aSopenharmony_ci 1637cabdff1aSopenharmony_ci LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 1638cabdff1aSopenharmony_ci src += (5 * src_stride); 1639cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, 1640cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1641cabdff1aSopenharmony_ci const20, const6, const3); 1642cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp0); 1643cabdff1aSopenharmony_ci ST_UB(res0, dst); 1644cabdff1aSopenharmony_ci dst += dst_stride; 1645cabdff1aSopenharmony_ci 1646cabdff1aSopenharmony_ci inp5 = LD_UB(src); 1647cabdff1aSopenharmony_ci src += src_stride; 1648cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, 1649cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1650cabdff1aSopenharmony_ci const20, const6, const3); 1651cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp1); 1652cabdff1aSopenharmony_ci ST_UB(res0, dst); 1653cabdff1aSopenharmony_ci dst += dst_stride; 1654cabdff1aSopenharmony_ci 1655cabdff1aSopenharmony_ci inp6 = LD_UB(src); 1656cabdff1aSopenharmony_ci src += src_stride; 1657cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, 1658cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1659cabdff1aSopenharmony_ci 
const20, const6, const3); 1660cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp2); 1661cabdff1aSopenharmony_ci ST_UB(res0, dst); 1662cabdff1aSopenharmony_ci dst += dst_stride; 1663cabdff1aSopenharmony_ci 1664cabdff1aSopenharmony_ci inp7 = LD_UB(src); 1665cabdff1aSopenharmony_ci src += src_stride; 1666cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, 1667cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1668cabdff1aSopenharmony_ci const20, const6, const3); 1669cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp3); 1670cabdff1aSopenharmony_ci ST_UB(res0, dst); 1671cabdff1aSopenharmony_ci dst += dst_stride; 1672cabdff1aSopenharmony_ci 1673cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1674cabdff1aSopenharmony_ci src += src_stride; 1675cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, 1676cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1677cabdff1aSopenharmony_ci const20, const6, const3); 1678cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp4); 1679cabdff1aSopenharmony_ci ST_UB(res0, dst); 1680cabdff1aSopenharmony_ci dst += dst_stride; 1681cabdff1aSopenharmony_ci 1682cabdff1aSopenharmony_ci inp9 = LD_UB(src); 1683cabdff1aSopenharmony_ci src += src_stride; 1684cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, 1685cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 1686cabdff1aSopenharmony_ci const20, const6, const3); 1687cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp5); 1688cabdff1aSopenharmony_ci ST_UB(res0, dst); 1689cabdff1aSopenharmony_ci dst += dst_stride; 1690cabdff1aSopenharmony_ci 1691cabdff1aSopenharmony_ci inp10 = LD_UB(src); 1692cabdff1aSopenharmony_ci src += src_stride; 1693cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, 1694cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 1695cabdff1aSopenharmony_ci const20, const6, const3); 1696cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp6); 
1697cabdff1aSopenharmony_ci ST_UB(res0, dst); 1698cabdff1aSopenharmony_ci dst += dst_stride; 1699cabdff1aSopenharmony_ci 1700cabdff1aSopenharmony_ci inp11 = LD_UB(src); 1701cabdff1aSopenharmony_ci src += src_stride; 1702cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, 1703cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 1704cabdff1aSopenharmony_ci const20, const6, const3); 1705cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp7); 1706cabdff1aSopenharmony_ci ST_UB(res0, dst); 1707cabdff1aSopenharmony_ci dst += dst_stride; 1708cabdff1aSopenharmony_ci 1709cabdff1aSopenharmony_ci inp12 = LD_UB(src); 1710cabdff1aSopenharmony_ci src += src_stride; 1711cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, 1712cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 1713cabdff1aSopenharmony_ci const20, const6, const3); 1714cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp8); 1715cabdff1aSopenharmony_ci ST_UB(res0, dst); 1716cabdff1aSopenharmony_ci dst += dst_stride; 1717cabdff1aSopenharmony_ci 1718cabdff1aSopenharmony_ci inp13 = LD_UB(src); 1719cabdff1aSopenharmony_ci src += src_stride; 1720cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, 1721cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 1722cabdff1aSopenharmony_ci const20, const6, const3); 1723cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp9); 1724cabdff1aSopenharmony_ci ST_UB(res0, dst); 1725cabdff1aSopenharmony_ci dst += dst_stride; 1726cabdff1aSopenharmony_ci 1727cabdff1aSopenharmony_ci inp14 = LD_UB(src); 1728cabdff1aSopenharmony_ci src += src_stride; 1729cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, 1730cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 1731cabdff1aSopenharmony_ci const20, const6, const3); 1732cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp10); 1733cabdff1aSopenharmony_ci ST_UB(res0, dst); 1734cabdff1aSopenharmony_ci dst += 
dst_stride; 1735cabdff1aSopenharmony_ci 1736cabdff1aSopenharmony_ci inp15 = LD_UB(src); 1737cabdff1aSopenharmony_ci src += src_stride; 1738cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, 1739cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 1740cabdff1aSopenharmony_ci const20, const6, const3); 1741cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp11); 1742cabdff1aSopenharmony_ci ST_UB(res0, dst); 1743cabdff1aSopenharmony_ci dst += dst_stride; 1744cabdff1aSopenharmony_ci 1745cabdff1aSopenharmony_ci inp16 = LD_UB(src); 1746cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, 1747cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 1748cabdff1aSopenharmony_ci const20, const6, const3); 1749cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp12); 1750cabdff1aSopenharmony_ci ST_UB(res0, dst); 1751cabdff1aSopenharmony_ci dst += dst_stride; 1752cabdff1aSopenharmony_ci 1753cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, 1754cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 1755cabdff1aSopenharmony_ci const20, const6, const3); 1756cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp13); 1757cabdff1aSopenharmony_ci ST_UB(res0, dst); 1758cabdff1aSopenharmony_ci dst += dst_stride; 1759cabdff1aSopenharmony_ci 1760cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, 1761cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 1762cabdff1aSopenharmony_ci const20, const6, const3); 1763cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp14); 1764cabdff1aSopenharmony_ci ST_UB(res0, dst); 1765cabdff1aSopenharmony_ci dst += dst_stride; 1766cabdff1aSopenharmony_ci 1767cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, 1768cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 1769cabdff1aSopenharmony_ci const20, const6, const3); 1770cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, 
inp15); 1771cabdff1aSopenharmony_ci ST_UB(res0, dst); 1772cabdff1aSopenharmony_ci dst += dst_stride; 1773cabdff1aSopenharmony_ci} 1774cabdff1aSopenharmony_ci /* vert_mc_qpel_no_rnd_8x8_msa: vertical qpel filter (no-round macro variant) for an 8x8 block. Loads nine source rows inp0..inp8 spaced src_stride bytes apart, calls APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE four times with the 20/6/3 coefficient vectors, and writes the results with two ST_D4 calls (at dst and at dst + 4 * dst_stride). Each macro call receives the tap rows for two output rows; taps outside the loaded range are supplied by repeating inp0 at the top and inp8/inp7/inp6 at the bottom. NOTE(review): the filter macro and ST_D4 are defined outside this view - confirm their exact arithmetic and store layout there. */ 1775cabdff1aSopenharmony_cistatic void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, 1776cabdff1aSopenharmony_ci int32_t src_stride, 1777cabdff1aSopenharmony_ci uint8_t *dst, 1778cabdff1aSopenharmony_ci int32_t dst_stride) 1779cabdff1aSopenharmony_ci{ 1780cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1781cabdff1aSopenharmony_ci v16u8 res0, res1; 1782cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1783cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1784cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1785cabdff1aSopenharmony_ci 1786cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1787cabdff1aSopenharmony_ci src += (4 * src_stride); 1788cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 1789cabdff1aSopenharmony_ci src += (2 * src_stride); 1790cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1791cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1792cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 1793cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1794cabdff1aSopenharmony_ci const20, const6, const3); 1795cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 1796cabdff1aSopenharmony_ci src += (2 * src_stride); 1797cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1798cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1799cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 1800cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1801cabdff1aSopenharmony_ci const20, const6, const3); 1802cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1803cabdff1aSopenharmony_ci 1804cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1805cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, 
inp1, 1806cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1807cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 1808cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 1809cabdff1aSopenharmony_ci const20, const6, const3); 1810cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1811cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 1812cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 1813cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 1814cabdff1aSopenharmony_ci const20, const6, const3); 1815cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1816cabdff1aSopenharmony_ci} 1817cabdff1aSopenharmony_ci /* vert_mc_qpel_no_rnd_16x16_msa: vertical qpel filter (no-round macro variant) for a 16x16 block. Source rows inp0..inp16 are loaded as they are needed; each of the 16 output rows is one APPLY_VERT_QPEL_NO_ROUND_FILTER call with the 20/6/3 coefficient vectors followed by ST_UB to dst, with dst advancing by dst_stride between rows. The first rows repeat inp0 and the last rows repeat inp16/inp15/inp14 for taps outside the loaded range. NOTE(review): the filter macro is defined outside this view - confirm its arithmetic there. */ 1818cabdff1aSopenharmony_cistatic void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, 1819cabdff1aSopenharmony_ci int32_t src_stride, 1820cabdff1aSopenharmony_ci uint8_t *dst, 1821cabdff1aSopenharmony_ci int32_t dst_stride) 1822cabdff1aSopenharmony_ci{ 1823cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1824cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 1825cabdff1aSopenharmony_ci v16u8 res0; 1826cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1827cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1828cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1829cabdff1aSopenharmony_ci 1830cabdff1aSopenharmony_ci LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 1831cabdff1aSopenharmony_ci src += (5 * src_stride); 1832cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, 1833cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1834cabdff1aSopenharmony_ci const20, const6, const3); 1835cabdff1aSopenharmony_ci ST_UB(res0, dst); 1836cabdff1aSopenharmony_ci dst += dst_stride; 1837cabdff1aSopenharmony_ci 1838cabdff1aSopenharmony_ci inp5 = LD_UB(src); 1839cabdff1aSopenharmony_ci src += src_stride; 1840cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, 
inp0, inp0, inp1, 1841cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1842cabdff1aSopenharmony_ci const20, const6, const3); 1843cabdff1aSopenharmony_ci ST_UB(res0, dst); 1844cabdff1aSopenharmony_ci dst += dst_stride; 1845cabdff1aSopenharmony_ci 1846cabdff1aSopenharmony_ci inp6 = LD_UB(src); 1847cabdff1aSopenharmony_ci src += src_stride; 1848cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, 1849cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1850cabdff1aSopenharmony_ci const20, const6, const3); 1851cabdff1aSopenharmony_ci ST_UB(res0, dst); 1852cabdff1aSopenharmony_ci dst += dst_stride; 1853cabdff1aSopenharmony_ci 1854cabdff1aSopenharmony_ci inp7 = LD_UB(src); 1855cabdff1aSopenharmony_ci src += src_stride; 1856cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, 1857cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1858cabdff1aSopenharmony_ci const20, const6, const3); 1859cabdff1aSopenharmony_ci ST_UB(res0, dst); 1860cabdff1aSopenharmony_ci dst += dst_stride; 1861cabdff1aSopenharmony_ci 1862cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1863cabdff1aSopenharmony_ci src += src_stride; 1864cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, 1865cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 1866cabdff1aSopenharmony_ci const20, const6, const3); 1867cabdff1aSopenharmony_ci ST_UB(res0, dst); 1868cabdff1aSopenharmony_ci dst += dst_stride; 1869cabdff1aSopenharmony_ci 1870cabdff1aSopenharmony_ci inp9 = LD_UB(src); 1871cabdff1aSopenharmony_ci src += src_stride; 1872cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, 1873cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 1874cabdff1aSopenharmony_ci const20, const6, const3); 1875cabdff1aSopenharmony_ci ST_UB(res0, dst); 1876cabdff1aSopenharmony_ci dst += dst_stride; 1877cabdff1aSopenharmony_ci 1878cabdff1aSopenharmony_ci inp10 = LD_UB(src); 1879cabdff1aSopenharmony_ci src += src_stride; 
1880cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, 1881cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 1882cabdff1aSopenharmony_ci const20, const6, const3); 1883cabdff1aSopenharmony_ci ST_UB(res0, dst); 1884cabdff1aSopenharmony_ci dst += dst_stride; 1885cabdff1aSopenharmony_ci 1886cabdff1aSopenharmony_ci inp11 = LD_UB(src); 1887cabdff1aSopenharmony_ci src += src_stride; 1888cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, 1889cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 1890cabdff1aSopenharmony_ci const20, const6, const3); 1891cabdff1aSopenharmony_ci ST_UB(res0, dst); 1892cabdff1aSopenharmony_ci dst += dst_stride; 1893cabdff1aSopenharmony_ci 1894cabdff1aSopenharmony_ci inp12 = LD_UB(src); 1895cabdff1aSopenharmony_ci src += src_stride; 1896cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, 1897cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 1898cabdff1aSopenharmony_ci const20, const6, const3); 1899cabdff1aSopenharmony_ci ST_UB(res0, dst); 1900cabdff1aSopenharmony_ci dst += dst_stride; 1901cabdff1aSopenharmony_ci 1902cabdff1aSopenharmony_ci inp13 = LD_UB(src); 1903cabdff1aSopenharmony_ci src += src_stride; 1904cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, 1905cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 1906cabdff1aSopenharmony_ci const20, const6, const3); 1907cabdff1aSopenharmony_ci ST_UB(res0, dst); 1908cabdff1aSopenharmony_ci dst += dst_stride; 1909cabdff1aSopenharmony_ci 1910cabdff1aSopenharmony_ci inp14 = LD_UB(src); 1911cabdff1aSopenharmony_ci src += src_stride; 1912cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, 1913cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 1914cabdff1aSopenharmony_ci const20, const6, const3); 1915cabdff1aSopenharmony_ci ST_UB(res0, dst); 1916cabdff1aSopenharmony_ci dst += dst_stride; 1917cabdff1aSopenharmony_ci 
1918cabdff1aSopenharmony_ci inp15 = LD_UB(src); 1919cabdff1aSopenharmony_ci src += src_stride; 1920cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, 1921cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 1922cabdff1aSopenharmony_ci const20, const6, const3); 1923cabdff1aSopenharmony_ci ST_UB(res0, dst); 1924cabdff1aSopenharmony_ci dst += dst_stride; 1925cabdff1aSopenharmony_ci 1926cabdff1aSopenharmony_ci inp16 = LD_UB(src); 1927cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, 1928cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 1929cabdff1aSopenharmony_ci const20, const6, const3); 1930cabdff1aSopenharmony_ci ST_UB(res0, dst); 1931cabdff1aSopenharmony_ci dst += dst_stride; 1932cabdff1aSopenharmony_ci 1933cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, 1934cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 1935cabdff1aSopenharmony_ci const20, const6, const3); 1936cabdff1aSopenharmony_ci ST_UB(res0, dst); 1937cabdff1aSopenharmony_ci dst += dst_stride; 1938cabdff1aSopenharmony_ci 1939cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, 1940cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 1941cabdff1aSopenharmony_ci const20, const6, const3); 1942cabdff1aSopenharmony_ci ST_UB(res0, dst); 1943cabdff1aSopenharmony_ci dst += dst_stride; 1944cabdff1aSopenharmony_ci 1945cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, 1946cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 1947cabdff1aSopenharmony_ci const20, const6, const3); 1948cabdff1aSopenharmony_ci ST_UB(res0, dst); 1949cabdff1aSopenharmony_ci} 1950cabdff1aSopenharmony_ci /* vert_mc_qpel_no_rnd_aver_src1_8x8_msa: 8x8 vertical qpel filter (no-round macro variant) whose output is averaged with source rows. Nine rows inp0..inp8 are loaded; four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls produce res0/res1 for the upper and the lower half of the block. Before each ST_D4 the results are combined via __msa_ave_u_b with tmp0/tmp1, which __msa_insve_d builds from the row pairs (inp1, inp2), (inp3, inp4), (inp5, inp6), (inp7, inp8). NOTE(review): presumably output row i is averaged with source row i + 1, as the src1 suffix suggests - confirm against the filter macro's output packing, which is defined outside this view. */ 1951cabdff1aSopenharmony_cistatic void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src, 1952cabdff1aSopenharmony_ci int32_t src_stride, 1953cabdff1aSopenharmony_ci uint8_t *dst, 1954cabdff1aSopenharmony_ci int32_t dst_stride) 
1955cabdff1aSopenharmony_ci{ 1956cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 1957cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, res0, res1; 1958cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 1959cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 1960cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 1961cabdff1aSopenharmony_ci 1962cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 1963cabdff1aSopenharmony_ci src += (4 * src_stride); 1964cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 1965cabdff1aSopenharmony_ci src += (2 * src_stride); 1966cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, 1967cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 1968cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 1969cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 1970cabdff1aSopenharmony_ci const20, const6, const3); 1971cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 1972cabdff1aSopenharmony_ci src += (2 * src_stride); 1973cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, 1974cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 1975cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 1976cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 1977cabdff1aSopenharmony_ci const20, const6, const3); 1978cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); 1979cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); 1980cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, tmp0); 1981cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(res1, tmp1); 1982cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1983cabdff1aSopenharmony_ci 1984cabdff1aSopenharmony_ci inp8 = LD_UB(src); 1985cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, 1986cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 
1987cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 1988cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 1989cabdff1aSopenharmony_ci const20, const6, const3); 1990cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, 1991cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 1992cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 1993cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 1994cabdff1aSopenharmony_ci const20, const6, const3); 1995cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); 1996cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); 1997cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, tmp0); 1998cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(res1, tmp1); 1999cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 2000cabdff1aSopenharmony_ci} 2001cabdff1aSopenharmony_ci /* vert_mc_qpel_no_rnd_aver_src1_16x16_msa: 16x16 vertical qpel filter (no-round macro variant) averaged with the next source row. For output row i (0..15) the APPLY_VERT_QPEL_NO_ROUND_FILTER result is combined with source row inp(i + 1) through __msa_ave_u_b and stored with ST_UB; the pairing runs from inp1 for the first row to inp16 for the last. Taps outside inp0..inp16 are supplied by repeating inp0 at the top and inp16/inp15/inp14 at the bottom. NOTE(review): the filter macro is defined outside this view - confirm its arithmetic there. */ 2002cabdff1aSopenharmony_cistatic void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src, 2003cabdff1aSopenharmony_ci int32_t src_stride, 2004cabdff1aSopenharmony_ci uint8_t *dst, 2005cabdff1aSopenharmony_ci int32_t dst_stride) 2006cabdff1aSopenharmony_ci{ 2007cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2008cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2009cabdff1aSopenharmony_ci v16u8 res0; 2010cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2011cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2012cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2013cabdff1aSopenharmony_ci 2014cabdff1aSopenharmony_ci LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2015cabdff1aSopenharmony_ci src += (5 * src_stride); 2016cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, 2017cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 2018cabdff1aSopenharmony_ci const20, const6, const3); 2019cabdff1aSopenharmony_ci res0 = 
__msa_ave_u_b(res0, inp1); 2020cabdff1aSopenharmony_ci ST_UB(res0, dst); 2021cabdff1aSopenharmony_ci dst += dst_stride; 2022cabdff1aSopenharmony_ci 2023cabdff1aSopenharmony_ci inp5 = LD_UB(src); 2024cabdff1aSopenharmony_ci src += src_stride; 2025cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, 2026cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 2027cabdff1aSopenharmony_ci const20, const6, const3); 2028cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp2); 2029cabdff1aSopenharmony_ci ST_UB(res0, dst); 2030cabdff1aSopenharmony_ci dst += dst_stride; 2031cabdff1aSopenharmony_ci 2032cabdff1aSopenharmony_ci inp6 = LD_UB(src); 2033cabdff1aSopenharmony_ci src += src_stride; 2034cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, 2035cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 2036cabdff1aSopenharmony_ci const20, const6, const3); 2037cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp3); 2038cabdff1aSopenharmony_ci ST_UB(res0, dst); 2039cabdff1aSopenharmony_ci dst += dst_stride; 2040cabdff1aSopenharmony_ci 2041cabdff1aSopenharmony_ci inp7 = LD_UB(src); 2042cabdff1aSopenharmony_ci src += src_stride; 2043cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, 2044cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 2045cabdff1aSopenharmony_ci const20, const6, const3); 2046cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp4); 2047cabdff1aSopenharmony_ci ST_UB(res0, dst); 2048cabdff1aSopenharmony_ci dst += dst_stride; 2049cabdff1aSopenharmony_ci 2050cabdff1aSopenharmony_ci inp8 = LD_UB(src); 2051cabdff1aSopenharmony_ci src += src_stride; 2052cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, 2053cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 2054cabdff1aSopenharmony_ci const20, const6, const3); 2055cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp5); 2056cabdff1aSopenharmony_ci ST_UB(res0, dst); 2057cabdff1aSopenharmony_ci 
dst += dst_stride; 2058cabdff1aSopenharmony_ci 2059cabdff1aSopenharmony_ci inp9 = LD_UB(src); 2060cabdff1aSopenharmony_ci src += src_stride; 2061cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, 2062cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 2063cabdff1aSopenharmony_ci const20, const6, const3); 2064cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp6); 2065cabdff1aSopenharmony_ci ST_UB(res0, dst); 2066cabdff1aSopenharmony_ci dst += dst_stride; 2067cabdff1aSopenharmony_ci 2068cabdff1aSopenharmony_ci inp10 = LD_UB(src); 2069cabdff1aSopenharmony_ci src += src_stride; 2070cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, 2071cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 2072cabdff1aSopenharmony_ci const20, const6, const3); 2073cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp7); 2074cabdff1aSopenharmony_ci ST_UB(res0, dst); 2075cabdff1aSopenharmony_ci dst += dst_stride; 2076cabdff1aSopenharmony_ci 2077cabdff1aSopenharmony_ci inp11 = LD_UB(src); 2078cabdff1aSopenharmony_ci src += src_stride; 2079cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, 2080cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 2081cabdff1aSopenharmony_ci const20, const6, const3); 2082cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp8); 2083cabdff1aSopenharmony_ci ST_UB(res0, dst); 2084cabdff1aSopenharmony_ci dst += dst_stride; 2085cabdff1aSopenharmony_ci 2086cabdff1aSopenharmony_ci inp12 = LD_UB(src); 2087cabdff1aSopenharmony_ci src += src_stride; 2088cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, 2089cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 2090cabdff1aSopenharmony_ci const20, const6, const3); 2091cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp9); 2092cabdff1aSopenharmony_ci ST_UB(res0, dst); 2093cabdff1aSopenharmony_ci dst += dst_stride; 2094cabdff1aSopenharmony_ci 2095cabdff1aSopenharmony_ci inp13 = LD_UB(src); 
2096cabdff1aSopenharmony_ci src += src_stride; 2097cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, 2098cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 2099cabdff1aSopenharmony_ci const20, const6, const3); 2100cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp10); 2101cabdff1aSopenharmony_ci ST_UB(res0, dst); 2102cabdff1aSopenharmony_ci dst += dst_stride; 2103cabdff1aSopenharmony_ci 2104cabdff1aSopenharmony_ci inp14 = LD_UB(src); 2105cabdff1aSopenharmony_ci src += src_stride; 2106cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, 2107cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 2108cabdff1aSopenharmony_ci const20, const6, const3); 2109cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp11); 2110cabdff1aSopenharmony_ci ST_UB(res0, dst); 2111cabdff1aSopenharmony_ci dst += dst_stride; 2112cabdff1aSopenharmony_ci 2113cabdff1aSopenharmony_ci inp15 = LD_UB(src); 2114cabdff1aSopenharmony_ci src += src_stride; 2115cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, 2116cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 2117cabdff1aSopenharmony_ci const20, const6, const3); 2118cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp12); 2119cabdff1aSopenharmony_ci ST_UB(res0, dst); 2120cabdff1aSopenharmony_ci dst += dst_stride; 2121cabdff1aSopenharmony_ci 2122cabdff1aSopenharmony_ci inp16 = LD_UB(src); 2123cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, 2124cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 2125cabdff1aSopenharmony_ci const20, const6, const3); 2126cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp13); 2127cabdff1aSopenharmony_ci ST_UB(res0, dst); 2128cabdff1aSopenharmony_ci dst += dst_stride; 2129cabdff1aSopenharmony_ci 2130cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, 2131cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 
2132cabdff1aSopenharmony_ci const20, const6, const3); 2133cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp14); 2134cabdff1aSopenharmony_ci ST_UB(res0, dst); 2135cabdff1aSopenharmony_ci dst += dst_stride; 2136cabdff1aSopenharmony_ci 2137cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, 2138cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 2139cabdff1aSopenharmony_ci const20, const6, const3); 2140cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp15); 2141cabdff1aSopenharmony_ci ST_UB(res0, dst); 2142cabdff1aSopenharmony_ci dst += dst_stride; 2143cabdff1aSopenharmony_ci 2144cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, 2145cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 2146cabdff1aSopenharmony_ci const20, const6, const3); 2147cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(res0, inp16); 2148cabdff1aSopenharmony_ci ST_UB(res0, dst); 2149cabdff1aSopenharmony_ci} 2150cabdff1aSopenharmony_ci /* vert_mc_qpel_avg_dst_aver_src0_8x8_msa: 8x8 vertical qpel filter whose output is averaged first with source rows and then with the pixels already in dst. APPLY_VERT_QPEL_FILTER_8BYTE produces res0/res1 for each half of the block; AVER_UB2_UB combines them with tmp0/tmp1 (built by __msa_insve_d from the row pairs (inp0, inp1), (inp2, inp3), (inp4, inp5), (inp6, inp7)); a second AVER_UB2_UB combines that with four dst rows loaded by LD_UB4 and packed pairwise; ST_D4 writes the result back to dst. NOTE(review): the filter macro and AVER_UB2_UB are defined outside this view - confirm their rounding behaviour there. */ 2151cabdff1aSopenharmony_cistatic void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src, 2152cabdff1aSopenharmony_ci int32_t src_stride, 2153cabdff1aSopenharmony_ci uint8_t *dst, 2154cabdff1aSopenharmony_ci int32_t dst_stride) 2155cabdff1aSopenharmony_ci{ 2156cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2157cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 2158cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, res0, res1; 2159cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2160cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2161cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2162cabdff1aSopenharmony_ci 2163cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 2164cabdff1aSopenharmony_ci src += (4 * src_stride); 2165cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 2166cabdff1aSopenharmony_ci src += (2 * src_stride); 2167cabdff1aSopenharmony_ci res0 = 
APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 2168cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 2169cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 2170cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 2171cabdff1aSopenharmony_ci const20, const6, const3); 2172cabdff1aSopenharmony_ci 2173cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 2174cabdff1aSopenharmony_ci src += (2 * src_stride); 2175cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 2176cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 2177cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 2178cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 2179cabdff1aSopenharmony_ci const20, const6, const3); 2180cabdff1aSopenharmony_ci 2181cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2182cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 2183cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 2184cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2185cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2186cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2187cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2188cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2189cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2190cabdff1aSopenharmony_ci 2191cabdff1aSopenharmony_ci inp8 = LD_UB(src); 2192cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 2193cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 2194cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 2195cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 2196cabdff1aSopenharmony_ci const20, const6, const3); 2197cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 2198cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 2199cabdff1aSopenharmony_ci inp7, 
inp6, inp5, inp4, 2200cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 2201cabdff1aSopenharmony_ci const20, const6, const3); 2202cabdff1aSopenharmony_ci 2203cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2204cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); 2205cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); 2206cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2207cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2208cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2209cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2210cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2211cabdff1aSopenharmony_ci} 2212cabdff1aSopenharmony_ci /* vert_mc_qpel_avg_dst_aver_src0_16x16_msa: 16x16 vertical qpel filter averaged with the co-located source row and then with dst. Output rows are handled two at a time: two APPLY_VERT_QPEL_FILTER calls give res0/res1 for rows i and i + 1, AVER_UB2_UB combines them with source rows inp(i) and inp(i + 1), a second AVER_UB2_UB combines that with dst0/dst1 loaded by LD_UB2, and ST_UB2 stores both rows; dst advances by 2 * dst_stride between pairs. Taps outside inp0..inp16 repeat inp0 at the top and inp16/inp15/inp14 at the bottom. NOTE(review): the macros used here are defined outside this view - confirm their rounding behaviour there. */ 2213cabdff1aSopenharmony_cistatic void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src, 2214cabdff1aSopenharmony_ci int32_t src_stride, 2215cabdff1aSopenharmony_ci uint8_t *dst, 2216cabdff1aSopenharmony_ci int32_t dst_stride) 2217cabdff1aSopenharmony_ci{ 2218cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2219cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2220cabdff1aSopenharmony_ci v16u8 res0, res1, dst0, dst1; 2221cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2222cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2223cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2224cabdff1aSopenharmony_ci 2225cabdff1aSopenharmony_ci LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2226cabdff1aSopenharmony_ci src += (5 * src_stride); 2227cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 2228cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 2229cabdff1aSopenharmony_ci const20, const6, const3); 2230cabdff1aSopenharmony_ci 2231cabdff1aSopenharmony_ci inp5 = 
LD_UB(src); 2232cabdff1aSopenharmony_ci src += src_stride; 2233cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 2234cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 2235cabdff1aSopenharmony_ci const20, const6, const3); 2236cabdff1aSopenharmony_ci 2237cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2238cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1); 2239cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2240cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2241cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2242cabdff1aSopenharmony_ci 2243cabdff1aSopenharmony_ci inp6 = LD_UB(src); 2244cabdff1aSopenharmony_ci src += src_stride; 2245cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 2246cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 2247cabdff1aSopenharmony_ci const20, const6, const3); 2248cabdff1aSopenharmony_ci 2249cabdff1aSopenharmony_ci inp7 = LD_UB(src); 2250cabdff1aSopenharmony_ci src += src_stride; 2251cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 2252cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 2253cabdff1aSopenharmony_ci const20, const6, const3); 2254cabdff1aSopenharmony_ci 2255cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2256cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1); 2257cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2258cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2259cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2260cabdff1aSopenharmony_ci 2261cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp8, inp9); 2262cabdff1aSopenharmony_ci src += (2 * src_stride); 2263cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 2264cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 2265cabdff1aSopenharmony_ci const20, const6, const3); 2266cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp5, 
inp4, inp3, inp2, 2267cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 2268cabdff1aSopenharmony_ci const20, const6, const3); 2269cabdff1aSopenharmony_ci 2270cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2271cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1); 2272cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2273cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2274cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2275cabdff1aSopenharmony_ci 2276cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp10, inp11); 2277cabdff1aSopenharmony_ci src += (2 * src_stride); 2278cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 2279cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 2280cabdff1aSopenharmony_ci const20, const6, const3); 2281cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 2282cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 2283cabdff1aSopenharmony_ci const20, const6, const3); 2284cabdff1aSopenharmony_ci 2285cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2286cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1); 2287cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2288cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2289cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2290cabdff1aSopenharmony_ci 2291cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp12, inp13); 2292cabdff1aSopenharmony_ci src += (2 * src_stride); 2293cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 2294cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 2295cabdff1aSopenharmony_ci const20, const6, const3); 2296cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 2297cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 2298cabdff1aSopenharmony_ci const20, const6, const3); 2299cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 
2300cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1); 2301cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2302cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2303cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2304cabdff1aSopenharmony_ci 2305cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp14, inp15); 2306cabdff1aSopenharmony_ci src += (2 * src_stride); 2307cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 2308cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 2309cabdff1aSopenharmony_ci const20, const6, const3); 2310cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 2311cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 2312cabdff1aSopenharmony_ci const20, const6, const3); 2313cabdff1aSopenharmony_ci 2314cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2315cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1); 2316cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2317cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2318cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2319cabdff1aSopenharmony_ci 2320cabdff1aSopenharmony_ci inp16 = LD_UB(src); 2321cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 2322cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 2323cabdff1aSopenharmony_ci const20, const6, const3); 2324cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 2325cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 2326cabdff1aSopenharmony_ci const20, const6, const3); 2327cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2328cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1); 2329cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2330cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2331cabdff1aSopenharmony_ci dst += (2 * dst_stride); 
2332cabdff1aSopenharmony_ci 2333cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 2334cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 2335cabdff1aSopenharmony_ci const20, const6, const3); 2336cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 2337cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 2338cabdff1aSopenharmony_ci const20, const6, const3); 2339cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2340cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1); 2341cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2342cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2343cabdff1aSopenharmony_ci} 2344cabdff1aSopenharmony_ci /* vert_mc_qpel_avg_dst_8x8_msa: 8x8 vertical qpel filter averaged with the pixels already in dst. For each half of the block, two APPLY_VERT_QPEL_FILTER_8BYTE calls give res0/res1, LD_UB4 reads four dst rows that __msa_insve_d packs pairwise into dst0/dst2, AVER_UB2_UB combines them with the filter output, and ST_D4 writes the result back; dst advances by 4 * dst_stride between the halves. Taps outside inp0..inp8 repeat inp0 at the top and inp8/inp7/inp6 at the bottom. NOTE(review): the macros used here are defined outside this view - confirm their rounding behaviour there. */ 2345cabdff1aSopenharmony_cistatic void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, 2346cabdff1aSopenharmony_ci int32_t src_stride, 2347cabdff1aSopenharmony_ci uint8_t *dst, 2348cabdff1aSopenharmony_ci int32_t dst_stride) 2349cabdff1aSopenharmony_ci{ 2350cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2351cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 2352cabdff1aSopenharmony_ci v16u8 res0, res1; 2353cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2354cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2355cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2356cabdff1aSopenharmony_ci 2357cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 2358cabdff1aSopenharmony_ci src += (4 * src_stride); 2359cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 2360cabdff1aSopenharmony_ci src += (2 * src_stride); 2361cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 2362cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 2363cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 2364cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 2365cabdff1aSopenharmony_ci const20, const6, const3); 
2366cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 2367cabdff1aSopenharmony_ci src += (2 * src_stride); 2368cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 2369cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 2370cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 2371cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 2372cabdff1aSopenharmony_ci const20, const6, const3); 2373cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2374cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2375cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2376cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2377cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2378cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2379cabdff1aSopenharmony_ci 2380cabdff1aSopenharmony_ci inp8 = LD_UB(src); 2381cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 2382cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 2383cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 2384cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 2385cabdff1aSopenharmony_ci const20, const6, const3); 2386cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 2387cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 2388cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 2389cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 2390cabdff1aSopenharmony_ci const20, const6, const3); 2391cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2392cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2393cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2394cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2395cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2396cabdff1aSopenharmony_ci} 
2397cabdff1aSopenharmony_ci 2398cabdff1aSopenharmony_cistatic void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, 2399cabdff1aSopenharmony_ci int32_t src_stride, 2400cabdff1aSopenharmony_ci uint8_t *dst, 2401cabdff1aSopenharmony_ci int32_t dst_stride) 2402cabdff1aSopenharmony_ci{ 2403cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2404cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2405cabdff1aSopenharmony_ci v16u8 res0, res1, dst0, dst1; 2406cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2407cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2408cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2409cabdff1aSopenharmony_ci 2410cabdff1aSopenharmony_ci LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2411cabdff1aSopenharmony_ci src += (5 * src_stride); 2412cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 2413cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 2414cabdff1aSopenharmony_ci const20, const6, const3); 2415cabdff1aSopenharmony_ci inp5 = LD_UB(src); 2416cabdff1aSopenharmony_ci src += src_stride; 2417cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 2418cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 2419cabdff1aSopenharmony_ci const20, const6, const3); 2420cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2421cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2422cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2423cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2424cabdff1aSopenharmony_ci 2425cabdff1aSopenharmony_ci inp6 = LD_UB(src); 2426cabdff1aSopenharmony_ci src += src_stride; 2427cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 2428cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 2429cabdff1aSopenharmony_ci const20, const6, const3); 2430cabdff1aSopenharmony_ci inp7 = LD_UB(src); 
2431cabdff1aSopenharmony_ci src += src_stride; 2432cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 2433cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 2434cabdff1aSopenharmony_ci const20, const6, const3); 2435cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2436cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2437cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2438cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2439cabdff1aSopenharmony_ci 2440cabdff1aSopenharmony_ci inp8 = LD_UB(src); 2441cabdff1aSopenharmony_ci src += src_stride; 2442cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 2443cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 2444cabdff1aSopenharmony_ci const20, const6, const3); 2445cabdff1aSopenharmony_ci inp9 = LD_UB(src); 2446cabdff1aSopenharmony_ci src += src_stride; 2447cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 2448cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 2449cabdff1aSopenharmony_ci const20, const6, const3); 2450cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2451cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2452cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2453cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2454cabdff1aSopenharmony_ci 2455cabdff1aSopenharmony_ci inp10 = LD_UB(src); 2456cabdff1aSopenharmony_ci src += src_stride; 2457cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 2458cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 2459cabdff1aSopenharmony_ci const20, const6, const3); 2460cabdff1aSopenharmony_ci inp11 = LD_UB(src); 2461cabdff1aSopenharmony_ci src += src_stride; 2462cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 2463cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 2464cabdff1aSopenharmony_ci const20, const6, const3); 2465cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, 
dst0, dst1); 2466cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2467cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2468cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2469cabdff1aSopenharmony_ci 2470cabdff1aSopenharmony_ci inp12 = LD_UB(src); 2471cabdff1aSopenharmony_ci src += src_stride; 2472cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 2473cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 2474cabdff1aSopenharmony_ci const20, const6, const3); 2475cabdff1aSopenharmony_ci inp13 = LD_UB(src); 2476cabdff1aSopenharmony_ci src += src_stride; 2477cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 2478cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 2479cabdff1aSopenharmony_ci const20, const6, const3); 2480cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2481cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2482cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2483cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2484cabdff1aSopenharmony_ci 2485cabdff1aSopenharmony_ci inp14 = LD_UB(src); 2486cabdff1aSopenharmony_ci src += src_stride; 2487cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 2488cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 2489cabdff1aSopenharmony_ci const20, const6, const3); 2490cabdff1aSopenharmony_ci inp15 = LD_UB(src); 2491cabdff1aSopenharmony_ci src += src_stride; 2492cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 2493cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 2494cabdff1aSopenharmony_ci const20, const6, const3); 2495cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2496cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2497cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2498cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2499cabdff1aSopenharmony_ci 2500cabdff1aSopenharmony_ci inp16 = 
LD_UB(src); 2501cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 2502cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 2503cabdff1aSopenharmony_ci const20, const6, const3); 2504cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 2505cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 2506cabdff1aSopenharmony_ci const20, const6, const3); 2507cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2508cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2509cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2510cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2511cabdff1aSopenharmony_ci 2512cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 2513cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 2514cabdff1aSopenharmony_ci const20, const6, const3); 2515cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 2516cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 2517cabdff1aSopenharmony_ci const20, const6, const3); 2518cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2519cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2520cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2521cabdff1aSopenharmony_ci} 2522cabdff1aSopenharmony_ci 2523cabdff1aSopenharmony_cistatic void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src, 2524cabdff1aSopenharmony_ci int32_t src_stride, 2525cabdff1aSopenharmony_ci uint8_t *dst, 2526cabdff1aSopenharmony_ci int32_t dst_stride) 2527cabdff1aSopenharmony_ci{ 2528cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2529cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 2530cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, res0, res1; 2531cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2532cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2533cabdff1aSopenharmony_ci v16u8 const3 = 
(v16u8) __msa_ldi_b(3); 2534cabdff1aSopenharmony_ci 2535cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 2536cabdff1aSopenharmony_ci src += (4 * src_stride); 2537cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp4, inp5); 2538cabdff1aSopenharmony_ci src += (2 * src_stride); 2539cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, 2540cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 2541cabdff1aSopenharmony_ci inp1, inp0, inp0, inp1, 2542cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 2543cabdff1aSopenharmony_ci const20, const6, const3); 2544cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp6, inp7); 2545cabdff1aSopenharmony_ci src += (2 * src_stride); 2546cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, 2547cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 2548cabdff1aSopenharmony_ci inp3, inp2, inp1, inp0, 2549cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 2550cabdff1aSopenharmony_ci const20, const6, const3); 2551cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2552cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); 2553cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); 2554cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2555cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2556cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2557cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2558cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2559cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2560cabdff1aSopenharmony_ci 2561cabdff1aSopenharmony_ci inp8 = LD_UB(src); 2562cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, 2563cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 2564cabdff1aSopenharmony_ci inp5, inp4, inp3, inp2, 
2565cabdff1aSopenharmony_ci inp6, inp7, inp8, inp8, 2566cabdff1aSopenharmony_ci const20, const6, const3); 2567cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, 2568cabdff1aSopenharmony_ci inp7, inp8, inp8, inp7, 2569cabdff1aSopenharmony_ci inp7, inp6, inp5, inp4, 2570cabdff1aSopenharmony_ci inp8, inp8, inp7, inp6, 2571cabdff1aSopenharmony_ci const20, const6, const3); 2572cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 2573cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); 2574cabdff1aSopenharmony_ci tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); 2575cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); 2576cabdff1aSopenharmony_ci dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); 2577cabdff1aSopenharmony_ci AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); 2578cabdff1aSopenharmony_ci AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); 2579cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 2580cabdff1aSopenharmony_ci} 2581cabdff1aSopenharmony_ci 2582cabdff1aSopenharmony_cistatic void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src, 2583cabdff1aSopenharmony_ci int32_t src_stride, 2584cabdff1aSopenharmony_ci uint8_t *dst, 2585cabdff1aSopenharmony_ci int32_t dst_stride) 2586cabdff1aSopenharmony_ci{ 2587cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; 2588cabdff1aSopenharmony_ci v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16; 2589cabdff1aSopenharmony_ci v16u8 res0, res1, dst0, dst1; 2590cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2591cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2592cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2593cabdff1aSopenharmony_ci 2594cabdff1aSopenharmony_ci LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); 2595cabdff1aSopenharmony_ci src += (5 * src_stride); 
2596cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, 2597cabdff1aSopenharmony_ci inp1, inp2, inp3, inp4, 2598cabdff1aSopenharmony_ci const20, const6, const3); 2599cabdff1aSopenharmony_ci inp5 = LD_UB(src); 2600cabdff1aSopenharmony_ci src += src_stride; 2601cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, 2602cabdff1aSopenharmony_ci inp2, inp3, inp4, inp5, 2603cabdff1aSopenharmony_ci const20, const6, const3); 2604cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2605cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1); 2606cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2607cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2608cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2609cabdff1aSopenharmony_ci 2610cabdff1aSopenharmony_ci inp6 = LD_UB(src); 2611cabdff1aSopenharmony_ci src += src_stride; 2612cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, 2613cabdff1aSopenharmony_ci inp3, inp4, inp5, inp6, 2614cabdff1aSopenharmony_ci const20, const6, const3); 2615cabdff1aSopenharmony_ci inp7 = LD_UB(src); 2616cabdff1aSopenharmony_ci src += src_stride; 2617cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, 2618cabdff1aSopenharmony_ci inp4, inp5, inp6, inp7, 2619cabdff1aSopenharmony_ci const20, const6, const3); 2620cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2621cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1); 2622cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2623cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2624cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2625cabdff1aSopenharmony_ci 2626cabdff1aSopenharmony_ci inp8 = LD_UB(src); 2627cabdff1aSopenharmony_ci src += src_stride; 2628cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, 2629cabdff1aSopenharmony_ci inp5, inp6, inp7, inp8, 
2630cabdff1aSopenharmony_ci const20, const6, const3); 2631cabdff1aSopenharmony_ci inp9 = LD_UB(src); 2632cabdff1aSopenharmony_ci src += src_stride; 2633cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, 2634cabdff1aSopenharmony_ci inp6, inp7, inp8, inp9, 2635cabdff1aSopenharmony_ci const20, const6, const3); 2636cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2637cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1); 2638cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2639cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2640cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2641cabdff1aSopenharmony_ci 2642cabdff1aSopenharmony_ci inp10 = LD_UB(src); 2643cabdff1aSopenharmony_ci src += src_stride; 2644cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, 2645cabdff1aSopenharmony_ci inp7, inp8, inp9, inp10, 2646cabdff1aSopenharmony_ci const20, const6, const3); 2647cabdff1aSopenharmony_ci inp11 = LD_UB(src); 2648cabdff1aSopenharmony_ci src += src_stride; 2649cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, 2650cabdff1aSopenharmony_ci inp8, inp9, inp10, inp11, 2651cabdff1aSopenharmony_ci const20, const6, const3); 2652cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2653cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1); 2654cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2655cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2656cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2657cabdff1aSopenharmony_ci 2658cabdff1aSopenharmony_ci inp12 = LD_UB(src); 2659cabdff1aSopenharmony_ci src += src_stride; 2660cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, 2661cabdff1aSopenharmony_ci inp9, inp10, inp11, inp12, 2662cabdff1aSopenharmony_ci const20, const6, const3); 2663cabdff1aSopenharmony_ci inp13 = LD_UB(src); 2664cabdff1aSopenharmony_ci 
src += src_stride; 2665cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, 2666cabdff1aSopenharmony_ci inp10, inp11, inp12, inp13, 2667cabdff1aSopenharmony_ci const20, const6, const3); 2668cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2669cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1); 2670cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2671cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2672cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2673cabdff1aSopenharmony_ci 2674cabdff1aSopenharmony_ci inp14 = LD_UB(src); 2675cabdff1aSopenharmony_ci src += src_stride; 2676cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, 2677cabdff1aSopenharmony_ci inp11, inp12, inp13, inp14, 2678cabdff1aSopenharmony_ci const20, const6, const3); 2679cabdff1aSopenharmony_ci inp15 = LD_UB(src); 2680cabdff1aSopenharmony_ci src += src_stride; 2681cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, 2682cabdff1aSopenharmony_ci inp12, inp13, inp14, inp15, 2683cabdff1aSopenharmony_ci const20, const6, const3); 2684cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2685cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1); 2686cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2687cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2688cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2689cabdff1aSopenharmony_ci 2690cabdff1aSopenharmony_ci inp16 = LD_UB(src); 2691cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, 2692cabdff1aSopenharmony_ci inp13, inp14, inp15, inp16, 2693cabdff1aSopenharmony_ci const20, const6, const3); 2694cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, 2695cabdff1aSopenharmony_ci inp14, inp15, inp16, inp16, 2696cabdff1aSopenharmony_ci const20, const6, const3); 2697cabdff1aSopenharmony_ci LD_UB2(dst, 
dst_stride, dst0, dst1); 2698cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1); 2699cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2700cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2701cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2702cabdff1aSopenharmony_ci 2703cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, 2704cabdff1aSopenharmony_ci inp15, inp16, inp16, inp15, 2705cabdff1aSopenharmony_ci const20, const6, const3); 2706cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, 2707cabdff1aSopenharmony_ci inp16, inp16, inp15, inp14, 2708cabdff1aSopenharmony_ci const20, const6, const3); 2709cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 2710cabdff1aSopenharmony_ci AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1); 2711cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 2712cabdff1aSopenharmony_ci ST_UB2(res0, res1, dst, dst_stride); 2713cabdff1aSopenharmony_ci} 2714cabdff1aSopenharmony_ci 2715cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src, 2716cabdff1aSopenharmony_ci int32_t src_stride, 2717cabdff1aSopenharmony_ci uint8_t *dst, 2718cabdff1aSopenharmony_ci int32_t dst_stride, 2719cabdff1aSopenharmony_ci int32_t height) 2720cabdff1aSopenharmony_ci{ 2721cabdff1aSopenharmony_ci uint8_t loop_count; 2722cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 2723cabdff1aSopenharmony_ci v16u8 res; 2724cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 2725cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2726cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2727cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 2728cabdff1aSopenharmony_ci 2729cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 2730cabdff1aSopenharmony_ci LD_UB4(src, src_stride, 
inp0, inp2, inp4, inp6); 2731cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 2732cabdff1aSopenharmony_ci src += (4 * src_stride); 2733cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 2734cabdff1aSopenharmony_ci const20, const6, const3); 2735cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp0, res); 2736cabdff1aSopenharmony_ci ST_UB(res, dst); 2737cabdff1aSopenharmony_ci dst += dst_stride; 2738cabdff1aSopenharmony_ci 2739cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 2740cabdff1aSopenharmony_ci const20, const6, const3); 2741cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp2, res); 2742cabdff1aSopenharmony_ci ST_UB(res, dst); 2743cabdff1aSopenharmony_ci dst += dst_stride; 2744cabdff1aSopenharmony_ci 2745cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 2746cabdff1aSopenharmony_ci const20, const6, const3); 2747cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp4, res); 2748cabdff1aSopenharmony_ci ST_UB(res, dst); 2749cabdff1aSopenharmony_ci dst += dst_stride; 2750cabdff1aSopenharmony_ci 2751cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 2752cabdff1aSopenharmony_ci const20, const6, const3); 2753cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp6, res); 2754cabdff1aSopenharmony_ci ST_UB(res, dst); 2755cabdff1aSopenharmony_ci dst += dst_stride; 2756cabdff1aSopenharmony_ci } 2757cabdff1aSopenharmony_ci 2758cabdff1aSopenharmony_ci LD_UB2(src, 1, inp0, inp1); 2759cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 2760cabdff1aSopenharmony_ci const20, const6, const3); 2761cabdff1aSopenharmony_ci res = __msa_ave_u_b(inp0, res); 2762cabdff1aSopenharmony_ci ST_UB(res, dst); 2763cabdff1aSopenharmony_ci} 2764cabdff1aSopenharmony_ci 2765cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src, 2766cabdff1aSopenharmony_ci int32_t src_stride, 
2767cabdff1aSopenharmony_ci uint8_t *dst, 2768cabdff1aSopenharmony_ci int32_t dst_stride) 2769cabdff1aSopenharmony_ci{ 2770cabdff1aSopenharmony_ci uint8_t buff[272]; 2771cabdff1aSopenharmony_ci 2772cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 2773cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 2774cabdff1aSopenharmony_ci} 2775cabdff1aSopenharmony_ci 2776cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src, 2777cabdff1aSopenharmony_ci int32_t src_stride, 2778cabdff1aSopenharmony_ci uint8_t *dst, 2779cabdff1aSopenharmony_ci int32_t dst_stride) 2780cabdff1aSopenharmony_ci{ 2781cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 2782cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 2783cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 2784cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 2785cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 2786cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 2787cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 2788cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 2789cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2790cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2791cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2792cabdff1aSopenharmony_ci 2793cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 2794cabdff1aSopenharmony_ci src += (2 * src_stride); 2795cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2796cabdff1aSopenharmony_ci mask2, mask3, const20, 2797cabdff1aSopenharmony_ci const6, const3); 2798cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 
2799cabdff1aSopenharmony_ci horiz0 = __msa_ave_u_b(inp0, res0); 2800cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 2801cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 2802cabdff1aSopenharmony_ci src += (2 * src_stride); 2803cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2804cabdff1aSopenharmony_ci mask2, mask3, const20, 2805cabdff1aSopenharmony_ci const6, const3); 2806cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 2807cabdff1aSopenharmony_ci horiz2 = __msa_ave_u_b(inp2, res1); 2808cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 2809cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 2810cabdff1aSopenharmony_ci src += (2 * src_stride); 2811cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2812cabdff1aSopenharmony_ci mask2, mask3, const20, 2813cabdff1aSopenharmony_ci const6, const3); 2814cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 2815cabdff1aSopenharmony_ci horiz4 = __msa_ave_u_b(inp0, res0); 2816cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 2817cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 2818cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 2819cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 2820cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 2821cabdff1aSopenharmony_ci const20, const6, const3); 2822cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 2823cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 2824cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 2825cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2826cabdff1aSopenharmony_ci 2827cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 2828cabdff1aSopenharmony_ci src += (2 * src_stride); 
2829cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2830cabdff1aSopenharmony_ci mask2, mask3, const20, 2831cabdff1aSopenharmony_ci const6, const3); 2832cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 2833cabdff1aSopenharmony_ci horiz6 = __msa_ave_u_b(inp2, res1); 2834cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 2835cabdff1aSopenharmony_ci inp0 = LD_UB(src); 2836cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 2837cabdff1aSopenharmony_ci mask2, mask3, const20, 2838cabdff1aSopenharmony_ci const6, const3); 2839cabdff1aSopenharmony_ci horiz8 = __msa_ave_u_b(inp0, res0); 2840cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 2841cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 2842cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 2843cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 2844cabdff1aSopenharmony_ci const20, const6, const3); 2845cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 2846cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 2847cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 2848cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 2849cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 2850cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 2851cabdff1aSopenharmony_ci const20, const6, const3); 2852cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 2853cabdff1aSopenharmony_ci dst += 2 * dst_stride; 2854cabdff1aSopenharmony_ci 2855cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 2856cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 2857cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 2858cabdff1aSopenharmony_ci horiz7, 
horiz8, horiz8, horiz7, 2859cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 2860cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 2861cabdff1aSopenharmony_ci const20, const6, const3); 2862cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 2863cabdff1aSopenharmony_ci dst += 2 * dst_stride; 2864cabdff1aSopenharmony_ci 2865cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 2866cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 2867cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 2868cabdff1aSopenharmony_ci} 2869cabdff1aSopenharmony_ci 2870cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src, 2871cabdff1aSopenharmony_ci int32_t src_stride, 2872cabdff1aSopenharmony_ci uint8_t *dst, 2873cabdff1aSopenharmony_ci int32_t dst_stride, 2874cabdff1aSopenharmony_ci int32_t height) 2875cabdff1aSopenharmony_ci{ 2876cabdff1aSopenharmony_ci uint8_t loop_count; 2877cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 2878cabdff1aSopenharmony_ci v16u8 res; 2879cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 2880cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2881cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2882cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 2883cabdff1aSopenharmony_ci 2884cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 2885cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 2886cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 2887cabdff1aSopenharmony_ci src += (4 * src_stride); 2888cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 2889cabdff1aSopenharmony_ci const20, const6, const3); 2890cabdff1aSopenharmony_ci ST_UB(res, dst); 2891cabdff1aSopenharmony_ci dst += dst_stride; 2892cabdff1aSopenharmony_ci 2893cabdff1aSopenharmony_ci res = 
APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 2894cabdff1aSopenharmony_ci const20, const6, const3); 2895cabdff1aSopenharmony_ci ST_UB(res, dst); 2896cabdff1aSopenharmony_ci dst += dst_stride; 2897cabdff1aSopenharmony_ci 2898cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 2899cabdff1aSopenharmony_ci const20, const6, const3); 2900cabdff1aSopenharmony_ci ST_UB(res, dst); 2901cabdff1aSopenharmony_ci dst += dst_stride; 2902cabdff1aSopenharmony_ci 2903cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 2904cabdff1aSopenharmony_ci const20, const6, const3); 2905cabdff1aSopenharmony_ci ST_UB(res, dst); 2906cabdff1aSopenharmony_ci dst += dst_stride; 2907cabdff1aSopenharmony_ci } 2908cabdff1aSopenharmony_ci 2909cabdff1aSopenharmony_ci LD_UB2(src, 1, inp0, inp1); 2910cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 2911cabdff1aSopenharmony_ci const20, const6, const3); 2912cabdff1aSopenharmony_ci ST_UB(res, dst); 2913cabdff1aSopenharmony_ci} 2914cabdff1aSopenharmony_ci 2915cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src, 2916cabdff1aSopenharmony_ci int32_t src_stride, 2917cabdff1aSopenharmony_ci uint8_t *dst, 2918cabdff1aSopenharmony_ci int32_t dst_stride) 2919cabdff1aSopenharmony_ci{ 2920cabdff1aSopenharmony_ci uint8_t buff[272]; 2921cabdff1aSopenharmony_ci 2922cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); 2923cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 2924cabdff1aSopenharmony_ci} 2925cabdff1aSopenharmony_ci 2926cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src, 2927cabdff1aSopenharmony_ci int32_t src_stride, 2928cabdff1aSopenharmony_ci uint8_t *dst, 2929cabdff1aSopenharmony_ci int32_t dst_stride) 2930cabdff1aSopenharmony_ci{ 2931cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 
2932cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 2933cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 2934cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 2935cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 2936cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 2937cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 2938cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 2939cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 2940cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 2941cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 2942cabdff1aSopenharmony_ci 2943cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 2944cabdff1aSopenharmony_ci src += (2 * src_stride); 2945cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2946cabdff1aSopenharmony_ci mask2, mask3, const20, 2947cabdff1aSopenharmony_ci const6, const3); 2948cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 2949cabdff1aSopenharmony_ci 2950cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 2951cabdff1aSopenharmony_ci src += (2 * src_stride); 2952cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2953cabdff1aSopenharmony_ci mask2, mask3, const20, 2954cabdff1aSopenharmony_ci const6, const3); 2955cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 2956cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 2957cabdff1aSopenharmony_ci src += (2 * src_stride); 2958cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 2959cabdff1aSopenharmony_ci mask2, mask3, const20, 2960cabdff1aSopenharmony_ci const6, const3); 2961cabdff1aSopenharmony_ci horiz5 = (v16u8) 
__msa_splati_d((v2i64) horiz4, 1); 2962cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 2963cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 2964cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 2965cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 2966cabdff1aSopenharmony_ci const20, const6, const3); 2967cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 2968cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 2969cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 2970cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2971cabdff1aSopenharmony_ci 2972cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 2973cabdff1aSopenharmony_ci src += (2 * src_stride); 2974cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 2975cabdff1aSopenharmony_ci mask2, mask3, const20, 2976cabdff1aSopenharmony_ci const6, const3); 2977cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 2978cabdff1aSopenharmony_ci inp0 = LD_UB(src); 2979cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 2980cabdff1aSopenharmony_ci mask2, mask3, const20, 2981cabdff1aSopenharmony_ci const6, const3); 2982cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 2983cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 2984cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 2985cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 2986cabdff1aSopenharmony_ci const20, const6, const3); 2987cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 2988cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 2989cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 2990cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 2991cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, 
dst, dst_stride); 2992cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2993cabdff1aSopenharmony_ci 2994cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 2995cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 2996cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 2997cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 2998cabdff1aSopenharmony_ci const20, const6, const3); 2999cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 3000cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3001cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3002cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3003cabdff1aSopenharmony_ci 3004cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3005cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3006cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3007cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3008cabdff1aSopenharmony_ci const20, const6, const3); 3009cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 3010cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3011cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3012cabdff1aSopenharmony_ci} 3013cabdff1aSopenharmony_ci 3014cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src, 3015cabdff1aSopenharmony_ci int32_t src_stride, 3016cabdff1aSopenharmony_ci uint8_t *dst, 3017cabdff1aSopenharmony_ci int32_t dst_stride, 3018cabdff1aSopenharmony_ci int32_t height) 3019cabdff1aSopenharmony_ci{ 3020cabdff1aSopenharmony_ci uint8_t loop_count; 3021cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 3022cabdff1aSopenharmony_ci v16u8 res; 3023cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 3024cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 
3025cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3026cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 3027cabdff1aSopenharmony_ci 3028cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 3029cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 3030cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 3031cabdff1aSopenharmony_ci src += (4 * src_stride); 3032cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 3033cabdff1aSopenharmony_ci const20, const6, const3); 3034cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp1); 3035cabdff1aSopenharmony_ci ST_UB(res, dst); 3036cabdff1aSopenharmony_ci dst += dst_stride; 3037cabdff1aSopenharmony_ci 3038cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask, 3039cabdff1aSopenharmony_ci const20, const6, const3); 3040cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp3); 3041cabdff1aSopenharmony_ci ST_UB(res, dst); 3042cabdff1aSopenharmony_ci dst += dst_stride; 3043cabdff1aSopenharmony_ci 3044cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask, 3045cabdff1aSopenharmony_ci const20, const6, const3); 3046cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp5); 3047cabdff1aSopenharmony_ci ST_UB(res, dst); 3048cabdff1aSopenharmony_ci dst += dst_stride; 3049cabdff1aSopenharmony_ci 3050cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask, 3051cabdff1aSopenharmony_ci const20, const6, const3); 3052cabdff1aSopenharmony_ci res = __msa_ave_u_b(res, inp7); 3053cabdff1aSopenharmony_ci ST_UB(res, dst); 3054cabdff1aSopenharmony_ci dst += dst_stride; 3055cabdff1aSopenharmony_ci } 3056cabdff1aSopenharmony_ci 3057cabdff1aSopenharmony_ci LD_UB2(src, 1, inp0, inp1); 3058cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, 3059cabdff1aSopenharmony_ci const20, const6, const3); 3060cabdff1aSopenharmony_ci res = 
__msa_ave_u_b(inp1, res); 3061cabdff1aSopenharmony_ci ST_UB(res, dst); 3062cabdff1aSopenharmony_ci} 3063cabdff1aSopenharmony_ci 3064cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src, 3065cabdff1aSopenharmony_ci int32_t src_stride, 3066cabdff1aSopenharmony_ci uint8_t *dst, 3067cabdff1aSopenharmony_ci int32_t dst_stride) 3068cabdff1aSopenharmony_ci{ 3069cabdff1aSopenharmony_ci uint8_t buff[272]; 3070cabdff1aSopenharmony_ci 3071cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 3072cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 3073cabdff1aSopenharmony_ci} 3074cabdff1aSopenharmony_ci 3075cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, 3076cabdff1aSopenharmony_ci int32_t src_stride, 3077cabdff1aSopenharmony_ci uint8_t *dst, 3078cabdff1aSopenharmony_ci int32_t dst_stride) 3079cabdff1aSopenharmony_ci{ 3080cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3081cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 3082cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3083cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3084cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3085cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3086cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3087cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3088cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3089cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3090cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3091cabdff1aSopenharmony_ci 3092cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3093cabdff1aSopenharmony_ci src += (2 * src_stride); 3094cabdff1aSopenharmony_ci 
res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3095cabdff1aSopenharmony_ci mask2, mask3, const20, 3096cabdff1aSopenharmony_ci const6, const3); 3097cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3098cabdff1aSopenharmony_ci 3099cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3100cabdff1aSopenharmony_ci horiz0 = __msa_ave_u_b(inp0, res0); 3101cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3102cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3103cabdff1aSopenharmony_ci src += (2 * src_stride); 3104cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3105cabdff1aSopenharmony_ci mask2, mask3, const20, 3106cabdff1aSopenharmony_ci const6, const3); 3107cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3108cabdff1aSopenharmony_ci 3109cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3110cabdff1aSopenharmony_ci horiz2 = __msa_ave_u_b(inp2, res1); 3111cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3112cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3113cabdff1aSopenharmony_ci src += (2 * src_stride); 3114cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3115cabdff1aSopenharmony_ci mask2, mask3, const20, 3116cabdff1aSopenharmony_ci const6, const3); 3117cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3118cabdff1aSopenharmony_ci 3119cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3120cabdff1aSopenharmony_ci horiz4 = __msa_ave_u_b(inp0, res0); 3121cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3122cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3123cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 
3124cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3125cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3126cabdff1aSopenharmony_ci const20, const6, const3); 3127cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 3128cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3129cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3130cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3131cabdff1aSopenharmony_ci 3132cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3133cabdff1aSopenharmony_ci src += (2 * src_stride); 3134cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3135cabdff1aSopenharmony_ci mask2, mask3, const20, 3136cabdff1aSopenharmony_ci const6, const3); 3137cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3138cabdff1aSopenharmony_ci 3139cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3140cabdff1aSopenharmony_ci horiz6 = __msa_ave_u_b(inp2, res1); 3141cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3142cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3143cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3144cabdff1aSopenharmony_ci mask2, mask3, const20, 3145cabdff1aSopenharmony_ci const6, const3); 3146cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 3147cabdff1aSopenharmony_ci horiz8 = __msa_ave_u_b(inp0, res0); 3148cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3149cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3150cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3151cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3152cabdff1aSopenharmony_ci const20, const6, const3); 3153cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 3154cabdff1aSopenharmony_ci res1 = 
__msa_ave_u_b(avg1, res1); 3155cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3156cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3157cabdff1aSopenharmony_ci 3158cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3159cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3160cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 3161cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3162cabdff1aSopenharmony_ci const20, const6, const3); 3163cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 3164cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3165cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3166cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3167cabdff1aSopenharmony_ci 3168cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3169cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3170cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3171cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3172cabdff1aSopenharmony_ci const20, const6, const3); 3173cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 3174cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3175cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3176cabdff1aSopenharmony_ci} 3177cabdff1aSopenharmony_ci 3178cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src, 3179cabdff1aSopenharmony_ci int32_t src_stride, 3180cabdff1aSopenharmony_ci uint8_t *dst, 3181cabdff1aSopenharmony_ci int32_t dst_stride) 3182cabdff1aSopenharmony_ci{ 3183cabdff1aSopenharmony_ci uint8_t buff[272]; 3184cabdff1aSopenharmony_ci 3185cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 3186cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); 3187cabdff1aSopenharmony_ci} 
3188cabdff1aSopenharmony_ci 3189cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src, 3190cabdff1aSopenharmony_ci int32_t src_stride, 3191cabdff1aSopenharmony_ci uint8_t *dst, 3192cabdff1aSopenharmony_ci int32_t dst_stride) 3193cabdff1aSopenharmony_ci{ 3194cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3195cabdff1aSopenharmony_ci v16u8 res0, res1; 3196cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3197cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3198cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3199cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3200cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3201cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3202cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3203cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3204cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3205cabdff1aSopenharmony_ci 3206cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3207cabdff1aSopenharmony_ci src += (2 * src_stride); 3208cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3209cabdff1aSopenharmony_ci mask2, mask3, const20, 3210cabdff1aSopenharmony_ci const6, const3); 3211cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3212cabdff1aSopenharmony_ci horiz0 = __msa_ave_u_b(inp0, res0); 3213cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3214cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3215cabdff1aSopenharmony_ci src += (2 * src_stride); 3216cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3217cabdff1aSopenharmony_ci mask2, mask3, const20, 3218cabdff1aSopenharmony_ci const6, const3); 
3219cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3220cabdff1aSopenharmony_ci horiz2 = __msa_ave_u_b(inp2, res1); 3221cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3222cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3223cabdff1aSopenharmony_ci src += (2 * src_stride); 3224cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3225cabdff1aSopenharmony_ci mask2, mask3, const20, 3226cabdff1aSopenharmony_ci const6, const3); 3227cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3228cabdff1aSopenharmony_ci horiz4 = __msa_ave_u_b(inp0, res0); 3229cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3230cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3231cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 3232cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3233cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3234cabdff1aSopenharmony_ci const20, const6, const3); 3235cabdff1aSopenharmony_ci 3236cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3237cabdff1aSopenharmony_ci src += (2 * src_stride); 3238cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3239cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3240cabdff1aSopenharmony_ci 3241cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3242cabdff1aSopenharmony_ci mask2, mask3, const20, 3243cabdff1aSopenharmony_ci const6, const3); 3244cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3245cabdff1aSopenharmony_ci horiz6 = __msa_ave_u_b(inp2, res1); 3246cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3247cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3248cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3249cabdff1aSopenharmony_ci 
mask2, mask3, const20, 3250cabdff1aSopenharmony_ci const6, const3); 3251cabdff1aSopenharmony_ci horiz8 = __msa_ave_u_b(inp0, res0); 3252cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3253cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3254cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3255cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3256cabdff1aSopenharmony_ci const20, const6, const3); 3257cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3258cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3259cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 3260cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3261cabdff1aSopenharmony_ci const20, const6, const3); 3262cabdff1aSopenharmony_ci ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride); 3263cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3264cabdff1aSopenharmony_ci 3265cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3266cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3267cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3268cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3269cabdff1aSopenharmony_ci const20, const6, const3); 3270cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3271cabdff1aSopenharmony_ci} 3272cabdff1aSopenharmony_ci 3273cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, 3274cabdff1aSopenharmony_ci int32_t src_stride, 3275cabdff1aSopenharmony_ci uint8_t *dst, 3276cabdff1aSopenharmony_ci int32_t dst_stride) 3277cabdff1aSopenharmony_ci{ 3278cabdff1aSopenharmony_ci uint8_t buff[272]; 3279cabdff1aSopenharmony_ci 3280cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); 3281cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); 3282cabdff1aSopenharmony_ci} 3283cabdff1aSopenharmony_ci 
3284cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, 3285cabdff1aSopenharmony_ci int32_t src_stride, 3286cabdff1aSopenharmony_ci uint8_t *dst, 3287cabdff1aSopenharmony_ci int32_t dst_stride) 3288cabdff1aSopenharmony_ci{ 3289cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3290cabdff1aSopenharmony_ci v16u8 res0, res1; 3291cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3292cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3293cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3294cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3295cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3296cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3297cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3298cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3299cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3300cabdff1aSopenharmony_ci 3301cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3302cabdff1aSopenharmony_ci src += (2 * src_stride); 3303cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3304cabdff1aSopenharmony_ci mask2, mask3, const20, 3305cabdff1aSopenharmony_ci const6, const3); 3306cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3307cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3308cabdff1aSopenharmony_ci src += (2 * src_stride); 3309cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3310cabdff1aSopenharmony_ci mask2, mask3, const20, 3311cabdff1aSopenharmony_ci const6, const3); 3312cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3313cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3314cabdff1aSopenharmony_ci src += (2 * 
src_stride); 3315cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3316cabdff1aSopenharmony_ci mask2, mask3, const20, 3317cabdff1aSopenharmony_ci const6, const3); 3318cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3319cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3320cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 3321cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3322cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3323cabdff1aSopenharmony_ci const20, const6, const3); 3324cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3325cabdff1aSopenharmony_ci src += (2 * src_stride); 3326cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3327cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3328cabdff1aSopenharmony_ci 3329cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3330cabdff1aSopenharmony_ci mask2, mask3, const20, 3331cabdff1aSopenharmony_ci const6, const3); 3332cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3333cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3334cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3335cabdff1aSopenharmony_ci mask2, mask3, const20, 3336cabdff1aSopenharmony_ci const6, const3); 3337cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3338cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3339cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3340cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3341cabdff1aSopenharmony_ci const20, const6, const3); 3342cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3343cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3344cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 
3345cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3346cabdff1aSopenharmony_ci const20, const6, const3); 3347cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3348cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3349cabdff1aSopenharmony_ci 3350cabdff1aSopenharmony_ci 3351cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3352cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3353cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3354cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3355cabdff1aSopenharmony_ci const20, const6, const3); 3356cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3357cabdff1aSopenharmony_ci} 3358cabdff1aSopenharmony_ci 3359cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src, 3360cabdff1aSopenharmony_ci int32_t src_stride, 3361cabdff1aSopenharmony_ci uint8_t *dst, 3362cabdff1aSopenharmony_ci int32_t dst_stride) 3363cabdff1aSopenharmony_ci{ 3364cabdff1aSopenharmony_ci uint8_t buff[272]; 3365cabdff1aSopenharmony_ci 3366cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 3367cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride); 3368cabdff1aSopenharmony_ci} 3369cabdff1aSopenharmony_ci 3370cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, 3371cabdff1aSopenharmony_ci int32_t src_stride, 3372cabdff1aSopenharmony_ci uint8_t *dst, 3373cabdff1aSopenharmony_ci int32_t dst_stride) 3374cabdff1aSopenharmony_ci{ 3375cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3376cabdff1aSopenharmony_ci v16u8 res0, res1; 3377cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3378cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3379cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3380cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 
1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3381cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3382cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3383cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3384cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3385cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3386cabdff1aSopenharmony_ci 3387cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3388cabdff1aSopenharmony_ci src += (2 * src_stride); 3389cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3390cabdff1aSopenharmony_ci mask2, mask3, const20, 3391cabdff1aSopenharmony_ci const6, const3); 3392cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3393cabdff1aSopenharmony_ci 3394cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3395cabdff1aSopenharmony_ci horiz0 = __msa_ave_u_b(inp0, res0); 3396cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3397cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3398cabdff1aSopenharmony_ci src += (2 * src_stride); 3399cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3400cabdff1aSopenharmony_ci mask2, mask3, const20, 3401cabdff1aSopenharmony_ci const6, const3); 3402cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3403cabdff1aSopenharmony_ci 3404cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3405cabdff1aSopenharmony_ci horiz2 = __msa_ave_u_b(inp2, res1); 3406cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3407cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3408cabdff1aSopenharmony_ci src += (2 * src_stride); 3409cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 
3410cabdff1aSopenharmony_ci mask2, mask3, const20, 3411cabdff1aSopenharmony_ci const6, const3); 3412cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3413cabdff1aSopenharmony_ci 3414cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3415cabdff1aSopenharmony_ci horiz4 = __msa_ave_u_b(inp0, res0); 3416cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3417cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3418cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 3419cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3420cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3421cabdff1aSopenharmony_ci const20, const6, const3); 3422cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3423cabdff1aSopenharmony_ci src += (2 * src_stride); 3424cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3425cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3426cabdff1aSopenharmony_ci 3427cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3428cabdff1aSopenharmony_ci mask2, mask3, const20, 3429cabdff1aSopenharmony_ci const6, const3); 3430cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3431cabdff1aSopenharmony_ci 3432cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3433cabdff1aSopenharmony_ci horiz6 = __msa_ave_u_b(inp2, res1); 3434cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3435cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3436cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3437cabdff1aSopenharmony_ci mask2, mask3, const20, 3438cabdff1aSopenharmony_ci const6, const3); 3439cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 3440cabdff1aSopenharmony_ci horiz8 = __msa_ave_u_b(inp0, res0); 3441cabdff1aSopenharmony_ci res1 
= APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3442cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3443cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3444cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3445cabdff1aSopenharmony_ci const20, const6, const3); 3446cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3447cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3448cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 3449cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3450cabdff1aSopenharmony_ci const20, const6, const3); 3451cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3452cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3453cabdff1aSopenharmony_ci 3454cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3455cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3456cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3457cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3458cabdff1aSopenharmony_ci const20, const6, const3); 3459cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3460cabdff1aSopenharmony_ci} 3461cabdff1aSopenharmony_ci 3462cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src, 3463cabdff1aSopenharmony_ci int32_t src_stride, 3464cabdff1aSopenharmony_ci uint8_t *dst, 3465cabdff1aSopenharmony_ci int32_t dst_stride) 3466cabdff1aSopenharmony_ci{ 3467cabdff1aSopenharmony_ci uint8_t buff[272]; 3468cabdff1aSopenharmony_ci 3469cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 3470cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 3471cabdff1aSopenharmony_ci} 3472cabdff1aSopenharmony_ci 3473cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src, 3474cabdff1aSopenharmony_ci int32_t 
src_stride, 3475cabdff1aSopenharmony_ci uint8_t *dst, 3476cabdff1aSopenharmony_ci int32_t dst_stride) 3477cabdff1aSopenharmony_ci{ 3478cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3479cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 3480cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3481cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3482cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3483cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3484cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3485cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3486cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3487cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3488cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3489cabdff1aSopenharmony_ci 3490cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3491cabdff1aSopenharmony_ci src += (2 * src_stride); 3492cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3493cabdff1aSopenharmony_ci mask2, mask3, const20, 3494cabdff1aSopenharmony_ci const6, const3); 3495cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3496cabdff1aSopenharmony_ci horiz0 = __msa_ave_u_b(inp0, res0); 3497cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3498cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3499cabdff1aSopenharmony_ci src += (2 * src_stride); 3500cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3501cabdff1aSopenharmony_ci mask2, mask3, const20, 3502cabdff1aSopenharmony_ci const6, const3); 3503cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3504cabdff1aSopenharmony_ci horiz2 = __msa_ave_u_b(inp2, res1); 
3505cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3506cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3507cabdff1aSopenharmony_ci src += (2 * src_stride); 3508cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3509cabdff1aSopenharmony_ci mask2, mask3, const20, 3510cabdff1aSopenharmony_ci const6, const3); 3511cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3512cabdff1aSopenharmony_ci horiz4 = __msa_ave_u_b(inp0, res0); 3513cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3514cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3515cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 3516cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3517cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3518cabdff1aSopenharmony_ci const20, const6, const3); 3519cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 3520cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3521cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3522cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3523cabdff1aSopenharmony_ci 3524cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3525cabdff1aSopenharmony_ci src += (2 * src_stride); 3526cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3527cabdff1aSopenharmony_ci mask2, mask3, const20, 3528cabdff1aSopenharmony_ci const6, const3); 3529cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3530cabdff1aSopenharmony_ci horiz6 = __msa_ave_u_b(inp2, res1); 3531cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3532cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3533cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3534cabdff1aSopenharmony_ci mask2, mask3, const20, 
3535cabdff1aSopenharmony_ci const6, const3); 3536cabdff1aSopenharmony_ci horiz8 = __msa_ave_u_b(inp0, res0); 3537cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3538cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3539cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3540cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3541cabdff1aSopenharmony_ci const20, const6, const3); 3542cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 3543cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3544cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3545cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3546cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 3547cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3548cabdff1aSopenharmony_ci const20, const6, const3); 3549cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3550cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3551cabdff1aSopenharmony_ci 3552cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 3553cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3554cabdff1aSopenharmony_ci 3555cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3556cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3557cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3558cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3559cabdff1aSopenharmony_ci const20, const6, const3); 3560cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3561cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3562cabdff1aSopenharmony_ci 3563cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 3564cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3565cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3566cabdff1aSopenharmony_ci} 
3567cabdff1aSopenharmony_ci 3568cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src, 3569cabdff1aSopenharmony_ci int32_t src_stride, 3570cabdff1aSopenharmony_ci uint8_t *dst, 3571cabdff1aSopenharmony_ci int32_t dst_stride) 3572cabdff1aSopenharmony_ci{ 3573cabdff1aSopenharmony_ci uint8_t buff[272]; 3574cabdff1aSopenharmony_ci 3575cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16); 3576cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 3577cabdff1aSopenharmony_ci} 3578cabdff1aSopenharmony_ci 3579cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src, 3580cabdff1aSopenharmony_ci int32_t src_stride, 3581cabdff1aSopenharmony_ci uint8_t *dst, 3582cabdff1aSopenharmony_ci int32_t dst_stride) 3583cabdff1aSopenharmony_ci{ 3584cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3585cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 3586cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3587cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3588cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3589cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3590cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3591cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3592cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3593cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3594cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3595cabdff1aSopenharmony_ci 3596cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3597cabdff1aSopenharmony_ci src += (2 * src_stride); 3598cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3599cabdff1aSopenharmony_ci mask2, 
mask3, const20, 3600cabdff1aSopenharmony_ci const6, const3); 3601cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3602cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3603cabdff1aSopenharmony_ci src += (2 * src_stride); 3604cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3605cabdff1aSopenharmony_ci mask2, mask3, const20, 3606cabdff1aSopenharmony_ci const6, const3); 3607cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3608cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3609cabdff1aSopenharmony_ci src += (2 * src_stride); 3610cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3611cabdff1aSopenharmony_ci mask2, mask3, const20, 3612cabdff1aSopenharmony_ci const6, const3); 3613cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3614cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3615cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 3616cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3617cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3618cabdff1aSopenharmony_ci const20, const6, const3); 3619cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 3620cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3621cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3622cabdff1aSopenharmony_ci src += (2 * src_stride); 3623cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3624cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3625cabdff1aSopenharmony_ci 3626cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3627cabdff1aSopenharmony_ci mask2, mask3, const20, 3628cabdff1aSopenharmony_ci const6, const3); 3629cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3630cabdff1aSopenharmony_ci res1 = 
APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3631cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3632cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3633cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3634cabdff1aSopenharmony_ci const20, const6, const3); 3635cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 3636cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3637cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3638cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3639cabdff1aSopenharmony_ci mask2, mask3, const20, 3640cabdff1aSopenharmony_ci const6, const3); 3641cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3642cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3643cabdff1aSopenharmony_ci 3644cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3645cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3646cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 3647cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3648cabdff1aSopenharmony_ci const20, const6, const3); 3649cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 3650cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3651cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3652cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3653cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3654cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3655cabdff1aSopenharmony_ci const20, const6, const3); 3656cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 3657cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3658cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3659cabdff1aSopenharmony_ci} 3660cabdff1aSopenharmony_ci 3661cabdff1aSopenharmony_cistatic void 
hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src, 3662cabdff1aSopenharmony_ci int32_t src_stride, 3663cabdff1aSopenharmony_ci uint8_t *dst, 3664cabdff1aSopenharmony_ci int32_t dst_stride) 3665cabdff1aSopenharmony_ci{ 3666cabdff1aSopenharmony_ci uint8_t buff[272]; 3667cabdff1aSopenharmony_ci 3668cabdff1aSopenharmony_ci hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 3669cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 3670cabdff1aSopenharmony_ci} 3671cabdff1aSopenharmony_ci 3672cabdff1aSopenharmony_cistatic void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, 3673cabdff1aSopenharmony_ci int32_t src_stride, 3674cabdff1aSopenharmony_ci uint8_t *dst, 3675cabdff1aSopenharmony_ci int32_t dst_stride) 3676cabdff1aSopenharmony_ci{ 3677cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3678cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 3679cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3680cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3681cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3682cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3683cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3684cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3685cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3686cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3687cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3688cabdff1aSopenharmony_ci 3689cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3690cabdff1aSopenharmony_ci src += (2 * src_stride); 3691cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3692cabdff1aSopenharmony_ci mask2, mask3, const20, 3693cabdff1aSopenharmony_ci const6, const3); 
3694cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3695cabdff1aSopenharmony_ci 3696cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3697cabdff1aSopenharmony_ci horiz0 = __msa_ave_u_b(inp0, res0); 3698cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3699cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3700cabdff1aSopenharmony_ci src += (2 * src_stride); 3701cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3702cabdff1aSopenharmony_ci mask2, mask3, const20, 3703cabdff1aSopenharmony_ci const6, const3); 3704cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3705cabdff1aSopenharmony_ci 3706cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3707cabdff1aSopenharmony_ci horiz2 = __msa_ave_u_b(inp2, res1); 3708cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3709cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3710cabdff1aSopenharmony_ci src += (2 * src_stride); 3711cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, 3712cabdff1aSopenharmony_ci mask2, mask3, const20, 3713cabdff1aSopenharmony_ci const6, const3); 3714cabdff1aSopenharmony_ci 3715cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 3716cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 3717cabdff1aSopenharmony_ci horiz4 = __msa_ave_u_b(inp0, res0); 3718cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3719cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3720cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 3721cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3722cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3723cabdff1aSopenharmony_ci const20, const6, const3); 
3724cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 3725cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3726cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3727cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3728cabdff1aSopenharmony_ci 3729cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3730cabdff1aSopenharmony_ci src += (2 * src_stride); 3731cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, 3732cabdff1aSopenharmony_ci mask2, mask3, const20, 3733cabdff1aSopenharmony_ci const6, const3); 3734cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 3735cabdff1aSopenharmony_ci 3736cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 3737cabdff1aSopenharmony_ci horiz6 = __msa_ave_u_b(inp2, res1); 3738cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3739cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3740cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3741cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3742cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3743cabdff1aSopenharmony_ci const20, const6, const3); 3744cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 3745cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3746cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3747cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3748cabdff1aSopenharmony_ci 3749cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3750cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, 3751cabdff1aSopenharmony_ci mask2, mask3, const20, 3752cabdff1aSopenharmony_ci const6, const3); 3753cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 3754cabdff1aSopenharmony_ci horiz8 = __msa_ave_u_b(inp0, res0); 
3755cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3756cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3757cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 3758cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3759cabdff1aSopenharmony_ci const20, const6, const3); 3760cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3761cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3762cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3763cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3764cabdff1aSopenharmony_ci const20, const6, const3); 3765cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 3766cabdff1aSopenharmony_ci res0 = __msa_ave_u_b(avg0, res0); 3767cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 3768cabdff1aSopenharmony_ci res1 = __msa_ave_u_b(avg1, res1); 3769cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3770cabdff1aSopenharmony_ci} 3771cabdff1aSopenharmony_ci 3772cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src, 3773cabdff1aSopenharmony_ci int32_t src_stride, 3774cabdff1aSopenharmony_ci uint8_t *dst, 3775cabdff1aSopenharmony_ci int32_t dst_stride, 3776cabdff1aSopenharmony_ci int32_t height) 3777cabdff1aSopenharmony_ci{ 3778cabdff1aSopenharmony_ci uint8_t loop_count; 3779cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 3780cabdff1aSopenharmony_ci v16u8 res; 3781cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 3782cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3783cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3784cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 3785cabdff1aSopenharmony_ci 3786cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 
3787cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 3788cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 3789cabdff1aSopenharmony_ci src += (4 * src_stride); 3790cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 3791cabdff1aSopenharmony_ci const20, const6, const3); 3792cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp0, res); 3793cabdff1aSopenharmony_ci ST_UB(res, dst); 3794cabdff1aSopenharmony_ci dst += dst_stride; 3795cabdff1aSopenharmony_ci 3796cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 3797cabdff1aSopenharmony_ci const20, const6, const3); 3798cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp2, res); 3799cabdff1aSopenharmony_ci ST_UB(res, dst); 3800cabdff1aSopenharmony_ci dst += dst_stride; 3801cabdff1aSopenharmony_ci 3802cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 3803cabdff1aSopenharmony_ci const20, const6, const3); 3804cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp4, res); 3805cabdff1aSopenharmony_ci ST_UB(res, dst); 3806cabdff1aSopenharmony_ci dst += dst_stride; 3807cabdff1aSopenharmony_ci 3808cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 3809cabdff1aSopenharmony_ci const20, const6, const3); 3810cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp6, res); 3811cabdff1aSopenharmony_ci ST_UB(res, dst); 3812cabdff1aSopenharmony_ci dst += dst_stride; 3813cabdff1aSopenharmony_ci } 3814cabdff1aSopenharmony_ci 3815cabdff1aSopenharmony_ci LD_UB2(src, 1, inp0, inp1); 3816cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); 3817cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp0, res); 3818cabdff1aSopenharmony_ci ST_UB(res, dst); 3819cabdff1aSopenharmony_ci} 3820cabdff1aSopenharmony_ci 3821cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src, 3822cabdff1aSopenharmony_ci int32_t src_stride, 3823cabdff1aSopenharmony_ci uint8_t *dst, 
3824cabdff1aSopenharmony_ci int32_t dst_stride) 3825cabdff1aSopenharmony_ci{ 3826cabdff1aSopenharmony_ci uint8_t buff[272]; 3827cabdff1aSopenharmony_ci 3828cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 3829cabdff1aSopenharmony_ci vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 3830cabdff1aSopenharmony_ci} 3831cabdff1aSopenharmony_ci 3832cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src, 3833cabdff1aSopenharmony_ci int32_t src_stride, 3834cabdff1aSopenharmony_ci uint8_t *dst, 3835cabdff1aSopenharmony_ci int32_t dst_stride) 3836cabdff1aSopenharmony_ci{ 3837cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3838cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 3839cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3840cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3841cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3842cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3843cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3844cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3845cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 3846cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3847cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3848cabdff1aSopenharmony_ci 3849cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 3850cabdff1aSopenharmony_ci src += (4 * src_stride); 3851cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 3852cabdff1aSopenharmony_ci const20, const6, const3); 3853cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 3854cabdff1aSopenharmony_ci const20, const6, const3); 3855cabdff1aSopenharmony_ci inp0 = 
(v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3856cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 3857cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3858cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3859cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 3860cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 3861cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3862cabdff1aSopenharmony_ci src += (2 * src_stride); 3863cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 3864cabdff1aSopenharmony_ci const20, const6, const3); 3865cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 3866cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 3867cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 3868cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 3869cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 3870cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 3871cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 3872cabdff1aSopenharmony_ci const20, const6, const3); 3873cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 3874cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 3875cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 3876cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3877cabdff1aSopenharmony_ci 3878cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3879cabdff1aSopenharmony_ci src += (2 * src_stride); 3880cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 3881cabdff1aSopenharmony_ci const20, const6, const3); 3882cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 3883cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 
3884cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 3885cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 3886cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 3887cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 3888cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 3889cabdff1aSopenharmony_ci const20, const6, const3); 3890cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 3891cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 3892cabdff1aSopenharmony_ci 3893cabdff1aSopenharmony_ci inp0 = LD_UB(src); 3894cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 3895cabdff1aSopenharmony_ci const20, const6, const3); 3896cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 3897cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 3898cabdff1aSopenharmony_ci dst += 2 * dst_stride; 3899cabdff1aSopenharmony_ci 3900cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 3901cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 3902cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 3903cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 3904cabdff1aSopenharmony_ci const20, const6, const3); 3905cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 3906cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 3907cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 3908cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 3909cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 3910cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 3911cabdff1aSopenharmony_ci const20, const6, const3); 3912cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 3913cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 3914cabdff1aSopenharmony_ci 
ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 3915cabdff1aSopenharmony_ci} 3916cabdff1aSopenharmony_ci 3917cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src, 3918cabdff1aSopenharmony_ci int32_t src_stride, 3919cabdff1aSopenharmony_ci uint8_t *dst, 3920cabdff1aSopenharmony_ci int32_t dst_stride, 3921cabdff1aSopenharmony_ci int32_t height) 3922cabdff1aSopenharmony_ci{ 3923cabdff1aSopenharmony_ci uint8_t loop_count; 3924cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 3925cabdff1aSopenharmony_ci v16u8 res; 3926cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 3927cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3928cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3929cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 3930cabdff1aSopenharmony_ci 3931cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 3932cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 3933cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 3934cabdff1aSopenharmony_ci src += (4 * src_stride); 3935cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 3936cabdff1aSopenharmony_ci const20, const6, const3); 3937cabdff1aSopenharmony_ci ST_UB(res, dst); 3938cabdff1aSopenharmony_ci dst += dst_stride; 3939cabdff1aSopenharmony_ci 3940cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 3941cabdff1aSopenharmony_ci const20, const6, const3); 3942cabdff1aSopenharmony_ci ST_UB(res, dst); 3943cabdff1aSopenharmony_ci dst += dst_stride; 3944cabdff1aSopenharmony_ci 3945cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 3946cabdff1aSopenharmony_ci const20, const6, const3); 3947cabdff1aSopenharmony_ci ST_UB(res, dst); 3948cabdff1aSopenharmony_ci dst += dst_stride; 3949cabdff1aSopenharmony_ci 3950cabdff1aSopenharmony_ci res = 
APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 3951cabdff1aSopenharmony_ci const20, const6, const3); 3952cabdff1aSopenharmony_ci ST_UB(res, dst); 3953cabdff1aSopenharmony_ci dst += dst_stride; 3954cabdff1aSopenharmony_ci } 3955cabdff1aSopenharmony_ci 3956cabdff1aSopenharmony_ci LD_UB2(src, 1, inp0, inp1); 3957cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); 3958cabdff1aSopenharmony_ci ST_UB(res, dst); 3959cabdff1aSopenharmony_ci} 3960cabdff1aSopenharmony_ci 3961cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src, 3962cabdff1aSopenharmony_ci int32_t src_stride, 3963cabdff1aSopenharmony_ci uint8_t *dst, 3964cabdff1aSopenharmony_ci int32_t dst_stride) 3965cabdff1aSopenharmony_ci{ 3966cabdff1aSopenharmony_ci uint8_t buff[272]; 3967cabdff1aSopenharmony_ci 3968cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 3969cabdff1aSopenharmony_ci vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 3970cabdff1aSopenharmony_ci} 3971cabdff1aSopenharmony_ci 3972cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src, 3973cabdff1aSopenharmony_ci int32_t src_stride, 3974cabdff1aSopenharmony_ci uint8_t *dst, 3975cabdff1aSopenharmony_ci int32_t dst_stride) 3976cabdff1aSopenharmony_ci{ 3977cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 3978cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 3979cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 3980cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 3981cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 3982cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 3983cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 3984cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 3985cabdff1aSopenharmony_ci 
v16u8 const20 = (v16u8) __msa_ldi_b(20); 3986cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 3987cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 3988cabdff1aSopenharmony_ci 3989cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 3990cabdff1aSopenharmony_ci src += (2 * src_stride); 3991cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 3992cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 3993cabdff1aSopenharmony_ci const20, const6, const3); 3994cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 3995cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 3996cabdff1aSopenharmony_ci src += (2 * src_stride); 3997cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 3998cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 3999cabdff1aSopenharmony_ci const20, const6, const3); 4000cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4001cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4002cabdff1aSopenharmony_ci src += (2 * src_stride); 4003cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4004cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4005cabdff1aSopenharmony_ci const20, const6, const3); 4006cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4007cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4008cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4009cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4010cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4011cabdff1aSopenharmony_ci const20, const6, const3); 4012cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4013cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4014cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4015cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4016cabdff1aSopenharmony_ci 
4017cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4018cabdff1aSopenharmony_ci src += (2 * src_stride); 4019cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4020cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4021cabdff1aSopenharmony_ci const20, const6, const3); 4022cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4023cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4024cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4025cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4026cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4027cabdff1aSopenharmony_ci const20, const6, const3); 4028cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4029cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 4030cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4031cabdff1aSopenharmony_ci const20, const6, const3); 4032cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4033cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4034cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4035cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4036cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4037cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4038cabdff1aSopenharmony_ci const20, const6, const3); 4039cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4040cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4041cabdff1aSopenharmony_ci 4042cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4043cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4044cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4045cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4046cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4047cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, 
horiz6, 4048cabdff1aSopenharmony_ci const20, const6, const3); 4049cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4050cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4051cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4052cabdff1aSopenharmony_ci} 4053cabdff1aSopenharmony_ci 4054cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src, 4055cabdff1aSopenharmony_ci int32_t src_stride, 4056cabdff1aSopenharmony_ci uint8_t *dst, 4057cabdff1aSopenharmony_ci int32_t dst_stride, 4058cabdff1aSopenharmony_ci int32_t height) 4059cabdff1aSopenharmony_ci{ 4060cabdff1aSopenharmony_ci uint8_t loop_count; 4061cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7; 4062cabdff1aSopenharmony_ci v16u8 res; 4063cabdff1aSopenharmony_ci v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; 4064cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4065cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4066cabdff1aSopenharmony_ci v8u16 const20 = (v8u16) __msa_ldi_h(20); 4067cabdff1aSopenharmony_ci 4068cabdff1aSopenharmony_ci for (loop_count = (height >> 2); loop_count--;) { 4069cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp2, inp4, inp6); 4070cabdff1aSopenharmony_ci LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); 4071cabdff1aSopenharmony_ci src += (4 * src_stride); 4072cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, 4073cabdff1aSopenharmony_ci const20, const6, const3); 4074cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp1); 4075cabdff1aSopenharmony_ci ST_UB(res, dst); 4076cabdff1aSopenharmony_ci dst += dst_stride; 4077cabdff1aSopenharmony_ci 4078cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask, 4079cabdff1aSopenharmony_ci const20, const6, const3); 4080cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp3); 4081cabdff1aSopenharmony_ci ST_UB(res, dst); 
4082cabdff1aSopenharmony_ci dst += dst_stride; 4083cabdff1aSopenharmony_ci 4084cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask, 4085cabdff1aSopenharmony_ci const20, const6, const3); 4086cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp5); 4087cabdff1aSopenharmony_ci ST_UB(res, dst); 4088cabdff1aSopenharmony_ci dst += dst_stride; 4089cabdff1aSopenharmony_ci 4090cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask, 4091cabdff1aSopenharmony_ci const20, const6, const3); 4092cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, inp7); 4093cabdff1aSopenharmony_ci ST_UB(res, dst); 4094cabdff1aSopenharmony_ci dst += dst_stride; 4095cabdff1aSopenharmony_ci } 4096cabdff1aSopenharmony_ci 4097cabdff1aSopenharmony_ci LD_UB2(src, 1, inp0, inp1); 4098cabdff1aSopenharmony_ci res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3); 4099cabdff1aSopenharmony_ci res = __msa_aver_u_b(inp1, res); 4100cabdff1aSopenharmony_ci ST_UB(res, dst); 4101cabdff1aSopenharmony_ci} 4102cabdff1aSopenharmony_ci 4103cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src, 4104cabdff1aSopenharmony_ci int32_t src_stride, 4105cabdff1aSopenharmony_ci uint8_t *dst, 4106cabdff1aSopenharmony_ci int32_t dst_stride) 4107cabdff1aSopenharmony_ci{ 4108cabdff1aSopenharmony_ci uint8_t buff[272]; 4109cabdff1aSopenharmony_ci 4110cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4111cabdff1aSopenharmony_ci vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4112cabdff1aSopenharmony_ci} 4113cabdff1aSopenharmony_ci 4114cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, 4115cabdff1aSopenharmony_ci int32_t src_stride, 4116cabdff1aSopenharmony_ci uint8_t *dst, 4117cabdff1aSopenharmony_ci int32_t dst_stride) 4118cabdff1aSopenharmony_ci{ 4119cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4120cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, 
avg1; 4121cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4122cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4123cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4124cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4125cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4126cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4127cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4128cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4129cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4130cabdff1aSopenharmony_ci 4131cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4132cabdff1aSopenharmony_ci src += (4 * src_stride); 4133cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4134cabdff1aSopenharmony_ci const20, const6, const3); 4135cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4136cabdff1aSopenharmony_ci const20, const6, const3); 4137cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4138cabdff1aSopenharmony_ci 4139cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4140cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 4141cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4142cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4143cabdff1aSopenharmony_ci 4144cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4145cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 4146cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4147cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4148cabdff1aSopenharmony_ci src += (4 * 
src_stride); 4149cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4150cabdff1aSopenharmony_ci const20, const6, const3); 4151cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4152cabdff1aSopenharmony_ci const20, const6, const3); 4153cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4154cabdff1aSopenharmony_ci 4155cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4156cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 4157cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4158cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4159cabdff1aSopenharmony_ci 4160cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4161cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 4162cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4163cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4164cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4165cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4166cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4167cabdff1aSopenharmony_ci const20, const6, const3); 4168cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4169cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4170cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4171cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4172cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4173cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4174cabdff1aSopenharmony_ci const20, const6, const3); 4175cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4176cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4177cabdff1aSopenharmony_ci 4178cabdff1aSopenharmony_ci 
inp0 = LD_UB(src); 4179cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4180cabdff1aSopenharmony_ci const20, const6, const3); 4181cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4182cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4183cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 4184cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 4185cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4186cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4187cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4188cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4189cabdff1aSopenharmony_ci const20, const6, const3); 4190cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4191cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4192cabdff1aSopenharmony_ci 4193cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4194cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4195cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4196cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4197cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4198cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4199cabdff1aSopenharmony_ci const20, const6, const3); 4200cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4201cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4202cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4203cabdff1aSopenharmony_ci} 4204cabdff1aSopenharmony_ci 4205cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src, 4206cabdff1aSopenharmony_ci int32_t src_stride, 4207cabdff1aSopenharmony_ci uint8_t *dst, 4208cabdff1aSopenharmony_ci int32_t dst_stride) 4209cabdff1aSopenharmony_ci{ 
4210cabdff1aSopenharmony_ci uint8_t buff[272]; 4211cabdff1aSopenharmony_ci 4212cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 4213cabdff1aSopenharmony_ci vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); 4214cabdff1aSopenharmony_ci} 4215cabdff1aSopenharmony_ci 4216cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src, 4217cabdff1aSopenharmony_ci int32_t src_stride, 4218cabdff1aSopenharmony_ci uint8_t *dst, 4219cabdff1aSopenharmony_ci int32_t dst_stride) 4220cabdff1aSopenharmony_ci{ 4221cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4222cabdff1aSopenharmony_ci v16u8 res0, res1; 4223cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4224cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4225cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4226cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4227cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4228cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4229cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4230cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4231cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4232cabdff1aSopenharmony_ci 4233cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4234cabdff1aSopenharmony_ci src += (2 * src_stride); 4235cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4236cabdff1aSopenharmony_ci const20, const6, const3); 4237cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4238cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 4239cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4240cabdff1aSopenharmony_ci 4241cabdff1aSopenharmony_ci LD_UB2(src, 
src_stride, inp2, inp3); 4242cabdff1aSopenharmony_ci src += (2 * src_stride); 4243cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4244cabdff1aSopenharmony_ci const20, const6, const3); 4245cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4246cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 4247cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4248cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4249cabdff1aSopenharmony_ci src += (2 * src_stride); 4250cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4251cabdff1aSopenharmony_ci const20, const6, const3); 4252cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4253cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 4254cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4255cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4256cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4257cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4258cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4259cabdff1aSopenharmony_ci const20, const6, const3); 4260cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4261cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4262cabdff1aSopenharmony_ci 4263cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4264cabdff1aSopenharmony_ci src += (2 * src_stride); 4265cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4266cabdff1aSopenharmony_ci const20, const6, const3); 4267cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4268cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 4269cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4270cabdff1aSopenharmony_ci res1 = 
APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4271cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4272cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4273cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4274cabdff1aSopenharmony_ci const20, const6, const3); 4275cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4276cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4277cabdff1aSopenharmony_ci const20, const6, const3); 4278cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 4279cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4280cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4281cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4282cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4283cabdff1aSopenharmony_ci const20, const6, const3); 4284cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4285cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4286cabdff1aSopenharmony_ci 4287cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4288cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4289cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4290cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4291cabdff1aSopenharmony_ci const20, const6, const3); 4292cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4293cabdff1aSopenharmony_ci} 4294cabdff1aSopenharmony_ci 4295cabdff1aSopenharmony_cistatic void hv_mc_qpel_16x16_msa(const uint8_t *src, 4296cabdff1aSopenharmony_ci int32_t src_stride, 4297cabdff1aSopenharmony_ci uint8_t *dst, 4298cabdff1aSopenharmony_ci int32_t dst_stride) 4299cabdff1aSopenharmony_ci{ 4300cabdff1aSopenharmony_ci uint8_t buff[272]; 4301cabdff1aSopenharmony_ci 4302cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 4303cabdff1aSopenharmony_ci vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); 
4304cabdff1aSopenharmony_ci} 4305cabdff1aSopenharmony_ci 4306cabdff1aSopenharmony_cistatic void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride, 4307cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 4308cabdff1aSopenharmony_ci{ 4309cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4310cabdff1aSopenharmony_ci v16u8 res0, res1; 4311cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4312cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4313cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4314cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4315cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4316cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4317cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4318cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4319cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4320cabdff1aSopenharmony_ci 4321cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4322cabdff1aSopenharmony_ci src += (2 * src_stride); 4323cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4324cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4325cabdff1aSopenharmony_ci const20, const6, const3); 4326cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4327cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4328cabdff1aSopenharmony_ci src += (2 * src_stride); 4329cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4330cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4331cabdff1aSopenharmony_ci const20, const6, const3); 4332cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4333cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4334cabdff1aSopenharmony_ci src += (2 * src_stride); 
4335cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4336cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4337cabdff1aSopenharmony_ci const20, const6, const3); 4338cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4339cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4340cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4341cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4342cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4343cabdff1aSopenharmony_ci const20, const6, const3); 4344cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4345cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4346cabdff1aSopenharmony_ci 4347cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4348cabdff1aSopenharmony_ci src += (2 * src_stride); 4349cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4350cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4351cabdff1aSopenharmony_ci const20, const6, const3); 4352cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4353cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4354cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4355cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4356cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4357cabdff1aSopenharmony_ci const20, const6, const3); 4358cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4359cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 4360cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4361cabdff1aSopenharmony_ci const20, const6, const3); 4362cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4363cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4364cabdff1aSopenharmony_ci 4365cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4366cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 
4367cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4368cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4369cabdff1aSopenharmony_ci const20, const6, const3); 4370cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4371cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4372cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4373cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4374cabdff1aSopenharmony_ci const20, const6, const3); 4375cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4376cabdff1aSopenharmony_ci} 4377cabdff1aSopenharmony_ci 4378cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src, 4379cabdff1aSopenharmony_ci int32_t src_stride, 4380cabdff1aSopenharmony_ci uint8_t *dst, 4381cabdff1aSopenharmony_ci int32_t dst_stride) 4382cabdff1aSopenharmony_ci{ 4383cabdff1aSopenharmony_ci uint8_t buff[272]; 4384cabdff1aSopenharmony_ci 4385cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4386cabdff1aSopenharmony_ci vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride); 4387cabdff1aSopenharmony_ci} 4388cabdff1aSopenharmony_ci 4389cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src, 4390cabdff1aSopenharmony_ci int32_t src_stride, 4391cabdff1aSopenharmony_ci uint8_t *dst, 4392cabdff1aSopenharmony_ci int32_t dst_stride) 4393cabdff1aSopenharmony_ci{ 4394cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4395cabdff1aSopenharmony_ci v16u8 res0, res1; 4396cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4397cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4398cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4399cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4400cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 
4401cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4402cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4403cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4404cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4405cabdff1aSopenharmony_ci 4406cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4407cabdff1aSopenharmony_ci src += (4 * src_stride); 4408cabdff1aSopenharmony_ci 4409cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4410cabdff1aSopenharmony_ci const20, const6, const3); 4411cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4412cabdff1aSopenharmony_ci const20, const6, const3); 4413cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4414cabdff1aSopenharmony_ci 4415cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4416cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 4417cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4418cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4419cabdff1aSopenharmony_ci 4420cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4421cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 4422cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4423cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4424cabdff1aSopenharmony_ci src += (4 * src_stride); 4425cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4426cabdff1aSopenharmony_ci const20, const6, const3); 4427cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4428cabdff1aSopenharmony_ci const20, const6, const3); 4429cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, 
inp0, inp1); 4430cabdff1aSopenharmony_ci 4431cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4432cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 4433cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4434cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4435cabdff1aSopenharmony_ci 4436cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4437cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 4438cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4439cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4440cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4441cabdff1aSopenharmony_ci const20, const6, const3); 4442cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 4443cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 4444cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4445cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4446cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4447cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4448cabdff1aSopenharmony_ci const20, const6, const3); 4449cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4450cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4451cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4452cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4453cabdff1aSopenharmony_ci const20, const6, const3); 4454cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4455cabdff1aSopenharmony_ci dst += (4 * dst_stride); 4456cabdff1aSopenharmony_ci 4457cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4458cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4459cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, 
horiz2, 4460cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4461cabdff1aSopenharmony_ci const20, const6, const3); 4462cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4463cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4464cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4465cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4466cabdff1aSopenharmony_ci const20, const6, const3); 4467cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4468cabdff1aSopenharmony_ci} 4469cabdff1aSopenharmony_ci 4470cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src, 4471cabdff1aSopenharmony_ci int32_t src_stride, 4472cabdff1aSopenharmony_ci uint8_t *dst, 4473cabdff1aSopenharmony_ci int32_t dst_stride) 4474cabdff1aSopenharmony_ci{ 4475cabdff1aSopenharmony_ci uint8_t buff[272]; 4476cabdff1aSopenharmony_ci 4477cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 4478cabdff1aSopenharmony_ci vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 4479cabdff1aSopenharmony_ci} 4480cabdff1aSopenharmony_ci 4481cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src, 4482cabdff1aSopenharmony_ci int32_t src_stride, 4483cabdff1aSopenharmony_ci uint8_t *dst, 4484cabdff1aSopenharmony_ci int32_t dst_stride) 4485cabdff1aSopenharmony_ci{ 4486cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4487cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 4488cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4489cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4490cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4491cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4492cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4493cabdff1aSopenharmony_ci v16u8 
mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4494cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4495cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4496cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4497cabdff1aSopenharmony_ci 4498cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4499cabdff1aSopenharmony_ci src += (4 * src_stride); 4500cabdff1aSopenharmony_ci 4501cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4502cabdff1aSopenharmony_ci const20, const6, const3); 4503cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4504cabdff1aSopenharmony_ci const20, const6, const3); 4505cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4506cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 4507cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4508cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4509cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 4510cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4511cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4512cabdff1aSopenharmony_ci src += (2 * src_stride); 4513cabdff1aSopenharmony_ci 4514cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4515cabdff1aSopenharmony_ci const20, const6, const3); 4516cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4517cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 4518cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4519cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4520cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4521cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 
4522cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4523cabdff1aSopenharmony_ci const20, const6, const3); 4524cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); 4525cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4526cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4527cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4528cabdff1aSopenharmony_ci 4529cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4530cabdff1aSopenharmony_ci src += (2 * src_stride); 4531cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4532cabdff1aSopenharmony_ci const20, const6, const3); 4533cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4534cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 4535cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4536cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4537cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4538cabdff1aSopenharmony_ci const20, const6, const3); 4539cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 4540cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4541cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4542cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4543cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4544cabdff1aSopenharmony_ci const20, const6, const3); 4545cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4); 4546cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4547cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4548cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4549cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4550cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4551cabdff1aSopenharmony_ci const20, const6, 
const3); 4552cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4553cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4554cabdff1aSopenharmony_ci 4555cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6); 4556cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4557cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4558cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4559cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4560cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4561cabdff1aSopenharmony_ci const20, const6, const3); 4562cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8); 4563cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4564cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4565cabdff1aSopenharmony_ci} 4566cabdff1aSopenharmony_ci 4567cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src, 4568cabdff1aSopenharmony_ci int32_t src_stride, 4569cabdff1aSopenharmony_ci uint8_t *dst, 4570cabdff1aSopenharmony_ci int32_t dst_stride) 4571cabdff1aSopenharmony_ci{ 4572cabdff1aSopenharmony_ci uint8_t buff[272]; 4573cabdff1aSopenharmony_ci 4574cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 4575cabdff1aSopenharmony_ci vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 4576cabdff1aSopenharmony_ci} 4577cabdff1aSopenharmony_ci 4578cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src, 4579cabdff1aSopenharmony_ci int32_t src_stride, 4580cabdff1aSopenharmony_ci uint8_t *dst, 4581cabdff1aSopenharmony_ci int32_t dst_stride) 4582cabdff1aSopenharmony_ci{ 4583cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4584cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 4585cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4586cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, 
horiz7, horiz8; 4587cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4588cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4589cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4590cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4591cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4592cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4593cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4594cabdff1aSopenharmony_ci 4595cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4596cabdff1aSopenharmony_ci src += (2 * src_stride); 4597cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4598cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4599cabdff1aSopenharmony_ci const20, const6, const3); 4600cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4601cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4602cabdff1aSopenharmony_ci src += (2 * src_stride); 4603cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4604cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4605cabdff1aSopenharmony_ci const20, const6, const3); 4606cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4607cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4608cabdff1aSopenharmony_ci src += (2 * src_stride); 4609cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4610cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4611cabdff1aSopenharmony_ci const20, const6, const3); 4612cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4613cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4614cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4615cabdff1aSopenharmony_ci horiz1, 
horiz2, horiz3, horiz4, 4616cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4617cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4618cabdff1aSopenharmony_ci const20, const6, const3); 4619cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); 4620cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4621cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4622cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4623cabdff1aSopenharmony_ci 4624cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4625cabdff1aSopenharmony_ci src += (2 * src_stride); 4626cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4627cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4628cabdff1aSopenharmony_ci const20, const6, const3); 4629cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4630cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4631cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4632cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4633cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4634cabdff1aSopenharmony_ci const20, const6, const3); 4635cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4636cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 4637cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4638cabdff1aSopenharmony_ci const20, const6, const3); 4639cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4); 4640cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4641cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4642cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4643cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4644cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4645cabdff1aSopenharmony_ci const20, const6, const3); 4646cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, 
dst_stride); 4647cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4648cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6); 4649cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4650cabdff1aSopenharmony_ci 4651cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4652cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4653cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4654cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4655cabdff1aSopenharmony_ci const20, const6, const3); 4656cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8); 4657cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4658cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4659cabdff1aSopenharmony_ci} 4660cabdff1aSopenharmony_ci 4661cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src, 4662cabdff1aSopenharmony_ci int32_t src_stride, 4663cabdff1aSopenharmony_ci uint8_t *dst, 4664cabdff1aSopenharmony_ci int32_t dst_stride) 4665cabdff1aSopenharmony_ci{ 4666cabdff1aSopenharmony_ci uint8_t buff[272]; 4667cabdff1aSopenharmony_ci 4668cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4669cabdff1aSopenharmony_ci vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 4670cabdff1aSopenharmony_ci} 4671cabdff1aSopenharmony_ci 4672cabdff1aSopenharmony_cistatic void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, 4673cabdff1aSopenharmony_ci int32_t src_stride, 4674cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 4675cabdff1aSopenharmony_ci{ 4676cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4677cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 4678cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4679cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4680cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 
4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4681cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4682cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4683cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4684cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4685cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4686cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4687cabdff1aSopenharmony_ci 4688cabdff1aSopenharmony_ci LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); 4689cabdff1aSopenharmony_ci src += (4 * src_stride); 4690cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4691cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4692cabdff1aSopenharmony_ci const20, const6, const3); 4693cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4694cabdff1aSopenharmony_ci 4695cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4696cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 4697cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4698cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4699cabdff1aSopenharmony_ci const20, const6, const3); 4700cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4701cabdff1aSopenharmony_ci 4702cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4703cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 4704cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4705cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4706cabdff1aSopenharmony_ci src += (2 * src_stride); 4707cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4708cabdff1aSopenharmony_ci const20, const6, const3); 
4709cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 4710cabdff1aSopenharmony_ci 4711cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); 4712cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 4713cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4714cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4715cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4716cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4717cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4718cabdff1aSopenharmony_ci const20, const6, const3); 4719cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 4720cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4721cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4722cabdff1aSopenharmony_ci src += (2 * src_stride); 4723cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4724cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4725cabdff1aSopenharmony_ci 4726cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4727cabdff1aSopenharmony_ci const20, const6, const3); 4728cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 4729cabdff1aSopenharmony_ci 4730cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); 4731cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 4732cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4733cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4734cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4735cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4736cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4737cabdff1aSopenharmony_ci const20, const6, const3); 4738cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) 
horiz3); 4739cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4740cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4741cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4742cabdff1aSopenharmony_ci const20, const6, const3); 4743cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 4744cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 4745cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4746cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4747cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4748cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4749cabdff1aSopenharmony_ci const20, const6, const3); 4750cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4751cabdff1aSopenharmony_ci dst += 2 * dst_stride; 4752cabdff1aSopenharmony_ci 4753cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 4754cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4755cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4756cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4757cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4758cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4759cabdff1aSopenharmony_ci const20, const6, const3); 4760cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 4761cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4762cabdff1aSopenharmony_ci ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); 4763cabdff1aSopenharmony_ci} 4764cabdff1aSopenharmony_ci 4765cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src, 4766cabdff1aSopenharmony_ci int32_t src_stride, 4767cabdff1aSopenharmony_ci uint8_t *dst, 4768cabdff1aSopenharmony_ci int32_t dst_stride) 4769cabdff1aSopenharmony_ci{ 4770cabdff1aSopenharmony_ci uint8_t buff[272]; 
4771cabdff1aSopenharmony_ci 4772cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 4773cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4774cabdff1aSopenharmony_ci} 4775cabdff1aSopenharmony_ci 4776cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src, 4777cabdff1aSopenharmony_ci int32_t src_stride, 4778cabdff1aSopenharmony_ci uint8_t *dst, 4779cabdff1aSopenharmony_ci int32_t dst_stride) 4780cabdff1aSopenharmony_ci{ 4781cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4782cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 4783cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4784cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4785cabdff1aSopenharmony_ci v16u8 dst0, dst1; 4786cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4787cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4788cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4789cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4790cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4791cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4792cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4793cabdff1aSopenharmony_ci 4794cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4795cabdff1aSopenharmony_ci src += (2 * src_stride); 4796cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4797cabdff1aSopenharmony_ci const20, const6, const3); 4798cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4799cabdff1aSopenharmony_ci src += (2 * src_stride); 4800cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4801cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 
4802cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4803cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4804cabdff1aSopenharmony_ci const20, const6, const3); 4805cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4806cabdff1aSopenharmony_ci src += (2 * src_stride); 4807cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4808cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 4809cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4810cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 4811cabdff1aSopenharmony_ci const20, const6, const3); 4812cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 4813cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 4814cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4815cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4816cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4817cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4818cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4819cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4820cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4821cabdff1aSopenharmony_ci const20, const6, const3); 4822cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4823cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4824cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4825cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4826cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4827cabdff1aSopenharmony_ci 4828cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4829cabdff1aSopenharmony_ci src += (2 * src_stride); 4830cabdff1aSopenharmony_ci res1 = 
APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 4831cabdff1aSopenharmony_ci const20, const6, const3); 4832cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 4833cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 4834cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4835cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4836cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4837cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4838cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4839cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4840cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4841cabdff1aSopenharmony_ci const20, const6, const3); 4842cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4843cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4844cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4845cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4846cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4847cabdff1aSopenharmony_ci 4848cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4849cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 4850cabdff1aSopenharmony_ci const20, const6, const3); 4851cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 4852cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4853cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4854cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4855cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4856cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4857cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4858cabdff1aSopenharmony_ci const20, const6, const3); 4859cabdff1aSopenharmony_ci res0 = 
__msa_aver_u_b(avg0, res0); 4860cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4861cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4862cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4863cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4864cabdff1aSopenharmony_ci 4865cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4866cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4867cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4868cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4869cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4870cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4871cabdff1aSopenharmony_ci const20, const6, const3); 4872cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4873cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4874cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4875cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4876cabdff1aSopenharmony_ci} 4877cabdff1aSopenharmony_ci 4878cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src, 4879cabdff1aSopenharmony_ci int32_t src_stride, 4880cabdff1aSopenharmony_ci uint8_t *dst, 4881cabdff1aSopenharmony_ci int32_t dst_stride) 4882cabdff1aSopenharmony_ci{ 4883cabdff1aSopenharmony_ci uint8_t buff[272]; 4884cabdff1aSopenharmony_ci 4885cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 4886cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4887cabdff1aSopenharmony_ci} 4888cabdff1aSopenharmony_ci 4889cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src, 4890cabdff1aSopenharmony_ci int32_t src_stride, 4891cabdff1aSopenharmony_ci uint8_t *dst, 4892cabdff1aSopenharmony_ci int32_t dst_stride) 4893cabdff1aSopenharmony_ci{ 
4894cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 4895cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 4896cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 4897cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 4898cabdff1aSopenharmony_ci v16u8 dst0, dst1; 4899cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 4900cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 4901cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 4902cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 4903cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 4904cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 4905cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 4906cabdff1aSopenharmony_ci 4907cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4908cabdff1aSopenharmony_ci src += (2 * src_stride); 4909cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4910cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4911cabdff1aSopenharmony_ci const20, const6, const3); 4912cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4913cabdff1aSopenharmony_ci src += (2 * src_stride); 4914cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 4915cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4916cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4917cabdff1aSopenharmony_ci const20, const6, const3); 4918cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 4919cabdff1aSopenharmony_ci src += (2 * src_stride); 4920cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 4921cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 4922cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4923cabdff1aSopenharmony_ci const20, const6, const3); 
4924cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 4925cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4926cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 4927cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 4928cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 4929cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 4930cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 4931cabdff1aSopenharmony_ci const20, const6, const3); 4932cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4933cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4934cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4935cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4936cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4937cabdff1aSopenharmony_ci 4938cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 4939cabdff1aSopenharmony_ci src += (2 * src_stride); 4940cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 4941cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4942cabdff1aSopenharmony_ci const20, const6, const3); 4943cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 4944cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4945cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 4946cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 4947cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 4948cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 4949cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 4950cabdff1aSopenharmony_ci const20, const6, const3); 4951cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4952cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4953cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, 
res1); 4954cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 4955cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4956cabdff1aSopenharmony_ci 4957cabdff1aSopenharmony_ci inp0 = LD_UB(src); 4958cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 4959cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 4960cabdff1aSopenharmony_ci const20, const6, const3); 4961cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4962cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 4963cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 4964cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 4965cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 4966cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 4967cabdff1aSopenharmony_ci const20, const6, const3); 4968cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4969cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4970cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 4971cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 4972cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4973cabdff1aSopenharmony_ci 4974cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 4975cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 4976cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 4977cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 4978cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 4979cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 4980cabdff1aSopenharmony_ci const20, const6, const3); 4981cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4982cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 4983cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 4984cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 
4985cabdff1aSopenharmony_ci} 4986cabdff1aSopenharmony_ci 4987cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src, 4988cabdff1aSopenharmony_ci int32_t src_stride, 4989cabdff1aSopenharmony_ci uint8_t *dst, 4990cabdff1aSopenharmony_ci int32_t dst_stride) 4991cabdff1aSopenharmony_ci{ 4992cabdff1aSopenharmony_ci uint8_t buff[272]; 4993cabdff1aSopenharmony_ci 4994cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 4995cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride); 4996cabdff1aSopenharmony_ci} 4997cabdff1aSopenharmony_ci 4998cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, 4999cabdff1aSopenharmony_ci int32_t src_stride, 5000cabdff1aSopenharmony_ci uint8_t *dst, 5001cabdff1aSopenharmony_ci int32_t dst_stride) 5002cabdff1aSopenharmony_ci{ 5003cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 5004cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 5005cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 5006cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5007cabdff1aSopenharmony_ci v16u8 dst0, dst1; 5008cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5009cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5010cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5011cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5012cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 5013cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 5014cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 5015cabdff1aSopenharmony_ci 5016cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5017cabdff1aSopenharmony_ci src += (2 * src_stride); 5018cabdff1aSopenharmony_ci res0 = 
APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5019cabdff1aSopenharmony_ci const20, const6, const3); 5020cabdff1aSopenharmony_ci 5021cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5022cabdff1aSopenharmony_ci src += (2 * src_stride); 5023cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5024cabdff1aSopenharmony_ci 5025cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5026cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 5027cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5028cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5029cabdff1aSopenharmony_ci const20, const6, const3); 5030cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5031cabdff1aSopenharmony_ci src += (2 * src_stride); 5032cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5033cabdff1aSopenharmony_ci 5034cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5035cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 5036cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5037cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5038cabdff1aSopenharmony_ci const20, const6, const3); 5039cabdff1aSopenharmony_ci 5040cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5041cabdff1aSopenharmony_ci 5042cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5043cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 5044cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5045cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5046cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); 5047cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 
5048cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 5049cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 5050cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 5051cabdff1aSopenharmony_ci const20, const6, const3); 5052cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5053cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5054cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5055cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5056cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5057cabdff1aSopenharmony_ci 5058cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5059cabdff1aSopenharmony_ci src += (2 * src_stride); 5060cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5061cabdff1aSopenharmony_ci const20, const6, const3); 5062cabdff1aSopenharmony_ci 5063cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5064cabdff1aSopenharmony_ci 5065cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5066cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 5067cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5068cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5069cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); 5070cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5071cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 5072cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 5073cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 5074cabdff1aSopenharmony_ci const20, const6, const3); 5075cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5076cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5077cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5078cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 
5079cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5080cabdff1aSopenharmony_ci 5081cabdff1aSopenharmony_ci inp0 = LD_UB(src); 5082cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5083cabdff1aSopenharmony_ci const20, const6, const3); 5084cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 5085cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 5086cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5087cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); 5088cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5089cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 5090cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 5091cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 5092cabdff1aSopenharmony_ci const20, const6, const3); 5093cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5094cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5095cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5096cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5097cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5098cabdff1aSopenharmony_ci 5099cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5100cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); 5101cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5102cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 5103cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 5104cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 5105cabdff1aSopenharmony_ci const20, const6, const3); 5106cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5107cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5108cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 
5109cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5110cabdff1aSopenharmony_ci} 5111cabdff1aSopenharmony_ci 5112cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src, 5113cabdff1aSopenharmony_ci int32_t src_stride, 5114cabdff1aSopenharmony_ci uint8_t *dst, 5115cabdff1aSopenharmony_ci int32_t dst_stride) 5116cabdff1aSopenharmony_ci{ 5117cabdff1aSopenharmony_ci uint8_t buff[272]; 5118cabdff1aSopenharmony_ci 5119cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 5120cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); 5121cabdff1aSopenharmony_ci} 5122cabdff1aSopenharmony_ci 5123cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src, 5124cabdff1aSopenharmony_ci int32_t src_stride, 5125cabdff1aSopenharmony_ci uint8_t *dst, 5126cabdff1aSopenharmony_ci int32_t dst_stride) 5127cabdff1aSopenharmony_ci{ 5128cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 5129cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 5130cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 5131cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5132cabdff1aSopenharmony_ci v16u8 dst0, dst1; 5133cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5134cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5135cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5136cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5137cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 5138cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 5139cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 5140cabdff1aSopenharmony_ci 5141cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5142cabdff1aSopenharmony_ci src += (2 * src_stride); 
5143cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5144cabdff1aSopenharmony_ci const20, const6, const3); 5145cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5146cabdff1aSopenharmony_ci src += (2 * src_stride); 5147cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5148cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 5149cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5150cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5151cabdff1aSopenharmony_ci const20, const6, const3); 5152cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5153cabdff1aSopenharmony_ci src += (2 * src_stride); 5154cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5155cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 5156cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5157cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5158cabdff1aSopenharmony_ci const20, const6, const3); 5159cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5160cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 5161cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5162cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5163cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5164cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 5165cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 5166cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 5167cabdff1aSopenharmony_ci const20, const6, const3); 5168cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5169cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5170cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, 
dst_stride); 5171cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5172cabdff1aSopenharmony_ci 5173cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5174cabdff1aSopenharmony_ci src += (2 * src_stride); 5175cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5176cabdff1aSopenharmony_ci const20, const6, const3); 5177cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5178cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 5179cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5180cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5181cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5182cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 5183cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 5184cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 5185cabdff1aSopenharmony_ci const20, const6, const3); 5186cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5187cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5188cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5189cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5190cabdff1aSopenharmony_ci 5191cabdff1aSopenharmony_ci inp0 = LD_UB(src); 5192cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5193cabdff1aSopenharmony_ci const20, const6, const3); 5194cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 5195cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5196cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5197cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 5198cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 5199cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 5200cabdff1aSopenharmony_ci const20, const6, const3); 5201cabdff1aSopenharmony_ci avg0 = (v16u8) 
__msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5202cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5203cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5204cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5205cabdff1aSopenharmony_ci 5206cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5207cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5208cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 5209cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 5210cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 5211cabdff1aSopenharmony_ci const20, const6, const3); 5212cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5213cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5214cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5215cabdff1aSopenharmony_ci} 5216cabdff1aSopenharmony_ci 5217cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride, 5218cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 5219cabdff1aSopenharmony_ci{ 5220cabdff1aSopenharmony_ci uint8_t buff[272]; 5221cabdff1aSopenharmony_ci 5222cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 5223cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); 5224cabdff1aSopenharmony_ci 5225cabdff1aSopenharmony_ci} 5226cabdff1aSopenharmony_ci 5227cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride, 5228cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 5229cabdff1aSopenharmony_ci{ 5230cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 5231cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 5232cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 5233cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5234cabdff1aSopenharmony_ci v16u8 dst0, dst1; 5235cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 
1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5236cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5237cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5238cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5239cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 5240cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 5241cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 5242cabdff1aSopenharmony_ci 5243cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5244cabdff1aSopenharmony_ci src += (2 * src_stride); 5245cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5246cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5247cabdff1aSopenharmony_ci const20, const6, const3); 5248cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5249cabdff1aSopenharmony_ci src += (2 * src_stride); 5250cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5251cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5252cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5253cabdff1aSopenharmony_ci const20, const6, const3); 5254cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5255cabdff1aSopenharmony_ci src += (2 * src_stride); 5256cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5257cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5258cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5259cabdff1aSopenharmony_ci const20, const6, const3); 5260cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5261cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5262cabdff1aSopenharmony_ci src += (2 * src_stride); 5263cabdff1aSopenharmony_ci horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5264cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 
5265cabdff1aSopenharmony_ci const20, const6, const3); 5266cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5267cabdff1aSopenharmony_ci inp0 = LD_UB(src); 5268cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 5269cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5270cabdff1aSopenharmony_ci const20, const6, const3); 5271cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5272cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5273cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 5274cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 5275cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 5276cabdff1aSopenharmony_ci const20, const6, const3); 5277cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5278cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5279cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5280cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5281cabdff1aSopenharmony_ci 5282cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5283cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5284cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 5285cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 5286cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 5287cabdff1aSopenharmony_ci const20, const6, const3); 5288cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5289cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5290cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5291cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5292cabdff1aSopenharmony_ci 5293cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5294cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5295cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 5296cabdff1aSopenharmony_ci 
horiz5, horiz4, horiz3, horiz2, 5297cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 5298cabdff1aSopenharmony_ci const20, const6, const3); 5299cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5300cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5301cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5302cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5303cabdff1aSopenharmony_ci 5304cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5305cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5306cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 5307cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 5308cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 5309cabdff1aSopenharmony_ci const20, const6, const3); 5310cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5311cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5312cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5313cabdff1aSopenharmony_ci} 5314cabdff1aSopenharmony_ci 5315cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src, 5316cabdff1aSopenharmony_ci int32_t src_stride, 5317cabdff1aSopenharmony_ci uint8_t *dst, 5318cabdff1aSopenharmony_ci int32_t dst_stride) 5319cabdff1aSopenharmony_ci{ 5320cabdff1aSopenharmony_ci uint8_t buff[272]; 5321cabdff1aSopenharmony_ci 5322cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 5323cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride); 5324cabdff1aSopenharmony_ci} 5325cabdff1aSopenharmony_ci 5326cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, 5327cabdff1aSopenharmony_ci int32_t src_stride, 5328cabdff1aSopenharmony_ci uint8_t *dst, 5329cabdff1aSopenharmony_ci int32_t dst_stride) 5330cabdff1aSopenharmony_ci{ 5331cabdff1aSopenharmony_ci v16u8 inp0, inp1, 
inp2, inp3; 5332cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 5333cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 5334cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5335cabdff1aSopenharmony_ci v16u8 dst0, dst1; 5336cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5337cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5338cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5339cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5340cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 5341cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 5342cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 5343cabdff1aSopenharmony_ci 5344cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5345cabdff1aSopenharmony_ci src += (2 * src_stride); 5346cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5347cabdff1aSopenharmony_ci const20, const6, const3); 5348cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5349cabdff1aSopenharmony_ci src += (2 * src_stride); 5350cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5351cabdff1aSopenharmony_ci 5352cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5353cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 5354cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5355cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5356cabdff1aSopenharmony_ci const20, const6, const3); 5357cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5358cabdff1aSopenharmony_ci src += (2 * src_stride); 5359cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5360cabdff1aSopenharmony_ci 
5361cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5362cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 5363cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5364cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5365cabdff1aSopenharmony_ci const20, const6, const3); 5366cabdff1aSopenharmony_ci 5367cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5368cabdff1aSopenharmony_ci 5369cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5370cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 5371cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5372cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5373cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5374cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 5375cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 5376cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 5377cabdff1aSopenharmony_ci const20, const6, const3); 5378cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5379cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5380cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5381cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5382cabdff1aSopenharmony_ci 5383cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5384cabdff1aSopenharmony_ci src += (2 * src_stride); 5385cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5386cabdff1aSopenharmony_ci const20, const6, const3); 5387cabdff1aSopenharmony_ci 5388cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5389cabdff1aSopenharmony_ci 5390cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5391cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, 
res1); 5392cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5393cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5394cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5395cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 5396cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 5397cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 5398cabdff1aSopenharmony_ci const20, const6, const3); 5399cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5400cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5401cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5402cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5403cabdff1aSopenharmony_ci 5404cabdff1aSopenharmony_ci inp0 = LD_UB(src); 5405cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5406cabdff1aSopenharmony_ci const20, const6, const3); 5407cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 5408cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 5409cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5410cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5411cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 5412cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 5413cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 5414cabdff1aSopenharmony_ci const20, const6, const3); 5415cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5416cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5417cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5418cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5419cabdff1aSopenharmony_ci 5420cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5421cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 
5422cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 5423cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 5424cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 5425cabdff1aSopenharmony_ci const20, const6, const3); 5426cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5427cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5428cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5429cabdff1aSopenharmony_ci} 5430cabdff1aSopenharmony_ci 5431cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src, 5432cabdff1aSopenharmony_ci int32_t src_stride, 5433cabdff1aSopenharmony_ci uint8_t *dst, 5434cabdff1aSopenharmony_ci int32_t dst_stride) 5435cabdff1aSopenharmony_ci{ 5436cabdff1aSopenharmony_ci uint8_t buff[272]; 5437cabdff1aSopenharmony_ci 5438cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16); 5439cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 5440cabdff1aSopenharmony_ci} 5441cabdff1aSopenharmony_ci 5442cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src, 5443cabdff1aSopenharmony_ci int32_t src_stride, 5444cabdff1aSopenharmony_ci uint8_t *dst, 5445cabdff1aSopenharmony_ci int32_t dst_stride) 5446cabdff1aSopenharmony_ci{ 5447cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 5448cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 5449cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 5450cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5451cabdff1aSopenharmony_ci v16u8 dst0, dst1; 5452cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5453cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5454cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5455cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 
1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5456cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 5457cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 5458cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 5459cabdff1aSopenharmony_ci 5460cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5461cabdff1aSopenharmony_ci src += (2 * src_stride); 5462cabdff1aSopenharmony_ci 5463cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5464cabdff1aSopenharmony_ci const20, const6, const3); 5465cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5466cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 5467cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5468cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5469cabdff1aSopenharmony_ci src += (2 * src_stride); 5470cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5471cabdff1aSopenharmony_ci const20, const6, const3); 5472cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5473cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 5474cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5475cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5476cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5477cabdff1aSopenharmony_ci src += (2 * src_stride); 5478cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5479cabdff1aSopenharmony_ci const20, const6, const3); 5480cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5481cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 5482cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5483cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 
5484cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 5485cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 5486cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 5487cabdff1aSopenharmony_ci const20, const6, const3); 5488cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 5489cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5490cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5491cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5492cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5493cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5494cabdff1aSopenharmony_ci 5495cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5496cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5497cabdff1aSopenharmony_ci src += (2 * src_stride); 5498cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5499cabdff1aSopenharmony_ci const20, const6, const3); 5500cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5501cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 5502cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5503cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5504cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 5505cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 5506cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 5507cabdff1aSopenharmony_ci const20, const6, const3); 5508cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 5509cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5510cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5511cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5512cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5513cabdff1aSopenharmony_ci dst += (2 * dst_stride); 
5514cabdff1aSopenharmony_ci 5515cabdff1aSopenharmony_ci inp0 = LD_UB(src); 5516cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5517cabdff1aSopenharmony_ci const20, const6, const3); 5518cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 5519cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, 5520cabdff1aSopenharmony_ci horiz5, horiz6, horiz7, horiz8, 5521cabdff1aSopenharmony_ci horiz5, horiz4, horiz3, horiz2, 5522cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz8, 5523cabdff1aSopenharmony_ci const20, const6, const3); 5524cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, 5525cabdff1aSopenharmony_ci horiz7, horiz8, horiz8, horiz7, 5526cabdff1aSopenharmony_ci horiz7, horiz6, horiz5, horiz4, 5527cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz6, 5528cabdff1aSopenharmony_ci const20, const6, const3); 5529cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 5530cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5531cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5532cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5533cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5534cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5535cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5536cabdff1aSopenharmony_ci 5537cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 5538cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5539cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5540cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5541cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5542cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5543cabdff1aSopenharmony_ci} 5544cabdff1aSopenharmony_ci 5545cabdff1aSopenharmony_cistatic void 
hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src, 5546cabdff1aSopenharmony_ci int32_t src_stride, 5547cabdff1aSopenharmony_ci uint8_t *dst, 5548cabdff1aSopenharmony_ci int32_t dst_stride) 5549cabdff1aSopenharmony_ci{ 5550cabdff1aSopenharmony_ci uint8_t buff[272]; 5551cabdff1aSopenharmony_ci 5552cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16); 5553cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 5554cabdff1aSopenharmony_ci} 5555cabdff1aSopenharmony_ci 5556cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src, 5557cabdff1aSopenharmony_ci int32_t src_stride, 5558cabdff1aSopenharmony_ci uint8_t *dst, 5559cabdff1aSopenharmony_ci int32_t dst_stride) 5560cabdff1aSopenharmony_ci{ 5561cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 5562cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 5563cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 5564cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5565cabdff1aSopenharmony_ci v16u8 dst0, dst1; 5566cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5567cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5568cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5569cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5570cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 5571cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 5572cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 5573cabdff1aSopenharmony_ci 5574cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5575cabdff1aSopenharmony_ci src += (2 * src_stride); 5576cabdff1aSopenharmony_ci horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5577cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 
5578cabdff1aSopenharmony_ci const20, const6, const3); 5579cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5580cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5581cabdff1aSopenharmony_ci src += (2 * src_stride); 5582cabdff1aSopenharmony_ci horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5583cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5584cabdff1aSopenharmony_ci const20, const6, const3); 5585cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5586cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5587cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5588cabdff1aSopenharmony_ci src += (2 * src_stride); 5589cabdff1aSopenharmony_ci horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, 5590cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5591cabdff1aSopenharmony_ci const20, const6, const3); 5592cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5593cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, 5594cabdff1aSopenharmony_ci horiz1, horiz2, horiz3, horiz4, 5595cabdff1aSopenharmony_ci horiz1, horiz0, horiz0, horiz1, 5596cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz5, 5597cabdff1aSopenharmony_ci const20, const6, const3); 5598cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 5599cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5600cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5601cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5602cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5603cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5604cabdff1aSopenharmony_ci 5605cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5606cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5607cabdff1aSopenharmony_ci src += (2 * src_stride); 5608cabdff1aSopenharmony_ci horiz6 = 
APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, 5609cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5610cabdff1aSopenharmony_ci const20, const6, const3); 5611cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5612cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, 5613cabdff1aSopenharmony_ci horiz3, horiz4, horiz5, horiz6, 5614cabdff1aSopenharmony_ci horiz3, horiz2, horiz1, horiz0, 5615cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz7, 5616cabdff1aSopenharmony_ci const20, const6, const3); 5617cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 5618cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5619cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5620cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5621cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5622cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5623cabdff1aSopenharmony_ci 5624cabdff1aSopenharmony_ci inp0 = LD_UB(src); 5625cabdff1aSopenharmony_ci horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, 5626cabdff1aSopenharmony_ci mask0, mask1, mask2, mask3, 5627cabdff1aSopenharmony_ci const20, const6, const3); 5628cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5, 5629cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz5, horiz4, 5630cabdff1aSopenharmony_ci horiz3, horiz2, horiz6, horiz7, horiz8, 5631cabdff1aSopenharmony_ci horiz8, const20, const6, const3); 5632cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, 5633cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz7, horiz6, 5634cabdff1aSopenharmony_ci horiz5, horiz4, horiz8, horiz8, horiz7, 5635cabdff1aSopenharmony_ci horiz6, const20, const6, const3); 5636cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 5637cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 
5638cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5639cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5640cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5641cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5642cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5643cabdff1aSopenharmony_ci 5644cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 5645cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5646cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5647cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5648cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5649cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5650cabdff1aSopenharmony_ci} 5651cabdff1aSopenharmony_ci 5652cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src, 5653cabdff1aSopenharmony_ci int32_t src_stride, 5654cabdff1aSopenharmony_ci uint8_t *dst, 5655cabdff1aSopenharmony_ci int32_t dst_stride) 5656cabdff1aSopenharmony_ci{ 5657cabdff1aSopenharmony_ci uint8_t buff[272]; 5658cabdff1aSopenharmony_ci 5659cabdff1aSopenharmony_ci hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16); 5660cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride); 5661cabdff1aSopenharmony_ci} 5662cabdff1aSopenharmony_ci 5663cabdff1aSopenharmony_cistatic void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, 5664cabdff1aSopenharmony_ci int32_t src_stride, 5665cabdff1aSopenharmony_ci uint8_t *dst, 5666cabdff1aSopenharmony_ci int32_t dst_stride) 5667cabdff1aSopenharmony_ci{ 5668cabdff1aSopenharmony_ci v16u8 inp0, inp1, inp2, inp3; 5669cabdff1aSopenharmony_ci v16u8 res0, res1, avg0, avg1; 5670cabdff1aSopenharmony_ci v16u8 horiz0, horiz1, horiz2, horiz3; 5671cabdff1aSopenharmony_ci v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; 5672cabdff1aSopenharmony_ci v16u8 
dst0, dst1; 5673cabdff1aSopenharmony_ci v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 5674cabdff1aSopenharmony_ci v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 }; 5675cabdff1aSopenharmony_ci v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 }; 5676cabdff1aSopenharmony_ci v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 }; 5677cabdff1aSopenharmony_ci v16u8 const20 = (v16u8) __msa_ldi_b(20); 5678cabdff1aSopenharmony_ci v16u8 const6 = (v16u8) __msa_ldi_b(6); 5679cabdff1aSopenharmony_ci v16u8 const3 = (v16u8) __msa_ldi_b(3); 5680cabdff1aSopenharmony_ci 5681cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5682cabdff1aSopenharmony_ci src += (2 * src_stride); 5683cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5684cabdff1aSopenharmony_ci const20, const6, const3); 5685cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5686cabdff1aSopenharmony_ci src += (2 * src_stride); 5687cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5688cabdff1aSopenharmony_ci 5689cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5690cabdff1aSopenharmony_ci horiz0 = __msa_aver_u_b(inp0, res0); 5691cabdff1aSopenharmony_ci horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); 5692cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5693cabdff1aSopenharmony_ci const20, const6, const3); 5694cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp0, inp1); 5695cabdff1aSopenharmony_ci src += (2 * src_stride); 5696cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5697cabdff1aSopenharmony_ci 5698cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5699cabdff1aSopenharmony_ci horiz2 = __msa_aver_u_b(inp2, res1); 5700cabdff1aSopenharmony_ci horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); 5701cabdff1aSopenharmony_ci res0 = 
APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, 5702cabdff1aSopenharmony_ci const20, const6, const3); 5703cabdff1aSopenharmony_ci SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); 5704cabdff1aSopenharmony_ci 5705cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); 5706cabdff1aSopenharmony_ci horiz4 = __msa_aver_u_b(inp0, res0); 5707cabdff1aSopenharmony_ci horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); 5708cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5709cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); 5710cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1, 5711cabdff1aSopenharmony_ci horiz2, horiz3, horiz4, horiz1, horiz0, 5712cabdff1aSopenharmony_ci horiz0, horiz1, horiz2, horiz3, horiz4, 5713cabdff1aSopenharmony_ci horiz5, const20, const6, const3); 5714cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5715cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5716cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5717cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5718cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5719cabdff1aSopenharmony_ci 5720cabdff1aSopenharmony_ci LD_UB2(src, src_stride, inp2, inp3); 5721cabdff1aSopenharmony_ci src += (2 * src_stride); 5722cabdff1aSopenharmony_ci res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, 5723cabdff1aSopenharmony_ci const20, const6, const3); 5724cabdff1aSopenharmony_ci SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); 5725cabdff1aSopenharmony_ci 5726cabdff1aSopenharmony_ci inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); 5727cabdff1aSopenharmony_ci horiz6 = __msa_aver_u_b(inp2, res1); 5728cabdff1aSopenharmony_ci horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1); 5729cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5730cabdff1aSopenharmony_ci avg1 = (v16u8) 
__msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); 5731cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3, 5732cabdff1aSopenharmony_ci horiz4, horiz5, horiz6, horiz3, horiz2, 5733cabdff1aSopenharmony_ci horiz1, horiz0, horiz4, horiz5, horiz6, 5734cabdff1aSopenharmony_ci horiz7, const20, const6, const3); 5735cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5736cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5737cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5738cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5739cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5740cabdff1aSopenharmony_ci 5741cabdff1aSopenharmony_ci inp0 = LD_UB(src); 5742cabdff1aSopenharmony_ci res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, 5743cabdff1aSopenharmony_ci const20, const6, const3); 5744cabdff1aSopenharmony_ci inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); 5745cabdff1aSopenharmony_ci horiz8 = __msa_aver_u_b(inp0, res0); 5746cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5747cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); 5748cabdff1aSopenharmony_ci res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5, 5749cabdff1aSopenharmony_ci horiz6, horiz7, horiz8, horiz5, horiz4, 5750cabdff1aSopenharmony_ci horiz3, horiz2, horiz6, horiz7, horiz8, 5751cabdff1aSopenharmony_ci horiz8, const20, const6, const3); 5752cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5753cabdff1aSopenharmony_ci avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5754cabdff1aSopenharmony_ci res0 = __msa_aver_u_b(avg0, res0); 5755cabdff1aSopenharmony_ci ST_D2(res0, 0, 1, dst, dst_stride); 5756cabdff1aSopenharmony_ci dst += (2 * dst_stride); 5757cabdff1aSopenharmony_ci 5758cabdff1aSopenharmony_ci LD_UB2(dst, dst_stride, dst0, dst1); 5759cabdff1aSopenharmony_ci avg1 = (v16u8) 
__msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); 5760cabdff1aSopenharmony_ci res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, 5761cabdff1aSopenharmony_ci horiz8, horiz8, horiz7, horiz7, horiz6, 5762cabdff1aSopenharmony_ci horiz5, horiz4, horiz8, horiz8, horiz7, 5763cabdff1aSopenharmony_ci horiz6, const20, const6, const3); 5764cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5765cabdff1aSopenharmony_ci avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); 5766cabdff1aSopenharmony_ci res1 = __msa_aver_u_b(avg1, res1); 5767cabdff1aSopenharmony_ci ST_D2(res1, 0, 1, dst, dst_stride); 5768cabdff1aSopenharmony_ci} 5769cabdff1aSopenharmony_ci 5770cabdff1aSopenharmony_cistatic void copy_8x8_msa(const uint8_t *src, int32_t src_stride, 5771cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 5772cabdff1aSopenharmony_ci{ 5773cabdff1aSopenharmony_ci uint64_t src0, src1; 5774cabdff1aSopenharmony_ci int32_t loop_cnt; 5775cabdff1aSopenharmony_ci 5776cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 5777cabdff1aSopenharmony_ci src0 = LD(src); 5778cabdff1aSopenharmony_ci src += src_stride; 5779cabdff1aSopenharmony_ci src1 = LD(src); 5780cabdff1aSopenharmony_ci src += src_stride; 5781cabdff1aSopenharmony_ci 5782cabdff1aSopenharmony_ci SD(src0, dst); 5783cabdff1aSopenharmony_ci dst += dst_stride; 5784cabdff1aSopenharmony_ci SD(src1, dst); 5785cabdff1aSopenharmony_ci dst += dst_stride; 5786cabdff1aSopenharmony_ci } 5787cabdff1aSopenharmony_ci} 5788cabdff1aSopenharmony_ci 5789cabdff1aSopenharmony_cistatic void copy_16x16_msa(const uint8_t *src, int32_t src_stride, 5790cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride) 5791cabdff1aSopenharmony_ci{ 5792cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 5793cabdff1aSopenharmony_ci v16u8 src8, src9, src10, src11, src12, src13, src14, src15; 5794cabdff1aSopenharmony_ci 5795cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, 
src6, src7); 5796cabdff1aSopenharmony_ci src += (8 * src_stride); 5797cabdff1aSopenharmony_ci LD_UB8(src, src_stride, 5798cabdff1aSopenharmony_ci src8, src9, src10, src11, src12, src13, src14, src15); 5799cabdff1aSopenharmony_ci 5800cabdff1aSopenharmony_ci ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); 5801cabdff1aSopenharmony_ci dst += (8 * dst_stride); 5802cabdff1aSopenharmony_ci ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, 5803cabdff1aSopenharmony_ci dst, dst_stride); 5804cabdff1aSopenharmony_ci} 5805cabdff1aSopenharmony_ci 5806cabdff1aSopenharmony_cistatic void avg_width8_msa(const uint8_t *src, int32_t src_stride, 5807cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 5808cabdff1aSopenharmony_ci int32_t height) 5809cabdff1aSopenharmony_ci{ 5810cabdff1aSopenharmony_ci int32_t cnt; 5811cabdff1aSopenharmony_ci uint64_t out0, out1, out2, out3; 5812cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 5813cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 5814cabdff1aSopenharmony_ci 5815cabdff1aSopenharmony_ci for (cnt = (height / 4); cnt--;) { 5816cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src0, src1, src2, src3); 5817cabdff1aSopenharmony_ci src += (4 * src_stride); 5818cabdff1aSopenharmony_ci LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 5819cabdff1aSopenharmony_ci 5820cabdff1aSopenharmony_ci AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, 5821cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 5822cabdff1aSopenharmony_ci 5823cabdff1aSopenharmony_ci out0 = __msa_copy_u_d((v2i64) dst0, 0); 5824cabdff1aSopenharmony_ci out1 = __msa_copy_u_d((v2i64) dst1, 0); 5825cabdff1aSopenharmony_ci out2 = __msa_copy_u_d((v2i64) dst2, 0); 5826cabdff1aSopenharmony_ci out3 = __msa_copy_u_d((v2i64) dst3, 0); 5827cabdff1aSopenharmony_ci SD4(out0, out1, out2, out3, dst, dst_stride); 5828cabdff1aSopenharmony_ci dst += (4 * dst_stride); 5829cabdff1aSopenharmony_ci } 5830cabdff1aSopenharmony_ci} 
5831cabdff1aSopenharmony_ci 5832cabdff1aSopenharmony_cistatic void avg_width16_msa(const uint8_t *src, int32_t src_stride, 5833cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 5834cabdff1aSopenharmony_ci int32_t height) 5835cabdff1aSopenharmony_ci{ 5836cabdff1aSopenharmony_ci int32_t cnt; 5837cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 5838cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 5839cabdff1aSopenharmony_ci 5840cabdff1aSopenharmony_ci for (cnt = (height / 8); cnt--;) { 5841cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 5842cabdff1aSopenharmony_ci src += (8 * src_stride); 5843cabdff1aSopenharmony_ci LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 5844cabdff1aSopenharmony_ci 5845cabdff1aSopenharmony_ci AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, 5846cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 5847cabdff1aSopenharmony_ci AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, 5848cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 5849cabdff1aSopenharmony_ci ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); 5850cabdff1aSopenharmony_ci dst += (8 * dst_stride); 5851cabdff1aSopenharmony_ci } 5852cabdff1aSopenharmony_ci} 5853cabdff1aSopenharmony_ci 5854cabdff1aSopenharmony_civoid ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) 5855cabdff1aSopenharmony_ci{ 5856cabdff1aSopenharmony_ci copy_16x16_msa(src, stride, dest, stride); 5857cabdff1aSopenharmony_ci} 5858cabdff1aSopenharmony_ci 5859cabdff1aSopenharmony_civoid ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) 5860cabdff1aSopenharmony_ci{ 5861cabdff1aSopenharmony_ci copy_8x8_msa(src, stride, dest, stride); 5862cabdff1aSopenharmony_ci} 5863cabdff1aSopenharmony_ci 5864cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest, 5865cabdff1aSopenharmony_ci const 
uint8_t *src, 5866cabdff1aSopenharmony_ci ptrdiff_t stride) 5867cabdff1aSopenharmony_ci{ 5868cabdff1aSopenharmony_ci horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8); 5869cabdff1aSopenharmony_ci} 5870cabdff1aSopenharmony_ci 5871cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest, 5872cabdff1aSopenharmony_ci const uint8_t *src, 5873cabdff1aSopenharmony_ci ptrdiff_t stride) 5874cabdff1aSopenharmony_ci{ 5875cabdff1aSopenharmony_ci horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16); 5876cabdff1aSopenharmony_ci} 5877cabdff1aSopenharmony_ci 5878cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src, 5879cabdff1aSopenharmony_ci ptrdiff_t stride) 5880cabdff1aSopenharmony_ci{ 5881cabdff1aSopenharmony_ci horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8); 5882cabdff1aSopenharmony_ci} 5883cabdff1aSopenharmony_ci 5884cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_16width_msa(uint8_t *dest, 5885cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 5886cabdff1aSopenharmony_ci{ 5887cabdff1aSopenharmony_ci horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16); 5888cabdff1aSopenharmony_ci} 5889cabdff1aSopenharmony_ci 5890cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest, 5891cabdff1aSopenharmony_ci const uint8_t *src, 5892cabdff1aSopenharmony_ci ptrdiff_t stride) 5893cabdff1aSopenharmony_ci{ 5894cabdff1aSopenharmony_ci horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8); 5895cabdff1aSopenharmony_ci} 5896cabdff1aSopenharmony_ci 5897cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest, 5898cabdff1aSopenharmony_ci const uint8_t *src, 5899cabdff1aSopenharmony_ci ptrdiff_t stride) 5900cabdff1aSopenharmony_ci{ 5901cabdff1aSopenharmony_ci horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16); 5902cabdff1aSopenharmony_ci} 5903cabdff1aSopenharmony_ci 5904cabdff1aSopenharmony_civoid 
ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest, 5905cabdff1aSopenharmony_ci const uint8_t *src, 5906cabdff1aSopenharmony_ci ptrdiff_t stride) 5907cabdff1aSopenharmony_ci{ 5908cabdff1aSopenharmony_ci horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8); 5909cabdff1aSopenharmony_ci} 5910cabdff1aSopenharmony_ci 5911cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest, 5912cabdff1aSopenharmony_ci const uint8_t *src, 5913cabdff1aSopenharmony_ci ptrdiff_t stride) 5914cabdff1aSopenharmony_ci{ 5915cabdff1aSopenharmony_ci horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16); 5916cabdff1aSopenharmony_ci} 5917cabdff1aSopenharmony_ci 5918cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest, 5919cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 5920cabdff1aSopenharmony_ci{ 5921cabdff1aSopenharmony_ci horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8); 5922cabdff1aSopenharmony_ci} 5923cabdff1aSopenharmony_ci 5924cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest, 5925cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 5926cabdff1aSopenharmony_ci{ 5927cabdff1aSopenharmony_ci horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16); 5928cabdff1aSopenharmony_ci} 5929cabdff1aSopenharmony_ci 5930cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest, 5931cabdff1aSopenharmony_ci const uint8_t *src, 5932cabdff1aSopenharmony_ci ptrdiff_t stride) 5933cabdff1aSopenharmony_ci{ 5934cabdff1aSopenharmony_ci horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8); 5935cabdff1aSopenharmony_ci} 5936cabdff1aSopenharmony_ci 5937cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest, 5938cabdff1aSopenharmony_ci const uint8_t *src, 5939cabdff1aSopenharmony_ci ptrdiff_t stride) 5940cabdff1aSopenharmony_ci{ 5941cabdff1aSopenharmony_ci 
horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16); 5942cabdff1aSopenharmony_ci} 5943cabdff1aSopenharmony_ci 5944cabdff1aSopenharmony_civoid ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) 5945cabdff1aSopenharmony_ci{ 5946cabdff1aSopenharmony_ci avg_width8_msa(src, stride, dest, stride, 8); 5947cabdff1aSopenharmony_ci} 5948cabdff1aSopenharmony_ci 5949cabdff1aSopenharmony_civoid ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride) 5950cabdff1aSopenharmony_ci{ 5951cabdff1aSopenharmony_ci avg_width16_msa(src, stride, dest, stride, 16); 5952cabdff1aSopenharmony_ci} 5953cabdff1aSopenharmony_ci 5954cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest, 5955cabdff1aSopenharmony_ci const uint8_t *src, 5956cabdff1aSopenharmony_ci ptrdiff_t stride) 5957cabdff1aSopenharmony_ci{ 5958cabdff1aSopenharmony_ci horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8); 5959cabdff1aSopenharmony_ci} 5960cabdff1aSopenharmony_ci 5961cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest, 5962cabdff1aSopenharmony_ci const uint8_t *src, 5963cabdff1aSopenharmony_ci ptrdiff_t stride) 5964cabdff1aSopenharmony_ci{ 5965cabdff1aSopenharmony_ci horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16); 5966cabdff1aSopenharmony_ci} 5967cabdff1aSopenharmony_ci 5968cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest, 5969cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 5970cabdff1aSopenharmony_ci{ 5971cabdff1aSopenharmony_ci horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8); 5972cabdff1aSopenharmony_ci} 5973cabdff1aSopenharmony_ci 5974cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest, 5975cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 5976cabdff1aSopenharmony_ci{ 5977cabdff1aSopenharmony_ci horiz_mc_qpel_avg_dst_16width_msa(src, stride, 
dest, stride, 16); 5978cabdff1aSopenharmony_ci} 5979cabdff1aSopenharmony_ci 5980cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest, 5981cabdff1aSopenharmony_ci const uint8_t *src, 5982cabdff1aSopenharmony_ci ptrdiff_t stride) 5983cabdff1aSopenharmony_ci{ 5984cabdff1aSopenharmony_ci horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8); 5985cabdff1aSopenharmony_ci} 5986cabdff1aSopenharmony_ci 5987cabdff1aSopenharmony_civoid ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest, 5988cabdff1aSopenharmony_ci const uint8_t *src, 5989cabdff1aSopenharmony_ci ptrdiff_t stride) 5990cabdff1aSopenharmony_ci{ 5991cabdff1aSopenharmony_ci horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16); 5992cabdff1aSopenharmony_ci} 5993cabdff1aSopenharmony_ci 5994cabdff1aSopenharmony_ci 5995cabdff1aSopenharmony_civoid ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest, 5996cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 5997cabdff1aSopenharmony_ci{ 5998cabdff1aSopenharmony_ci vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride); 5999cabdff1aSopenharmony_ci} 6000cabdff1aSopenharmony_ci 6001cabdff1aSopenharmony_civoid ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest, 6002cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6003cabdff1aSopenharmony_ci{ 6004cabdff1aSopenharmony_ci vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride); 6005cabdff1aSopenharmony_ci} 6006cabdff1aSopenharmony_ci 6007cabdff1aSopenharmony_civoid ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src, 6008cabdff1aSopenharmony_ci ptrdiff_t stride) 6009cabdff1aSopenharmony_ci{ 6010cabdff1aSopenharmony_ci vert_mc_qpel_8x8_msa(src, stride, dest, stride); 6011cabdff1aSopenharmony_ci} 6012cabdff1aSopenharmony_ci 6013cabdff1aSopenharmony_civoid ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src, 6014cabdff1aSopenharmony_ci ptrdiff_t stride) 6015cabdff1aSopenharmony_ci{ 6016cabdff1aSopenharmony_ci 
vert_mc_qpel_16x16_msa(src, stride, dest, stride); 6017cabdff1aSopenharmony_ci} 6018cabdff1aSopenharmony_ci 6019cabdff1aSopenharmony_civoid ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest, 6020cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6021cabdff1aSopenharmony_ci{ 6022cabdff1aSopenharmony_ci vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride); 6023cabdff1aSopenharmony_ci} 6024cabdff1aSopenharmony_ci 6025cabdff1aSopenharmony_civoid ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest, 6026cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6027cabdff1aSopenharmony_ci{ 6028cabdff1aSopenharmony_ci vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride); 6029cabdff1aSopenharmony_ci} 6030cabdff1aSopenharmony_ci 6031cabdff1aSopenharmony_civoid ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest, 6032cabdff1aSopenharmony_ci const uint8_t *src, 6033cabdff1aSopenharmony_ci ptrdiff_t stride) 6034cabdff1aSopenharmony_ci{ 6035cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride); 6036cabdff1aSopenharmony_ci} 6037cabdff1aSopenharmony_ci 6038cabdff1aSopenharmony_civoid ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest, 6039cabdff1aSopenharmony_ci const uint8_t *src, 6040cabdff1aSopenharmony_ci ptrdiff_t stride) 6041cabdff1aSopenharmony_ci{ 6042cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride); 6043cabdff1aSopenharmony_ci} 6044cabdff1aSopenharmony_ci 6045cabdff1aSopenharmony_civoid ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest, 6046cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6047cabdff1aSopenharmony_ci{ 6048cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride); 6049cabdff1aSopenharmony_ci} 6050cabdff1aSopenharmony_ci 6051cabdff1aSopenharmony_civoid ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest, 6052cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6053cabdff1aSopenharmony_ci{ 6054cabdff1aSopenharmony_ci 
vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride); 6055cabdff1aSopenharmony_ci} 6056cabdff1aSopenharmony_ci 6057cabdff1aSopenharmony_civoid ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest, 6058cabdff1aSopenharmony_ci const uint8_t *src, 6059cabdff1aSopenharmony_ci ptrdiff_t stride) 6060cabdff1aSopenharmony_ci{ 6061cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride); 6062cabdff1aSopenharmony_ci} 6063cabdff1aSopenharmony_ci 6064cabdff1aSopenharmony_civoid ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest, 6065cabdff1aSopenharmony_ci const uint8_t *src, 6066cabdff1aSopenharmony_ci ptrdiff_t stride) 6067cabdff1aSopenharmony_ci{ 6068cabdff1aSopenharmony_ci vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride); 6069cabdff1aSopenharmony_ci} 6070cabdff1aSopenharmony_ci 6071cabdff1aSopenharmony_civoid ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest, 6072cabdff1aSopenharmony_ci const uint8_t *src, 6073cabdff1aSopenharmony_ci ptrdiff_t stride) 6074cabdff1aSopenharmony_ci{ 6075cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride); 6076cabdff1aSopenharmony_ci} 6077cabdff1aSopenharmony_ci 6078cabdff1aSopenharmony_civoid ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest, 6079cabdff1aSopenharmony_ci const uint8_t *src, 6080cabdff1aSopenharmony_ci ptrdiff_t stride) 6081cabdff1aSopenharmony_ci{ 6082cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride); 6083cabdff1aSopenharmony_ci} 6084cabdff1aSopenharmony_ci 6085cabdff1aSopenharmony_civoid ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest, 6086cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6087cabdff1aSopenharmony_ci{ 6088cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride); 6089cabdff1aSopenharmony_ci} 6090cabdff1aSopenharmony_ci 6091cabdff1aSopenharmony_civoid ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest, 6092cabdff1aSopenharmony_ci 
const uint8_t *src, ptrdiff_t stride) 6093cabdff1aSopenharmony_ci{ 6094cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride); 6095cabdff1aSopenharmony_ci} 6096cabdff1aSopenharmony_ci 6097cabdff1aSopenharmony_civoid ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest, 6098cabdff1aSopenharmony_ci const uint8_t *src, 6099cabdff1aSopenharmony_ci ptrdiff_t stride) 6100cabdff1aSopenharmony_ci{ 6101cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride); 6102cabdff1aSopenharmony_ci} 6103cabdff1aSopenharmony_ci 6104cabdff1aSopenharmony_civoid ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest, 6105cabdff1aSopenharmony_ci const uint8_t *src, 6106cabdff1aSopenharmony_ci ptrdiff_t stride) 6107cabdff1aSopenharmony_ci{ 6108cabdff1aSopenharmony_ci vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride); 6109cabdff1aSopenharmony_ci} 6110cabdff1aSopenharmony_ci 6111cabdff1aSopenharmony_ci/* HV cases */ 6112cabdff1aSopenharmony_civoid ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest, 6113cabdff1aSopenharmony_ci const uint8_t *src, 6114cabdff1aSopenharmony_ci ptrdiff_t stride) 6115cabdff1aSopenharmony_ci{ 6116cabdff1aSopenharmony_ci hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride); 6117cabdff1aSopenharmony_ci} 6118cabdff1aSopenharmony_ci 6119cabdff1aSopenharmony_civoid ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest, 6120cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6121cabdff1aSopenharmony_ci{ 6122cabdff1aSopenharmony_ci hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride); 6123cabdff1aSopenharmony_ci} 6124cabdff1aSopenharmony_ci 6125cabdff1aSopenharmony_civoid ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest, 6126cabdff1aSopenharmony_ci const uint8_t *src, ptrdiff_t stride) 6127cabdff1aSopenharmony_ci{ 6128cabdff1aSopenharmony_ci hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride); 6129cabdff1aSopenharmony_ci} 6130cabdff1aSopenharmony_ci 
/*
 * Exported ff_hv_mc_qpel_* entry points.
 *
 * QPEL_MSA_ENTRY(impl) expands to the definition of
 *     void ff_<impl>(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
 * whose body is the single call impl(src, stride, dest, stride).
 * The one caller-supplied stride is passed twice; presumably it serves as
 * both the source and the destination stride of the implementation
 * routine (its parameter list is not visible here - confirm if changed).
 */
#define QPEL_MSA_ENTRY(impl)                                            \
void ff_ ## impl(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)   \
{                                                                       \
    impl(src, stride, dest, stride);                                    \
}

QPEL_MSA_ENTRY(hv_mc_qpel_aver_v_src0_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_aver_hv_src10_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_aver_hv_src10_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_aver_h_src0_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_aver_h_src0_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_aver_h_src1_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_aver_h_src1_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_aver_hv_src01_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_aver_hv_src01_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_aver_v_src1_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_aver_v_src1_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_aver_hv_src11_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_aver_hv_src11_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa)

#undef QPEL_MSA_ENTRY


/*
 * Exported ff_hv_mc_qpel_avg_dst_* entry points.
 *
 * QPEL_MSA_ENTRY(impl) expands to the definition of
 *     void ff_<impl>(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
 * whose body is the single call impl(src, stride, dest, stride).
 * The one caller-supplied stride is passed twice; presumably it serves as
 * both the source and the destination stride of the implementation
 * routine (its parameter list is not visible here - confirm if changed).
 */
#define QPEL_MSA_ENTRY(impl)                                            \
void ff_ ## impl(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)   \
{                                                                       \
    impl(src, stride, dest, stride);                                    \
}

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa)

#undef QPEL_MSA_ENTRY

/*
 * Exported ff_hv_mc_qpel_avg_dst_* (remaining) and ff_hv_mc_qpel_no_rnd_*
 * entry points.
 *
 * QPEL_MSA_ENTRY(impl) expands to the definition of
 *     void ff_<impl>(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
 * whose body is the single call impl(src, stride, dest, stride).
 * The one caller-supplied stride is passed twice; presumably it serves as
 * both the source and the destination stride of the implementation
 * routine (its parameter list is not visible here - confirm if changed).
 */
#define QPEL_MSA_ENTRY(impl)                                            \
void ff_ ## impl(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)   \
{                                                                       \
    impl(src, stride, dest, stride);                                    \
}

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa)

QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa)
QPEL_MSA_ENTRY(hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa)

#undef QPEL_MSA_ENTRY