1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "h264chroma_mips.h" 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_cistatic const uint8_t chroma_mask_arr[16 * 5] = { 25cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 26cabdff1aSopenharmony_ci 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24, 27cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28cabdff1aSopenharmony_ci 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8, 29cabdff1aSopenharmony_ci 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20 30cabdff1aSopenharmony_ci}; 
31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_cistatic void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride, 33cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 34cabdff1aSopenharmony_ci{ 35cabdff1aSopenharmony_ci uint16_t out0, out1; 36cabdff1aSopenharmony_ci v16i8 src0, src1; 37cabdff1aSopenharmony_ci v8u16 res_r; 38cabdff1aSopenharmony_ci v8i16 res; 39cabdff1aSopenharmony_ci v16i8 mask; 40cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 41cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 42cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 43cabdff1aSopenharmony_ci 44cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 45cabdff1aSopenharmony_ci 46cabdff1aSopenharmony_ci LD_SB2(src, stride, src0, src1); 47cabdff1aSopenharmony_ci 48cabdff1aSopenharmony_ci src0 = __msa_vshf_b(mask, src1, src0); 49cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); 50cabdff1aSopenharmony_ci res_r <<= 3; 51cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 52cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 53cabdff1aSopenharmony_ci res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 54cabdff1aSopenharmony_ci 55cabdff1aSopenharmony_ci out0 = __msa_copy_u_h(res, 0); 56cabdff1aSopenharmony_ci out1 = __msa_copy_u_h(res, 2); 57cabdff1aSopenharmony_ci 58cabdff1aSopenharmony_ci SH(out0, dst); 59cabdff1aSopenharmony_ci dst += stride; 60cabdff1aSopenharmony_ci SH(out1, dst); 61cabdff1aSopenharmony_ci} 62cabdff1aSopenharmony_ci 63cabdff1aSopenharmony_cistatic void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 64cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 65cabdff1aSopenharmony_ci{ 66cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 67cabdff1aSopenharmony_ci v8u16 res_r; 68cabdff1aSopenharmony_ci v8i16 res; 69cabdff1aSopenharmony_ci v16i8 mask; 70cabdff1aSopenharmony_ci v16i8 coeff_vec0 = 
__msa_fill_b(coeff0); 71cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 72cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 73cabdff1aSopenharmony_ci 74cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[64]); 75cabdff1aSopenharmony_ci 76cabdff1aSopenharmony_ci LD_UB4(src, stride, src0, src1, src2, src3); 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); 79cabdff1aSopenharmony_ci 80cabdff1aSopenharmony_ci src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0); 81cabdff1aSopenharmony_ci 82cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h(src0, coeff_vec); 83cabdff1aSopenharmony_ci res_r <<= 3; 84cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 85cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 86cabdff1aSopenharmony_ci res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 87cabdff1aSopenharmony_ci 88cabdff1aSopenharmony_ci ST_H4(res, 0, 1, 2, 3, dst, stride); 89cabdff1aSopenharmony_ci} 90cabdff1aSopenharmony_ci 91cabdff1aSopenharmony_cistatic void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 92cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 93cabdff1aSopenharmony_ci int32_t height) 94cabdff1aSopenharmony_ci{ 95cabdff1aSopenharmony_ci if (2 == height) { 96cabdff1aSopenharmony_ci avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1); 97cabdff1aSopenharmony_ci } else if (4 == height) { 98cabdff1aSopenharmony_ci avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1); 99cabdff1aSopenharmony_ci } 100cabdff1aSopenharmony_ci} 101cabdff1aSopenharmony_ci 102cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride, 103cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 104cabdff1aSopenharmony_ci{ 105cabdff1aSopenharmony_ci v16i8 src0, src1; 106cabdff1aSopenharmony_ci v8u16 res_r; 107cabdff1aSopenharmony_ci v4i32 res; 108cabdff1aSopenharmony_ci 
v16i8 mask; 109cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 110cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 111cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 112cabdff1aSopenharmony_ci 113cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 114cabdff1aSopenharmony_ci 115cabdff1aSopenharmony_ci LD_SB2(src, stride, src0, src1); 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ci src0 = __msa_vshf_b(mask, src1, src0); 118cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); 119cabdff1aSopenharmony_ci res_r <<= 3; 120cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 121cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 122cabdff1aSopenharmony_ci res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 123cabdff1aSopenharmony_ci 124cabdff1aSopenharmony_ci ST_W2(res, 0, 1, dst, stride); 125cabdff1aSopenharmony_ci} 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 128cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 129cabdff1aSopenharmony_ci{ 130cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, out; 131cabdff1aSopenharmony_ci v8u16 res0_r, res1_r; 132cabdff1aSopenharmony_ci v16i8 mask; 133cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 134cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 135cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 136cabdff1aSopenharmony_ci 137cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 138cabdff1aSopenharmony_ci 139cabdff1aSopenharmony_ci LD_UB4(src, stride, src0, src1, src2, src3); 140cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); 141cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r); 142cabdff1aSopenharmony_ci res0_r <<= 3; 143cabdff1aSopenharmony_ci res1_r <<= 
3; 144cabdff1aSopenharmony_ci SRARI_H2_UH(res0_r, res1_r, 6); 145cabdff1aSopenharmony_ci SAT_UH2_UH(res0_r, res1_r, 7); 146cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); 147cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 148cabdff1aSopenharmony_ci} 149cabdff1aSopenharmony_ci 150cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 151cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 152cabdff1aSopenharmony_ci{ 153cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1; 154cabdff1aSopenharmony_ci v16i8 mask; 155cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 156cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 157cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 158cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 159cabdff1aSopenharmony_ci 160cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 161cabdff1aSopenharmony_ci 162cabdff1aSopenharmony_ci LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 163cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); 164cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6); 165cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1); 166cabdff1aSopenharmony_ci DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3); 167cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 168cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 169cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 170cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 171cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 172cabdff1aSopenharmony_ci} 173cabdff1aSopenharmony_ci 174cabdff1aSopenharmony_cistatic void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t 
stride, 175cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 176cabdff1aSopenharmony_ci int32_t height) 177cabdff1aSopenharmony_ci{ 178cabdff1aSopenharmony_ci if (2 == height) { 179cabdff1aSopenharmony_ci avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1); 180cabdff1aSopenharmony_ci } else if (4 == height) { 181cabdff1aSopenharmony_ci avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1); 182cabdff1aSopenharmony_ci } else if (8 == height) { 183cabdff1aSopenharmony_ci avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1); 184cabdff1aSopenharmony_ci } 185cabdff1aSopenharmony_ci} 186cabdff1aSopenharmony_ci 187cabdff1aSopenharmony_cistatic void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 188cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 189cabdff1aSopenharmony_ci{ 190cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, out0, out1; 191cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 192cabdff1aSopenharmony_ci v16i8 mask; 193cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 194cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 195cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 196cabdff1aSopenharmony_ci 197cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 198cabdff1aSopenharmony_ci LD_UB4(src, stride, src0, src1, src2, src3); 199cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1); 200cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3); 201cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 202cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 203cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 204cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 205cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 206cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 
207cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 208cabdff1aSopenharmony_ci} 209cabdff1aSopenharmony_ci 210cabdff1aSopenharmony_cistatic void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 211cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 212cabdff1aSopenharmony_ci{ 213cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 214cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 215cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3, res4, res5, res6, res7; 216cabdff1aSopenharmony_ci v16i8 mask; 217cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 218cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 219cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 220cabdff1aSopenharmony_ci 221cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 222cabdff1aSopenharmony_ci 223cabdff1aSopenharmony_ci LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 224cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1); 225cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3); 226cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5); 227cabdff1aSopenharmony_ci VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7); 228cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 229cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 230cabdff1aSopenharmony_ci DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec, 231cabdff1aSopenharmony_ci coeff_vec, res4, res5, res6, res7); 232cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 233cabdff1aSopenharmony_ci SLLI_4V(res4, res5, res6, res7, 3); 234cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 235cabdff1aSopenharmony_ci SRARI_H4_UH(res4, res5, res6, res7, 6); 236cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, 
res3, 7); 237cabdff1aSopenharmony_ci SAT_UH4_UH(res4, res5, res6, res7, 7); 238cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 239cabdff1aSopenharmony_ci PCKEV_B2_UB(res5, res4, res7, res6, out2, out3); 240cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 241cabdff1aSopenharmony_ci} 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_cistatic void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst, 244cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 245cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 246cabdff1aSopenharmony_ci{ 247cabdff1aSopenharmony_ci uint32_t row; 248cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, out0, out1; 249cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 250cabdff1aSopenharmony_ci v16i8 mask; 251cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 252cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 253cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 256cabdff1aSopenharmony_ci 257cabdff1aSopenharmony_ci for (row = height >> 2; row--;) { 258cabdff1aSopenharmony_ci LD_UB4(src, stride, src0, src1, src2, src3); 259cabdff1aSopenharmony_ci src += (4 * stride); 260cabdff1aSopenharmony_ci 261cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1); 262cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3); 263cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 264cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 265cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 266cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 267cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 268cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 
269cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 270cabdff1aSopenharmony_ci dst += (4 * stride); 271cabdff1aSopenharmony_ci } 272cabdff1aSopenharmony_ci 273cabdff1aSopenharmony_ci if (0 != (height % 4)) { 274cabdff1aSopenharmony_ci for (row = (height % 4); row--;) { 275cabdff1aSopenharmony_ci src0 = LD_UB(src); 276cabdff1aSopenharmony_ci src += stride; 277cabdff1aSopenharmony_ci 278cabdff1aSopenharmony_ci src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); 279cabdff1aSopenharmony_ci 280cabdff1aSopenharmony_ci res0 = __msa_dotp_u_h(src0, coeff_vec); 281cabdff1aSopenharmony_ci res0 <<= 3; 282cabdff1aSopenharmony_ci res0 = (v8u16) __msa_srari_h((v8i16) res0, 6); 283cabdff1aSopenharmony_ci res0 = __msa_sat_u_h(res0, 7); 284cabdff1aSopenharmony_ci res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0); 285cabdff1aSopenharmony_ci 286cabdff1aSopenharmony_ci ST_D1(res0, 0, dst); 287cabdff1aSopenharmony_ci dst += stride; 288cabdff1aSopenharmony_ci } 289cabdff1aSopenharmony_ci } 290cabdff1aSopenharmony_ci} 291cabdff1aSopenharmony_ci 292cabdff1aSopenharmony_cistatic void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 293cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 294cabdff1aSopenharmony_ci int32_t height) 295cabdff1aSopenharmony_ci{ 296cabdff1aSopenharmony_ci if (4 == height) { 297cabdff1aSopenharmony_ci avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1); 298cabdff1aSopenharmony_ci } else if (8 == height) { 299cabdff1aSopenharmony_ci avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1); 300cabdff1aSopenharmony_ci } else { 301cabdff1aSopenharmony_ci avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height); 302cabdff1aSopenharmony_ci } 303cabdff1aSopenharmony_ci} 304cabdff1aSopenharmony_ci 305cabdff1aSopenharmony_cistatic void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride, 306cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 307cabdff1aSopenharmony_ci{ 
308cabdff1aSopenharmony_ci uint16_t out0, out1; 309cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 310cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 311cabdff1aSopenharmony_ci v8i16 res; 312cabdff1aSopenharmony_ci v8u16 res_r; 313cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 314cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 315cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 316cabdff1aSopenharmony_ci 317cabdff1aSopenharmony_ci LD_SB3(src, stride, src0, src1, src2); 318cabdff1aSopenharmony_ci 319cabdff1aSopenharmony_ci ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1); 320cabdff1aSopenharmony_ci 321cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); 322cabdff1aSopenharmony_ci 323cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h(tmp0, coeff_vec); 324cabdff1aSopenharmony_ci res_r <<= 3; 325cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 326cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 327cabdff1aSopenharmony_ci res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 328cabdff1aSopenharmony_ci 329cabdff1aSopenharmony_ci out0 = __msa_copy_u_h(res, 0); 330cabdff1aSopenharmony_ci out1 = __msa_copy_u_h(res, 2); 331cabdff1aSopenharmony_ci 332cabdff1aSopenharmony_ci SH(out0, dst); 333cabdff1aSopenharmony_ci dst += stride; 334cabdff1aSopenharmony_ci SH(out1, dst); 335cabdff1aSopenharmony_ci} 336cabdff1aSopenharmony_ci 337cabdff1aSopenharmony_cistatic void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 338cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 339cabdff1aSopenharmony_ci{ 340cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 341cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 342cabdff1aSopenharmony_ci v8i16 res; 343cabdff1aSopenharmony_ci v8u16 res_r; 344cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 345cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 
346cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 347cabdff1aSopenharmony_ci 348cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 349cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, 350cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 351cabdff1aSopenharmony_ci ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 352cabdff1aSopenharmony_ci 353cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); 354cabdff1aSopenharmony_ci 355cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h(tmp0, coeff_vec); 356cabdff1aSopenharmony_ci res_r <<= 3; 357cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 358cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 359cabdff1aSopenharmony_ci 360cabdff1aSopenharmony_ci res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 361cabdff1aSopenharmony_ci 362cabdff1aSopenharmony_ci ST_H4(res, 0, 1, 2, 3, dst, stride); 363cabdff1aSopenharmony_ci} 364cabdff1aSopenharmony_ci 365cabdff1aSopenharmony_cistatic void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 366cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 367cabdff1aSopenharmony_ci int32_t height) 368cabdff1aSopenharmony_ci{ 369cabdff1aSopenharmony_ci if (2 == height) { 370cabdff1aSopenharmony_ci avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1); 371cabdff1aSopenharmony_ci } else if (4 == height) { 372cabdff1aSopenharmony_ci avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1); 373cabdff1aSopenharmony_ci } 374cabdff1aSopenharmony_ci} 375cabdff1aSopenharmony_ci 376cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride, 377cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 378cabdff1aSopenharmony_ci{ 379cabdff1aSopenharmony_ci v16u8 src0, src1, src2; 380cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 381cabdff1aSopenharmony_ci v4i32 res; 382cabdff1aSopenharmony_ci v8u16 
res_r; 383cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 384cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 385cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 386cabdff1aSopenharmony_ci 387cabdff1aSopenharmony_ci LD_UB3(src, stride, src0, src1, src2); 388cabdff1aSopenharmony_ci ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1); 389cabdff1aSopenharmony_ci 390cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); 391cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h(tmp0, coeff_vec); 392cabdff1aSopenharmony_ci res_r <<= 3; 393cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 394cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 395cabdff1aSopenharmony_ci res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 396cabdff1aSopenharmony_ci 397cabdff1aSopenharmony_ci ST_W2(res, 0, 1, dst, stride); 398cabdff1aSopenharmony_ci} 399cabdff1aSopenharmony_ci 400cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 401cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 402cabdff1aSopenharmony_ci{ 403cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 404cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 405cabdff1aSopenharmony_ci v16u8 out; 406cabdff1aSopenharmony_ci v8u16 res0_r, res1_r; 407cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 408cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 409cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 410cabdff1aSopenharmony_ci 411cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 412cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2, 413cabdff1aSopenharmony_ci tmp3); 414cabdff1aSopenharmony_ci ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 415cabdff1aSopenharmony_ci DOTP_UB2_UH(tmp0, tmp2, coeff_vec, 
coeff_vec, res0_r, res1_r); 416cabdff1aSopenharmony_ci res0_r <<= 3; 417cabdff1aSopenharmony_ci res1_r <<= 3; 418cabdff1aSopenharmony_ci SRARI_H2_UH(res0_r, res1_r, 6); 419cabdff1aSopenharmony_ci SAT_UH2_UH(res0_r, res1_r, 7); 420cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); 421cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 422cabdff1aSopenharmony_ci} 423cabdff1aSopenharmony_ci 424cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 425cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 426cabdff1aSopenharmony_ci{ 427cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 428cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1; 429cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 430cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 431cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 432cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 433cabdff1aSopenharmony_ci 434cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 435cabdff1aSopenharmony_ci src += (5 * stride); 436cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 437cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2, 438cabdff1aSopenharmony_ci tmp3); 439cabdff1aSopenharmony_ci ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6, 440cabdff1aSopenharmony_ci tmp7); 441cabdff1aSopenharmony_ci ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 442cabdff1aSopenharmony_ci ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6); 443cabdff1aSopenharmony_ci DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1); 444cabdff1aSopenharmony_ci DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3); 445cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 
446cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 447cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 448cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 449cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 450cabdff1aSopenharmony_ci} 451cabdff1aSopenharmony_ci 452cabdff1aSopenharmony_cistatic void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 453cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 454cabdff1aSopenharmony_ci int32_t height) 455cabdff1aSopenharmony_ci{ 456cabdff1aSopenharmony_ci if (2 == height) { 457cabdff1aSopenharmony_ci avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1); 458cabdff1aSopenharmony_ci } else if (4 == height) { 459cabdff1aSopenharmony_ci avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1); 460cabdff1aSopenharmony_ci } else if (8 == height) { 461cabdff1aSopenharmony_ci avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1); 462cabdff1aSopenharmony_ci } 463cabdff1aSopenharmony_ci} 464cabdff1aSopenharmony_ci 465cabdff1aSopenharmony_cistatic void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 466cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 467cabdff1aSopenharmony_ci{ 468cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, out0, out1; 469cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 470cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 471cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 472cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 473cabdff1aSopenharmony_ci 474cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 475cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2, 476cabdff1aSopenharmony_ci src3); 477cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 478cabdff1aSopenharmony_ci coeff_vec, 
res0, res1, res2, res3); 479cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 480cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 481cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 482cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 483cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 484cabdff1aSopenharmony_ci} 485cabdff1aSopenharmony_ci 486cabdff1aSopenharmony_cistatic void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 487cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1) 488cabdff1aSopenharmony_ci{ 489cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 490cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 491cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3, res4, res5, res6, res7; 492cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 493cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 494cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 495cabdff1aSopenharmony_ci 496cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 497cabdff1aSopenharmony_ci src += (5 * stride); 498cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 499cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2, 500cabdff1aSopenharmony_ci src3); 501cabdff1aSopenharmony_ci ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6, 502cabdff1aSopenharmony_ci src7); 503cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 504cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 505cabdff1aSopenharmony_ci DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec, 506cabdff1aSopenharmony_ci coeff_vec, res4, res5, res6, res7); 507cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 508cabdff1aSopenharmony_ci SLLI_4V(res4, res5, 
res6, res7, 3); 509cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 510cabdff1aSopenharmony_ci SRARI_H4_UH(res4, res5, res6, res7, 6); 511cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 512cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 513cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 514cabdff1aSopenharmony_ci PCKEV_B2_UB(res5, res4, res7, res6, out2, out3); 515cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 516cabdff1aSopenharmony_ci} 517cabdff1aSopenharmony_ci 518cabdff1aSopenharmony_cistatic void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 519cabdff1aSopenharmony_ci uint32_t coeff0, uint32_t coeff1, 520cabdff1aSopenharmony_ci int32_t height) 521cabdff1aSopenharmony_ci{ 522cabdff1aSopenharmony_ci if (4 == height) { 523cabdff1aSopenharmony_ci avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1); 524cabdff1aSopenharmony_ci } else if (8 == height) { 525cabdff1aSopenharmony_ci avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1); 526cabdff1aSopenharmony_ci } 527cabdff1aSopenharmony_ci} 528cabdff1aSopenharmony_ci 529cabdff1aSopenharmony_cistatic void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride, 530cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 531cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 532cabdff1aSopenharmony_ci{ 533cabdff1aSopenharmony_ci uint16_t out0, out1; 534cabdff1aSopenharmony_ci v16u8 src0, src1, src2; 535cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_vt0, res_vt1; 536cabdff1aSopenharmony_ci v8i16 res_vert; 537cabdff1aSopenharmony_ci v16i8 mask; 538cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 539cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 540cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 541cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) 
__msa_fill_h(coef_ver0); 542cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 543cabdff1aSopenharmony_ci 544cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[48]); 545cabdff1aSopenharmony_ci 546cabdff1aSopenharmony_ci LD_UB3(src, stride, src0, src1, src2); 547cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); 548cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); 549cabdff1aSopenharmony_ci MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 550cabdff1aSopenharmony_ci 551cabdff1aSopenharmony_ci res_vt0 += res_vt1; 552cabdff1aSopenharmony_ci res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); 553cabdff1aSopenharmony_ci res_vt0 = __msa_sat_u_h(res_vt0, 7); 554cabdff1aSopenharmony_ci res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); 555cabdff1aSopenharmony_ci 556cabdff1aSopenharmony_ci out0 = __msa_copy_u_h(res_vert, 0); 557cabdff1aSopenharmony_ci out1 = __msa_copy_u_h(res_vert, 1); 558cabdff1aSopenharmony_ci 559cabdff1aSopenharmony_ci SH(out0, dst); 560cabdff1aSopenharmony_ci dst += stride; 561cabdff1aSopenharmony_ci SH(out1, dst); 562cabdff1aSopenharmony_ci} 563cabdff1aSopenharmony_ci 564cabdff1aSopenharmony_cistatic void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 565cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 566cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 567cabdff1aSopenharmony_ci{ 568cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 569cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 570cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_vt0, res_vt1; 571cabdff1aSopenharmony_ci v8i16 res; 572cabdff1aSopenharmony_ci v16i8 mask; 573cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 574cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 575cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) 
__msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 576cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 577cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 578cabdff1aSopenharmony_ci 579cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[48]); 580cabdff1aSopenharmony_ci 581cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 582cabdff1aSopenharmony_ci 583cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1); 584cabdff1aSopenharmony_ci VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3); 585cabdff1aSopenharmony_ci ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 586cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); 587cabdff1aSopenharmony_ci MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 588cabdff1aSopenharmony_ci 589cabdff1aSopenharmony_ci res_vt0 += res_vt1; 590cabdff1aSopenharmony_ci res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); 591cabdff1aSopenharmony_ci res_vt0 = __msa_sat_u_h(res_vt0, 7); 592cabdff1aSopenharmony_ci 593cabdff1aSopenharmony_ci res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); 594cabdff1aSopenharmony_ci 595cabdff1aSopenharmony_ci ST_H4(res, 0, 1, 2, 3, dst, stride); 596cabdff1aSopenharmony_ci} 597cabdff1aSopenharmony_ci 598cabdff1aSopenharmony_cistatic void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 599cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 600cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1, 601cabdff1aSopenharmony_ci int32_t height) 602cabdff1aSopenharmony_ci{ 603cabdff1aSopenharmony_ci if (2 == height) { 604cabdff1aSopenharmony_ci avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 605cabdff1aSopenharmony_ci coef_ver1); 606cabdff1aSopenharmony_ci } else if (4 == height) { 607cabdff1aSopenharmony_ci avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, 
coef_ver0, 608cabdff1aSopenharmony_ci coef_ver1); 609cabdff1aSopenharmony_ci } 610cabdff1aSopenharmony_ci} 611cabdff1aSopenharmony_ci 612cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride, 613cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 614cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 615cabdff1aSopenharmony_ci{ 616cabdff1aSopenharmony_ci v16u8 src0, src1, src2; 617cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_vt0, res_vt1; 618cabdff1aSopenharmony_ci v16i8 mask; 619cabdff1aSopenharmony_ci v4i32 res; 620cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 621cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 622cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 623cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 624cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 625cabdff1aSopenharmony_ci 626cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 627cabdff1aSopenharmony_ci LD_UB3(src, stride, src0, src1, src2); 628cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); 629cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); 630cabdff1aSopenharmony_ci MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 631cabdff1aSopenharmony_ci 632cabdff1aSopenharmony_ci res_vt0 += res_vt1; 633cabdff1aSopenharmony_ci res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); 634cabdff1aSopenharmony_ci res_vt0 = __msa_sat_u_h(res_vt0, 7); 635cabdff1aSopenharmony_ci res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); 636cabdff1aSopenharmony_ci 637cabdff1aSopenharmony_ci ST_W2(res, 0, 1, dst, stride); 638cabdff1aSopenharmony_ci} 639cabdff1aSopenharmony_ci 640cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 
641cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 642cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 643cabdff1aSopenharmony_ci{ 644cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 645cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2, res_hz3; 646cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3; 647cabdff1aSopenharmony_ci v16i8 mask; 648cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 649cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 650cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 651cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 652cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 653cabdff1aSopenharmony_ci v4i32 res0, res1; 654cabdff1aSopenharmony_ci 655cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 656cabdff1aSopenharmony_ci 657cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 658cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); 659cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3); 660cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, 661cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, 662cabdff1aSopenharmony_ci res_hz3); 663cabdff1aSopenharmony_ci MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, 664cabdff1aSopenharmony_ci res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 665cabdff1aSopenharmony_ci ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1); 666cabdff1aSopenharmony_ci SRARI_H2_UH(res_vt0, res_vt1, 6); 667cabdff1aSopenharmony_ci SAT_UH2_UH(res_vt0, res_vt1, 7); 668cabdff1aSopenharmony_ci PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1); 669cabdff1aSopenharmony_ci ST_W2(res0, 0, 1, dst, stride); 
670cabdff1aSopenharmony_ci ST_W2(res1, 0, 1, dst + 2 * stride, stride); 671cabdff1aSopenharmony_ci} 672cabdff1aSopenharmony_ci 673cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 674cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 675cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 676cabdff1aSopenharmony_ci{ 677cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1; 678cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7; 679cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7; 680cabdff1aSopenharmony_ci v16i8 mask; 681cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 682cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 683cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 684cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 685cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 686cabdff1aSopenharmony_ci 687cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 688cabdff1aSopenharmony_ci 689cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 690cabdff1aSopenharmony_ci src += (5 * stride); 691cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 692cabdff1aSopenharmony_ci 693cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); 694cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3); 695cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5); 696cabdff1aSopenharmony_ci VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7); 697cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, 698cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz0, 
res_hz1, res_hz2, res_hz3); 699cabdff1aSopenharmony_ci DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec, 700cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7); 701cabdff1aSopenharmony_ci MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, 702cabdff1aSopenharmony_ci res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 703cabdff1aSopenharmony_ci MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1, 704cabdff1aSopenharmony_ci res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7); 705cabdff1aSopenharmony_ci ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1); 706cabdff1aSopenharmony_ci ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3); 707cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); 708cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); 709cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1); 710cabdff1aSopenharmony_ci ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 711cabdff1aSopenharmony_ci} 712cabdff1aSopenharmony_ci 713cabdff1aSopenharmony_cistatic void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 714cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 715cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1, 716cabdff1aSopenharmony_ci int32_t height) 717cabdff1aSopenharmony_ci{ 718cabdff1aSopenharmony_ci if (2 == height) { 719cabdff1aSopenharmony_ci avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 720cabdff1aSopenharmony_ci coef_ver1); 721cabdff1aSopenharmony_ci } else if (4 == height) { 722cabdff1aSopenharmony_ci avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 723cabdff1aSopenharmony_ci coef_ver1); 724cabdff1aSopenharmony_ci } else if (8 == height) { 725cabdff1aSopenharmony_ci avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 
726cabdff1aSopenharmony_ci coef_ver1); 727cabdff1aSopenharmony_ci } 728cabdff1aSopenharmony_ci} 729cabdff1aSopenharmony_ci 730cabdff1aSopenharmony_cistatic void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 731cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 732cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 733cabdff1aSopenharmony_ci{ 734cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, out0, out1; 735cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; 736cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3; 737cabdff1aSopenharmony_ci v16i8 mask; 738cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 739cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 740cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 741cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 742cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 743cabdff1aSopenharmony_ci 744cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 745cabdff1aSopenharmony_ci 746cabdff1aSopenharmony_ci src0 = LD_UB(src); 747cabdff1aSopenharmony_ci src += stride; 748cabdff1aSopenharmony_ci 749cabdff1aSopenharmony_ci src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); 750cabdff1aSopenharmony_ci res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec); 751cabdff1aSopenharmony_ci 752cabdff1aSopenharmony_ci LD_UB4(src, stride, src1, src2, src3, src4); 753cabdff1aSopenharmony_ci src += (4 * stride); 754cabdff1aSopenharmony_ci 755cabdff1aSopenharmony_ci VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2); 756cabdff1aSopenharmony_ci VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4); 757cabdff1aSopenharmony_ci DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec, 758cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4); 
759cabdff1aSopenharmony_ci MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0, 760cabdff1aSopenharmony_ci res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 761cabdff1aSopenharmony_ci 762cabdff1aSopenharmony_ci res_vt0 += (res_hz0 * coeff_vt_vec1); 763cabdff1aSopenharmony_ci res_vt1 += (res_hz1 * coeff_vt_vec1); 764cabdff1aSopenharmony_ci res_vt2 += (res_hz2 * coeff_vt_vec1); 765cabdff1aSopenharmony_ci res_vt3 += (res_hz3 * coeff_vt_vec1); 766cabdff1aSopenharmony_ci 767cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); 768cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); 769cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1); 770cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 771cabdff1aSopenharmony_ci} 772cabdff1aSopenharmony_ci 773cabdff1aSopenharmony_cistatic void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 774cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 775cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1) 776cabdff1aSopenharmony_ci{ 777cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 778cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 779cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; 780cabdff1aSopenharmony_ci v8u16 res_hz5, res_hz6, res_hz7, res_hz8; 781cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3; 782cabdff1aSopenharmony_ci v8u16 res_vt4, res_vt5, res_vt6, res_vt7; 783cabdff1aSopenharmony_ci v16i8 mask; 784cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 785cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 786cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 787cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 788cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) 
__msa_fill_h(coef_ver1); 789cabdff1aSopenharmony_ci 790cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 791cabdff1aSopenharmony_ci 792cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 793cabdff1aSopenharmony_ci src += (5 * stride); 794cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 795cabdff1aSopenharmony_ci src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); 796cabdff1aSopenharmony_ci VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2); 797cabdff1aSopenharmony_ci VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4); 798cabdff1aSopenharmony_ci VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6); 799cabdff1aSopenharmony_ci VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8); 800cabdff1aSopenharmony_ci res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec); 801cabdff1aSopenharmony_ci DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec, 802cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, 803cabdff1aSopenharmony_ci res_hz4); 804cabdff1aSopenharmony_ci DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, 805cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8); 806cabdff1aSopenharmony_ci MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, 807cabdff1aSopenharmony_ci coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, 808cabdff1aSopenharmony_ci res_vt3); 809cabdff1aSopenharmony_ci MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7, 810cabdff1aSopenharmony_ci coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, 811cabdff1aSopenharmony_ci res_vt7); 812cabdff1aSopenharmony_ci res_vt0 += (res_hz0 * coeff_vt_vec1); 813cabdff1aSopenharmony_ci res_vt1 += (res_hz1 * coeff_vt_vec1); 814cabdff1aSopenharmony_ci res_vt2 += (res_hz2 * coeff_vt_vec1); 815cabdff1aSopenharmony_ci res_vt3 += (res_hz3 * coeff_vt_vec1); 816cabdff1aSopenharmony_ci res_vt4 += (res_hz4 * 
coeff_vt_vec1); 817cabdff1aSopenharmony_ci res_vt5 += (res_hz5 * coeff_vt_vec1); 818cabdff1aSopenharmony_ci res_vt6 += (res_hz6 * coeff_vt_vec1); 819cabdff1aSopenharmony_ci res_vt7 += (res_hz7 * coeff_vt_vec1); 820cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); 821cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6); 822cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); 823cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7); 824cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1); 825cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3); 826cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 827cabdff1aSopenharmony_ci} 828cabdff1aSopenharmony_ci 829cabdff1aSopenharmony_cistatic void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride, 830cabdff1aSopenharmony_ci uint32_t coef_hor0, uint32_t coef_hor1, 831cabdff1aSopenharmony_ci uint32_t coef_ver0, uint32_t coef_ver1, 832cabdff1aSopenharmony_ci int32_t height) 833cabdff1aSopenharmony_ci{ 834cabdff1aSopenharmony_ci if (4 == height) { 835cabdff1aSopenharmony_ci avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 836cabdff1aSopenharmony_ci coef_ver1); 837cabdff1aSopenharmony_ci } else if (8 == height) { 838cabdff1aSopenharmony_ci avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0, 839cabdff1aSopenharmony_ci coef_ver1); 840cabdff1aSopenharmony_ci } 841cabdff1aSopenharmony_ci} 842cabdff1aSopenharmony_ci 843cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst, 844cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 845cabdff1aSopenharmony_ci uint32_t coeff1) 846cabdff1aSopenharmony_ci{ 847cabdff1aSopenharmony_ci uint16_t out0, out1; 848cabdff1aSopenharmony_ci v16i8 src0, src1; 849cabdff1aSopenharmony_ci v16u8 dst_data = { 0 }; 
850cabdff1aSopenharmony_ci v8u16 res_r; 851cabdff1aSopenharmony_ci v16u8 res; 852cabdff1aSopenharmony_ci v16i8 mask; 853cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 854cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 855cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 856cabdff1aSopenharmony_ci 857cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 858cabdff1aSopenharmony_ci 859cabdff1aSopenharmony_ci LD_SB2(src, stride, src0, src1); 860cabdff1aSopenharmony_ci 861cabdff1aSopenharmony_ci out0 = LH(dst); 862cabdff1aSopenharmony_ci out1 = LH(dst + stride); 863cabdff1aSopenharmony_ci 864cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0); 865cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1); 866cabdff1aSopenharmony_ci 867cabdff1aSopenharmony_ci src0 = __msa_vshf_b(mask, src1, src0); 868cabdff1aSopenharmony_ci 869cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); 870cabdff1aSopenharmony_ci res_r <<= 3; 871cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 872cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 873cabdff1aSopenharmony_ci 874cabdff1aSopenharmony_ci res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 875cabdff1aSopenharmony_ci dst_data = __msa_aver_u_b(res, dst_data); 876cabdff1aSopenharmony_ci 877cabdff1aSopenharmony_ci out0 = __msa_copy_u_h((v8i16) dst_data, 0); 878cabdff1aSopenharmony_ci out1 = __msa_copy_u_h((v8i16) dst_data, 2); 879cabdff1aSopenharmony_ci 880cabdff1aSopenharmony_ci SH(out0, dst); 881cabdff1aSopenharmony_ci dst += stride; 882cabdff1aSopenharmony_ci SH(out1, dst); 883cabdff1aSopenharmony_ci} 884cabdff1aSopenharmony_ci 885cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst, 886cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 887cabdff1aSopenharmony_ci uint32_t coeff1) 
888cabdff1aSopenharmony_ci{ 889cabdff1aSopenharmony_ci uint16_t tp0, tp1, tp2, tp3; 890cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 891cabdff1aSopenharmony_ci v16u8 dst0, dst_data = { 0 }; 892cabdff1aSopenharmony_ci v8u16 res_r; 893cabdff1aSopenharmony_ci v16i8 mask; 894cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 895cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 896cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 897cabdff1aSopenharmony_ci 898cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[64]); 899cabdff1aSopenharmony_ci 900cabdff1aSopenharmony_ci LD_UB4(src, stride, src0, src1, src2, src3); 901cabdff1aSopenharmony_ci tp0 = LH(dst); 902cabdff1aSopenharmony_ci tp1 = LH(dst + stride); 903cabdff1aSopenharmony_ci tp2 = LH(dst + 2 * stride); 904cabdff1aSopenharmony_ci tp3 = LH(dst + 3 * stride); 905cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0); 906cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1); 907cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2); 908cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3); 909cabdff1aSopenharmony_ci 910cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); 911cabdff1aSopenharmony_ci 912cabdff1aSopenharmony_ci src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0); 913cabdff1aSopenharmony_ci 914cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h(src0, coeff_vec); 915cabdff1aSopenharmony_ci res_r <<= 3; 916cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 917cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 918cabdff1aSopenharmony_ci 919cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 920cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(dst0, dst_data); 921cabdff1aSopenharmony_ci 922cabdff1aSopenharmony_ci ST_H4(dst0, 0, 1, 2, 
3, dst, stride); 923cabdff1aSopenharmony_ci} 924cabdff1aSopenharmony_ci 925cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst, 926cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 927cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 928cabdff1aSopenharmony_ci{ 929cabdff1aSopenharmony_ci if (2 == height) { 930cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1); 931cabdff1aSopenharmony_ci } else if (4 == height) { 932cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1); 933cabdff1aSopenharmony_ci } 934cabdff1aSopenharmony_ci} 935cabdff1aSopenharmony_ci 936cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst, 937cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 938cabdff1aSopenharmony_ci uint32_t coeff1) 939cabdff1aSopenharmony_ci{ 940cabdff1aSopenharmony_ci uint32_t load0, load1; 941cabdff1aSopenharmony_ci v16i8 src0, src1; 942cabdff1aSopenharmony_ci v16u8 dst_data = { 0 }; 943cabdff1aSopenharmony_ci v8u16 res_r; 944cabdff1aSopenharmony_ci v16i8 res, mask; 945cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 946cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 947cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 948cabdff1aSopenharmony_ci 949cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 950cabdff1aSopenharmony_ci 951cabdff1aSopenharmony_ci LD_SB2(src, stride, src0, src1); 952cabdff1aSopenharmony_ci 953cabdff1aSopenharmony_ci LW2(dst, stride, load0, load1); 954cabdff1aSopenharmony_ci 955cabdff1aSopenharmony_ci INSERT_W2_UB(load0, load1, dst_data); 956cabdff1aSopenharmony_ci 957cabdff1aSopenharmony_ci src0 = __msa_vshf_b(mask, src1, src0); 958cabdff1aSopenharmony_ci 959cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec); 960cabdff1aSopenharmony_ci res_r <<= 3; 
961cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 962cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 963cabdff1aSopenharmony_ci res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 964cabdff1aSopenharmony_ci dst_data = __msa_aver_u_b((v16u8) res, dst_data); 965cabdff1aSopenharmony_ci 966cabdff1aSopenharmony_ci ST_W2(dst_data, 0, 1, dst, stride); 967cabdff1aSopenharmony_ci} 968cabdff1aSopenharmony_ci 969cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst, 970cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 971cabdff1aSopenharmony_ci uint32_t coeff1) 972cabdff1aSopenharmony_ci{ 973cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 974cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 975cabdff1aSopenharmony_ci v16u8 out, dst_data = { 0 }; 976cabdff1aSopenharmony_ci v16i8 mask; 977cabdff1aSopenharmony_ci v8u16 res0_r, res1_r; 978cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 979cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 980cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 981cabdff1aSopenharmony_ci 982cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 983cabdff1aSopenharmony_ci 984cabdff1aSopenharmony_ci LD_UB4(src, stride, src0, src1, src2, src3); 985cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 986cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data); 987cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); 988cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r); 989cabdff1aSopenharmony_ci res0_r <<= 3; 990cabdff1aSopenharmony_ci res1_r <<= 3; 991cabdff1aSopenharmony_ci SRARI_H2_UH(res0_r, res1_r, 6); 992cabdff1aSopenharmony_ci SAT_UH2_UH(res0_r, res1_r, 7); 993cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); 994cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, 
dst_data); 995cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 996cabdff1aSopenharmony_ci} 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst, 999cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1000cabdff1aSopenharmony_ci uint32_t coeff1) 1001cabdff1aSopenharmony_ci{ 1002cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 1003cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1; 1004cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }; 1005cabdff1aSopenharmony_ci v16i8 mask; 1006cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 1007cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1008cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1009cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1010cabdff1aSopenharmony_ci 1011cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 1012cabdff1aSopenharmony_ci 1013cabdff1aSopenharmony_ci LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 1014cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 1015cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 1016cabdff1aSopenharmony_ci LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 1017cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 1018cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2); 1019cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6); 1020cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1); 1021cabdff1aSopenharmony_ci DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3); 1022cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 1023cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 1024cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 1025cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, 
res0, res3, res2, out0, out1); 1026cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 1027cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 1028cabdff1aSopenharmony_ci} 1029cabdff1aSopenharmony_ci 1030cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst, 1031cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1032cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 1033cabdff1aSopenharmony_ci{ 1034cabdff1aSopenharmony_ci if (2 == height) { 1035cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1); 1036cabdff1aSopenharmony_ci } else if (4 == height) { 1037cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1); 1038cabdff1aSopenharmony_ci } else if (8 == height) { 1039cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1); 1040cabdff1aSopenharmony_ci } 1041cabdff1aSopenharmony_ci} 1042cabdff1aSopenharmony_ci 1043cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst, 1044cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1045cabdff1aSopenharmony_ci uint32_t coeff1) 1046cabdff1aSopenharmony_ci{ 1047cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 1048cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, out0, out1; 1049cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }; 1050cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 1051cabdff1aSopenharmony_ci v16i8 mask; 1052cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1053cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1054cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1055cabdff1aSopenharmony_ci 1056cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 1057cabdff1aSopenharmony_ci LD_UB4(src, stride, src0, src1, src2, src3); 1058cabdff1aSopenharmony_ci LD4(dst, 
stride, tp0, tp1, tp2, tp3); 1059cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1060cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1061cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1); 1062cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3); 1063cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 1064cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 1065cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 1066cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 1067cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 1068cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 1069cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 1070cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 1071cabdff1aSopenharmony_ci} 1072cabdff1aSopenharmony_ci 1073cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst, 1074cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1075cabdff1aSopenharmony_ci uint32_t coeff1) 1076cabdff1aSopenharmony_ci{ 1077cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 1078cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 1079cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 1080cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 1081cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3, res4, res5, res6, res7; 1082cabdff1aSopenharmony_ci v16i8 mask; 1083cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1084cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1085cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1086cabdff1aSopenharmony_ci 1087cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 1088cabdff1aSopenharmony_ci 1089cabdff1aSopenharmony_ci LD_UB8(src, 
stride, src0, src1, src2, src3, src4, src5, src6, src7); 1090cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 1091cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1092cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1093cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 1094cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 1095cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 1096cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1); 1097cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3); 1098cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5); 1099cabdff1aSopenharmony_ci VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7); 1100cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 1101cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 1102cabdff1aSopenharmony_ci DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec, 1103cabdff1aSopenharmony_ci coeff_vec, res4, res5, res6, res7); 1104cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 1105cabdff1aSopenharmony_ci SLLI_4V(res4, res5, res6, res7, 3); 1106cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 1107cabdff1aSopenharmony_ci SRARI_H4_UH(res4, res5, res6, res7, 6); 1108cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 1109cabdff1aSopenharmony_ci SAT_UH4_UH(res4, res5, res6, res7, 7); 1110cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 1111cabdff1aSopenharmony_ci PCKEV_B2_UB(res5, res4, res7, res6, out2, out3); 1112cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 1113cabdff1aSopenharmony_ci AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3); 1114cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1115cabdff1aSopenharmony_ci} 1116cabdff1aSopenharmony_ci 
1117cabdff1aSopenharmony_cistatic void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst, 1118cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1119cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 1120cabdff1aSopenharmony_ci{ 1121cabdff1aSopenharmony_ci if (4 == height) { 1122cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1); 1123cabdff1aSopenharmony_ci } else if (8 == height) { 1124cabdff1aSopenharmony_ci avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1); 1125cabdff1aSopenharmony_ci } 1126cabdff1aSopenharmony_ci} 1127cabdff1aSopenharmony_ci 1128cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst, 1129cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1130cabdff1aSopenharmony_ci uint32_t coeff1) 1131cabdff1aSopenharmony_ci{ 1132cabdff1aSopenharmony_ci uint16_t out0, out1; 1133cabdff1aSopenharmony_ci v16i8 src0, src1, src2, tmp0, tmp1, res; 1134cabdff1aSopenharmony_ci v16u8 dst_data = { 0 }; 1135cabdff1aSopenharmony_ci v8i16 out; 1136cabdff1aSopenharmony_ci v8u16 res_r; 1137cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1138cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1139cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1140cabdff1aSopenharmony_ci 1141cabdff1aSopenharmony_ci LD_SB3(src, stride, src0, src1, src2); 1142cabdff1aSopenharmony_ci out0 = LH(dst); 1143cabdff1aSopenharmony_ci out1 = LH(dst + stride); 1144cabdff1aSopenharmony_ci 1145cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0); 1146cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1); 1147cabdff1aSopenharmony_ci 1148cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1); 1149cabdff1aSopenharmony_ci 1150cabdff1aSopenharmony_ci tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); 1151cabdff1aSopenharmony_ci 
res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec); 1152cabdff1aSopenharmony_ci res_r <<= 3; 1153cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 1154cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 1155cabdff1aSopenharmony_ci res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 1156cabdff1aSopenharmony_ci out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data); 1157cabdff1aSopenharmony_ci out0 = __msa_copy_u_h(out, 0); 1158cabdff1aSopenharmony_ci out1 = __msa_copy_u_h(out, 2); 1159cabdff1aSopenharmony_ci 1160cabdff1aSopenharmony_ci SH(out0, dst); 1161cabdff1aSopenharmony_ci dst += stride; 1162cabdff1aSopenharmony_ci SH(out1, dst); 1163cabdff1aSopenharmony_ci} 1164cabdff1aSopenharmony_ci 1165cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst, 1166cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1167cabdff1aSopenharmony_ci uint32_t coeff1) 1168cabdff1aSopenharmony_ci{ 1169cabdff1aSopenharmony_ci uint16_t tp0, tp1, tp2, tp3; 1170cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 1171cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 1172cabdff1aSopenharmony_ci v8u16 res_r; 1173cabdff1aSopenharmony_ci v8i16 res; 1174cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1175cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1176cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1177cabdff1aSopenharmony_ci v16u8 dst_data = { 0 }; 1178cabdff1aSopenharmony_ci 1179cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1180cabdff1aSopenharmony_ci 1181cabdff1aSopenharmony_ci tp0 = LH(dst); 1182cabdff1aSopenharmony_ci tp1 = LH(dst + stride); 1183cabdff1aSopenharmony_ci tp2 = LH(dst + 2 * stride); 1184cabdff1aSopenharmony_ci tp3 = LH(dst + 3 * stride); 1185cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0); 1186cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) 
dst_data, 1, tp1); 1187cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2); 1188cabdff1aSopenharmony_ci dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3); 1189cabdff1aSopenharmony_ci 1190cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, 1191cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1192cabdff1aSopenharmony_ci ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 1193cabdff1aSopenharmony_ci 1194cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0); 1195cabdff1aSopenharmony_ci 1196cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h(tmp0, coeff_vec); 1197cabdff1aSopenharmony_ci res_r <<= 3; 1198cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 1199cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 1200cabdff1aSopenharmony_ci 1201cabdff1aSopenharmony_ci res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 1202cabdff1aSopenharmony_ci res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data); 1203cabdff1aSopenharmony_ci 1204cabdff1aSopenharmony_ci ST_H4(res, 0, 1, 2, 3, dst, stride); 1205cabdff1aSopenharmony_ci} 1206cabdff1aSopenharmony_ci 1207cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst, 1208cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1209cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 1210cabdff1aSopenharmony_ci{ 1211cabdff1aSopenharmony_ci if (2 == height) { 1212cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1); 1213cabdff1aSopenharmony_ci } else if (4 == height) { 1214cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1); 1215cabdff1aSopenharmony_ci } 1216cabdff1aSopenharmony_ci} 1217cabdff1aSopenharmony_ci 1218cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst, 1219cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 
1220cabdff1aSopenharmony_ci uint32_t coeff1) 1221cabdff1aSopenharmony_ci{ 1222cabdff1aSopenharmony_ci uint32_t load0, load1; 1223cabdff1aSopenharmony_ci v16u8 src0, src1, src2, tmp0, tmp1; 1224cabdff1aSopenharmony_ci v16u8 dst_data = { 0 }; 1225cabdff1aSopenharmony_ci v8u16 res_r; 1226cabdff1aSopenharmony_ci v16u8 res; 1227cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1228cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1229cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci LD_UB3(src, stride, src0, src1, src2); 1232cabdff1aSopenharmony_ci 1233cabdff1aSopenharmony_ci LW2(dst, stride, load0, load1); 1234cabdff1aSopenharmony_ci 1235cabdff1aSopenharmony_ci INSERT_W2_UB(load0, load1, dst_data); 1236cabdff1aSopenharmony_ci ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1); 1237cabdff1aSopenharmony_ci 1238cabdff1aSopenharmony_ci tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0); 1239cabdff1aSopenharmony_ci 1240cabdff1aSopenharmony_ci res_r = __msa_dotp_u_h(tmp0, coeff_vec); 1241cabdff1aSopenharmony_ci res_r <<= 3; 1242cabdff1aSopenharmony_ci res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6); 1243cabdff1aSopenharmony_ci res_r = __msa_sat_u_h(res_r, 7); 1244cabdff1aSopenharmony_ci res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r); 1245cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, dst_data); 1246cabdff1aSopenharmony_ci 1247cabdff1aSopenharmony_ci ST_W2(res, 0, 1, dst, stride); 1248cabdff1aSopenharmony_ci} 1249cabdff1aSopenharmony_ci 1250cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst, 1251cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1252cabdff1aSopenharmony_ci uint32_t coeff1) 1253cabdff1aSopenharmony_ci{ 1254cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 1255cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 1256cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, 
tmp3; 1257cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }; 1258cabdff1aSopenharmony_ci v8u16 res0_r, res1_r; 1259cabdff1aSopenharmony_ci v16u8 out; 1260cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1261cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1262cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1263cabdff1aSopenharmony_ci 1264cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1265cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 1266cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 1267cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2, 1268cabdff1aSopenharmony_ci tmp3); 1269cabdff1aSopenharmony_ci ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 1270cabdff1aSopenharmony_ci DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r); 1271cabdff1aSopenharmony_ci res0_r <<= 3; 1272cabdff1aSopenharmony_ci res1_r <<= 3; 1273cabdff1aSopenharmony_ci SRARI_H2_UH(res0_r, res1_r, 6); 1274cabdff1aSopenharmony_ci SAT_UH2_UH(res0_r, res1_r, 7); 1275cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); 1276cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dst0); 1277cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 1278cabdff1aSopenharmony_ci} 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst, 1281cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1282cabdff1aSopenharmony_ci uint32_t coeff1) 1283cabdff1aSopenharmony_ci{ 1284cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 1285cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1286cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1; 1287cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }; 1288cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 
1289cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1290cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1291cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1292cabdff1aSopenharmony_ci 1293cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1294cabdff1aSopenharmony_ci src += (5 * stride); 1295cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 1296cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 1297cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 1298cabdff1aSopenharmony_ci LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 1299cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 1300cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2, 1301cabdff1aSopenharmony_ci tmp3); 1302cabdff1aSopenharmony_ci ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6, 1303cabdff1aSopenharmony_ci tmp7); 1304cabdff1aSopenharmony_ci ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2); 1305cabdff1aSopenharmony_ci ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6); 1306cabdff1aSopenharmony_ci DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1); 1307cabdff1aSopenharmony_ci DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3); 1308cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 1309cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 1310cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 1311cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 1312cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 1313cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 1314cabdff1aSopenharmony_ci} 1315cabdff1aSopenharmony_ci 1316cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst, 1317cabdff1aSopenharmony_ci int32_t stride, uint32_t 
coeff0, 1318cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 1319cabdff1aSopenharmony_ci{ 1320cabdff1aSopenharmony_ci if (2 == height) { 1321cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1); 1322cabdff1aSopenharmony_ci } else if (4 == height) { 1323cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1); 1324cabdff1aSopenharmony_ci } else if (8 == height) { 1325cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1); 1326cabdff1aSopenharmony_ci } 1327cabdff1aSopenharmony_ci} 1328cabdff1aSopenharmony_ci 1329cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst, 1330cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1331cabdff1aSopenharmony_ci uint32_t coeff1) 1332cabdff1aSopenharmony_ci{ 1333cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 1334cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 1335cabdff1aSopenharmony_ci v16u8 out0, out1; 1336cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3; 1337cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }; 1338cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1339cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1340cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1341cabdff1aSopenharmony_ci 1342cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1343cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 1344cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1345cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1346cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, 1347cabdff1aSopenharmony_ci src0, src1, src2, src3); 1348cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 1349cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 
1350cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 1351cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 1352cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 1353cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 1354cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 1355cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1356cabdff1aSopenharmony_ci} 1357cabdff1aSopenharmony_ci 1358cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst, 1359cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1360cabdff1aSopenharmony_ci uint32_t coeff1) 1361cabdff1aSopenharmony_ci{ 1362cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 1363cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1364cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 1365cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 1366cabdff1aSopenharmony_ci v8u16 res0, res1, res2, res3, res4, res5, res6, res7; 1367cabdff1aSopenharmony_ci v16i8 coeff_vec0 = __msa_fill_b(coeff0); 1368cabdff1aSopenharmony_ci v16i8 coeff_vec1 = __msa_fill_b(coeff1); 1369cabdff1aSopenharmony_ci v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1); 1370cabdff1aSopenharmony_ci 1371cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1372cabdff1aSopenharmony_ci src += (5 * stride); 1373cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 1374cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 1375cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1376cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1377cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 1378cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 1379cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 1380cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, 
src2, src4, src3, 1381cabdff1aSopenharmony_ci src0, src1, src2, src3); 1382cabdff1aSopenharmony_ci ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, 1383cabdff1aSopenharmony_ci src4, src5, src6, src7); 1384cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec, 1385cabdff1aSopenharmony_ci coeff_vec, res0, res1, res2, res3); 1386cabdff1aSopenharmony_ci DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec, 1387cabdff1aSopenharmony_ci coeff_vec, res4, res5, res6, res7); 1388cabdff1aSopenharmony_ci SLLI_4V(res0, res1, res2, res3, 3); 1389cabdff1aSopenharmony_ci SLLI_4V(res4, res5, res6, res7, 3); 1390cabdff1aSopenharmony_ci SRARI_H4_UH(res0, res1, res2, res3, 6); 1391cabdff1aSopenharmony_ci SRARI_H4_UH(res4, res5, res6, res7, 6); 1392cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 1393cabdff1aSopenharmony_ci SAT_UH4_UH(res0, res1, res2, res3, 7); 1394cabdff1aSopenharmony_ci PCKEV_B2_UB(res1, res0, res3, res2, out0, out1); 1395cabdff1aSopenharmony_ci PCKEV_B2_UB(res5, res4, res7, res6, out2, out3); 1396cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 1397cabdff1aSopenharmony_ci AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3); 1398cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1399cabdff1aSopenharmony_ci} 1400cabdff1aSopenharmony_ci 1401cabdff1aSopenharmony_cistatic void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst, 1402cabdff1aSopenharmony_ci int32_t stride, uint32_t coeff0, 1403cabdff1aSopenharmony_ci uint32_t coeff1, int32_t height) 1404cabdff1aSopenharmony_ci{ 1405cabdff1aSopenharmony_ci if (4 == height) { 1406cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1); 1407cabdff1aSopenharmony_ci } else if (8 == height) { 1408cabdff1aSopenharmony_ci avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1); 1409cabdff1aSopenharmony_ci } 1410cabdff1aSopenharmony_ci} 
1411cabdff1aSopenharmony_ci 1412cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst, 1413cabdff1aSopenharmony_ci int32_t stride, 1414cabdff1aSopenharmony_ci uint32_t coef_hor0, 1415cabdff1aSopenharmony_ci uint32_t coef_hor1, 1416cabdff1aSopenharmony_ci uint32_t coef_ver0, 1417cabdff1aSopenharmony_ci uint32_t coef_ver1) 1418cabdff1aSopenharmony_ci{ 1419cabdff1aSopenharmony_ci uint16_t out0, out1; 1420cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }; 1421cabdff1aSopenharmony_ci v16u8 src0, src1, src2; 1422cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_vt0, res_vt1; 1423cabdff1aSopenharmony_ci v16i8 res, mask; 1424cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 1425cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 1426cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 1427cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 1428cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 1429cabdff1aSopenharmony_ci 1430cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[48]); 1431cabdff1aSopenharmony_ci 1432cabdff1aSopenharmony_ci LD_UB3(src, stride, src0, src1, src2); 1433cabdff1aSopenharmony_ci out0 = LH(dst); 1434cabdff1aSopenharmony_ci out1 = LH(dst + stride); 1435cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0); 1436cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1); 1437cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); 1438cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); 1439cabdff1aSopenharmony_ci MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 1440cabdff1aSopenharmony_ci 1441cabdff1aSopenharmony_ci res_vt0 += res_vt1; 1442cabdff1aSopenharmony_ci res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); 1443cabdff1aSopenharmony_ci 
res_vt0 = __msa_sat_u_h(res_vt0, 7); 1444cabdff1aSopenharmony_ci res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); 1445cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b((v16u8) res, dst0); 1446cabdff1aSopenharmony_ci out0 = __msa_copy_u_h((v8i16) dst0, 0); 1447cabdff1aSopenharmony_ci out1 = __msa_copy_u_h((v8i16) dst0, 1); 1448cabdff1aSopenharmony_ci 1449cabdff1aSopenharmony_ci SH(out0, dst); 1450cabdff1aSopenharmony_ci dst += stride; 1451cabdff1aSopenharmony_ci SH(out1, dst); 1452cabdff1aSopenharmony_ci} 1453cabdff1aSopenharmony_ci 1454cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst, 1455cabdff1aSopenharmony_ci int32_t stride, 1456cabdff1aSopenharmony_ci uint32_t coef_hor0, 1457cabdff1aSopenharmony_ci uint32_t coef_hor1, 1458cabdff1aSopenharmony_ci uint32_t coef_ver0, 1459cabdff1aSopenharmony_ci uint32_t coef_ver1) 1460cabdff1aSopenharmony_ci{ 1461cabdff1aSopenharmony_ci uint16_t tp0, tp1, tp2, tp3; 1462cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 1463cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 1464cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }; 1465cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_vt0, res_vt1; 1466cabdff1aSopenharmony_ci v16i8 res, mask; 1467cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 1468cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 1469cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 1470cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 1471cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 1472cabdff1aSopenharmony_ci 1473cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[48]); 1474cabdff1aSopenharmony_ci 1475cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1476cabdff1aSopenharmony_ci tp0 = LH(dst); 1477cabdff1aSopenharmony_ci tp1 = LH(dst + stride); 1478cabdff1aSopenharmony_ci tp2 = 
LH(dst + 2 * stride); 1479cabdff1aSopenharmony_ci tp3 = LH(dst + 3 * stride); 1480cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0); 1481cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1); 1482cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2); 1483cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3); 1484cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1); 1485cabdff1aSopenharmony_ci VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3); 1486cabdff1aSopenharmony_ci ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 1487cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); 1488cabdff1aSopenharmony_ci MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 1489cabdff1aSopenharmony_ci 1490cabdff1aSopenharmony_ci res_vt0 += res_vt1; 1491cabdff1aSopenharmony_ci res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); 1492cabdff1aSopenharmony_ci res_vt0 = __msa_sat_u_h(res_vt0, 7); 1493cabdff1aSopenharmony_ci res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); 1494cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b((v16u8) res, dst0); 1495cabdff1aSopenharmony_ci 1496cabdff1aSopenharmony_ci ST_H4(dst0, 0, 1, 2, 3, dst, stride); 1497cabdff1aSopenharmony_ci} 1498cabdff1aSopenharmony_ci 1499cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst, 1500cabdff1aSopenharmony_ci int32_t stride, 1501cabdff1aSopenharmony_ci uint32_t coef_hor0, 1502cabdff1aSopenharmony_ci uint32_t coef_hor1, 1503cabdff1aSopenharmony_ci uint32_t coef_ver0, 1504cabdff1aSopenharmony_ci uint32_t coef_ver1, 1505cabdff1aSopenharmony_ci int32_t height) 1506cabdff1aSopenharmony_ci{ 1507cabdff1aSopenharmony_ci if (2 == height) { 1508cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0, 1509cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 
1510cabdff1aSopenharmony_ci } else if (4 == height) { 1511cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0, 1512cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1513cabdff1aSopenharmony_ci } 1514cabdff1aSopenharmony_ci} 1515cabdff1aSopenharmony_ci 1516cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst, 1517cabdff1aSopenharmony_ci int32_t stride, 1518cabdff1aSopenharmony_ci uint32_t coef_hor0, 1519cabdff1aSopenharmony_ci uint32_t coef_hor1, 1520cabdff1aSopenharmony_ci uint32_t coef_ver0, 1521cabdff1aSopenharmony_ci uint32_t coef_ver1) 1522cabdff1aSopenharmony_ci{ 1523cabdff1aSopenharmony_ci uint32_t tp0, tp1; 1524cabdff1aSopenharmony_ci v16u8 src0, src1, src2; 1525cabdff1aSopenharmony_ci v16u8 dst0, dst_data = { 0 }; 1526cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_vt0, res_vt1; 1527cabdff1aSopenharmony_ci v16i8 mask; 1528cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 1529cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 1530cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 1531cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 1532cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 1533cabdff1aSopenharmony_ci 1534cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 1535cabdff1aSopenharmony_ci 1536cabdff1aSopenharmony_ci LD_UB3(src, stride, src0, src1, src2); 1537cabdff1aSopenharmony_ci LW2(dst, stride, tp0, tp1); 1538cabdff1aSopenharmony_ci INSERT_W2_UB(tp0, tp1, dst_data); 1539cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); 1540cabdff1aSopenharmony_ci DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1); 1541cabdff1aSopenharmony_ci MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1); 1542cabdff1aSopenharmony_ci 1543cabdff1aSopenharmony_ci 
res_vt0 += res_vt1; 1544cabdff1aSopenharmony_ci res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6); 1545cabdff1aSopenharmony_ci res_vt0 = __msa_sat_u_h(res_vt0, 7); 1546cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0); 1547cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(dst0, dst_data); 1548cabdff1aSopenharmony_ci 1549cabdff1aSopenharmony_ci ST_W2(dst0, 0, 1, dst, stride); 1550cabdff1aSopenharmony_ci} 1551cabdff1aSopenharmony_ci 1552cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst, 1553cabdff1aSopenharmony_ci int32_t stride, 1554cabdff1aSopenharmony_ci uint32_t coef_hor0, 1555cabdff1aSopenharmony_ci uint32_t coef_hor1, 1556cabdff1aSopenharmony_ci uint32_t coef_ver0, 1557cabdff1aSopenharmony_ci uint32_t coef_ver1) 1558cabdff1aSopenharmony_ci{ 1559cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 1560cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 1561cabdff1aSopenharmony_ci v16u8 out, dst_data = { 0 }; 1562cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2, res_hz3; 1563cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3; 1564cabdff1aSopenharmony_ci v16i8 mask; 1565cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 1566cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 1567cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 1568cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 1569cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 1570cabdff1aSopenharmony_ci 1571cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 1572cabdff1aSopenharmony_ci 1573cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1574cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 1575cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data); 1576cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, 
src1, src2, mask, mask, src0, src1); 1577cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3); 1578cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, 1579cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, 1580cabdff1aSopenharmony_ci res_hz3); 1581cabdff1aSopenharmony_ci MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, 1582cabdff1aSopenharmony_ci res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 1583cabdff1aSopenharmony_ci ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1); 1584cabdff1aSopenharmony_ci SRARI_H2_UH(res_vt0, res_vt1, 6); 1585cabdff1aSopenharmony_ci SAT_UH2_UH(res_vt0, res_vt1, 7); 1586cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0); 1587cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dst_data); 1588cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 1589cabdff1aSopenharmony_ci} 1590cabdff1aSopenharmony_ci 1591cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst, 1592cabdff1aSopenharmony_ci int32_t stride, 1593cabdff1aSopenharmony_ci uint32_t coef_hor0, 1594cabdff1aSopenharmony_ci uint32_t coef_hor1, 1595cabdff1aSopenharmony_ci uint32_t coef_ver0, 1596cabdff1aSopenharmony_ci uint32_t coef_ver1) 1597cabdff1aSopenharmony_ci{ 1598cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 1599cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1; 1600cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }; 1601cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7; 1602cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7; 1603cabdff1aSopenharmony_ci v16i8 mask; 1604cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 1605cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = 
__msa_fill_b(coef_hor1); 1606cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 1607cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 1608cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 1609cabdff1aSopenharmony_ci 1610cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[0]); 1611cabdff1aSopenharmony_ci 1612cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1613cabdff1aSopenharmony_ci src += (5 * stride); 1614cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 1615cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 1616cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 1617cabdff1aSopenharmony_ci LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 1618cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 1619cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1); 1620cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3); 1621cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5); 1622cabdff1aSopenharmony_ci VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7); 1623cabdff1aSopenharmony_ci DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, 1624cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3); 1625cabdff1aSopenharmony_ci DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec, 1626cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7); 1627cabdff1aSopenharmony_ci MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, 1628cabdff1aSopenharmony_ci res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 1629cabdff1aSopenharmony_ci MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1, 1630cabdff1aSopenharmony_ci res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, 
res_vt7); 1631cabdff1aSopenharmony_ci ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1); 1632cabdff1aSopenharmony_ci ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3); 1633cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); 1634cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); 1635cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1); 1636cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 1637cabdff1aSopenharmony_ci ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 1638cabdff1aSopenharmony_ci} 1639cabdff1aSopenharmony_ci 1640cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst, 1641cabdff1aSopenharmony_ci int32_t stride, 1642cabdff1aSopenharmony_ci uint32_t coef_hor0, 1643cabdff1aSopenharmony_ci uint32_t coef_hor1, 1644cabdff1aSopenharmony_ci uint32_t coef_ver0, 1645cabdff1aSopenharmony_ci uint32_t coef_ver1, 1646cabdff1aSopenharmony_ci int32_t height) 1647cabdff1aSopenharmony_ci{ 1648cabdff1aSopenharmony_ci if (2 == height) { 1649cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0, 1650cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1651cabdff1aSopenharmony_ci } else if (4 == height) { 1652cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0, 1653cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1654cabdff1aSopenharmony_ci } else if (8 == height) { 1655cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0, 1656cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1657cabdff1aSopenharmony_ci } 1658cabdff1aSopenharmony_ci} 1659cabdff1aSopenharmony_ci 1660cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst, 1661cabdff1aSopenharmony_ci int32_t stride, 1662cabdff1aSopenharmony_ci uint32_t coef_hor0, 1663cabdff1aSopenharmony_ci uint32_t 
coef_hor1, 1664cabdff1aSopenharmony_ci uint32_t coef_ver0, 1665cabdff1aSopenharmony_ci uint32_t coef_ver1) 1666cabdff1aSopenharmony_ci{ 1667cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 1668cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, out0, out1; 1669cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2; 1670cabdff1aSopenharmony_ci v8u16 res_hz3, res_hz4; 1671cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3; 1672cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }; 1673cabdff1aSopenharmony_ci v16i8 mask; 1674cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 1675cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 1676cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 1677cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 1678cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 1679cabdff1aSopenharmony_ci 1680cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 1681cabdff1aSopenharmony_ci 1682cabdff1aSopenharmony_ci src0 = LD_UB(src); 1683cabdff1aSopenharmony_ci src += stride; 1684cabdff1aSopenharmony_ci src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); 1685cabdff1aSopenharmony_ci res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec); 1686cabdff1aSopenharmony_ci LD_UB4(src, stride, src1, src2, src3, src4); 1687cabdff1aSopenharmony_ci src += (4 * stride); 1688cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 1689cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1690cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1691cabdff1aSopenharmony_ci VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2); 1692cabdff1aSopenharmony_ci VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4); 1693cabdff1aSopenharmony_ci DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec, 1694cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, 
res_hz3, res_hz4); 1695cabdff1aSopenharmony_ci MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0, 1696cabdff1aSopenharmony_ci res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3); 1697cabdff1aSopenharmony_ci res_vt0 += (res_hz0 * coeff_vt_vec1); 1698cabdff1aSopenharmony_ci res_vt1 += (res_hz1 * coeff_vt_vec1); 1699cabdff1aSopenharmony_ci res_vt2 += (res_hz2 * coeff_vt_vec1); 1700cabdff1aSopenharmony_ci res_vt3 += (res_hz3 * coeff_vt_vec1); 1701cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); 1702cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); 1703cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1); 1704cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 1705cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1706cabdff1aSopenharmony_ci} 1707cabdff1aSopenharmony_ci 1708cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst, 1709cabdff1aSopenharmony_ci int32_t stride, 1710cabdff1aSopenharmony_ci uint32_t coef_hor0, 1711cabdff1aSopenharmony_ci uint32_t coef_hor1, 1712cabdff1aSopenharmony_ci uint32_t coef_ver0, 1713cabdff1aSopenharmony_ci uint32_t coef_ver1) 1714cabdff1aSopenharmony_ci{ 1715cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 1716cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1717cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 1718cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 1719cabdff1aSopenharmony_ci v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4; 1720cabdff1aSopenharmony_ci v8u16 res_hz5, res_hz6, res_hz7, res_hz8; 1721cabdff1aSopenharmony_ci v8u16 res_vt0, res_vt1, res_vt2, res_vt3; 1722cabdff1aSopenharmony_ci v8u16 res_vt4, res_vt5, res_vt6, res_vt7; 1723cabdff1aSopenharmony_ci v16i8 mask; 1724cabdff1aSopenharmony_ci v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0); 
1725cabdff1aSopenharmony_ci v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1); 1726cabdff1aSopenharmony_ci v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1); 1727cabdff1aSopenharmony_ci v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0); 1728cabdff1aSopenharmony_ci v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1); 1729cabdff1aSopenharmony_ci 1730cabdff1aSopenharmony_ci mask = LD_SB(&chroma_mask_arr[32]); 1731cabdff1aSopenharmony_ci 1732cabdff1aSopenharmony_ci LD_UB5(src, stride, src0, src1, src2, src3, src4); 1733cabdff1aSopenharmony_ci src += (5 * stride); 1734cabdff1aSopenharmony_ci LD_UB4(src, stride, src5, src6, src7, src8); 1735cabdff1aSopenharmony_ci src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0); 1736cabdff1aSopenharmony_ci VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2); 1737cabdff1aSopenharmony_ci VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4); 1738cabdff1aSopenharmony_ci VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6); 1739cabdff1aSopenharmony_ci VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8); 1740cabdff1aSopenharmony_ci res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec); 1741cabdff1aSopenharmony_ci DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec, 1742cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, 1743cabdff1aSopenharmony_ci res_hz4); 1744cabdff1aSopenharmony_ci DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, 1745cabdff1aSopenharmony_ci coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8); 1746cabdff1aSopenharmony_ci MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, 1747cabdff1aSopenharmony_ci coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, 1748cabdff1aSopenharmony_ci res_vt3); 1749cabdff1aSopenharmony_ci MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7, 1750cabdff1aSopenharmony_ci coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, 
1751cabdff1aSopenharmony_ci res_vt7); 1752cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 1753cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1754cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1755cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 1756cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 1757cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 1758cabdff1aSopenharmony_ci res_vt0 += (res_hz0 * coeff_vt_vec1); 1759cabdff1aSopenharmony_ci res_vt1 += (res_hz1 * coeff_vt_vec1); 1760cabdff1aSopenharmony_ci res_vt2 += (res_hz2 * coeff_vt_vec1); 1761cabdff1aSopenharmony_ci res_vt3 += (res_hz3 * coeff_vt_vec1); 1762cabdff1aSopenharmony_ci res_vt4 += (res_hz4 * coeff_vt_vec1); 1763cabdff1aSopenharmony_ci res_vt5 += (res_hz5 * coeff_vt_vec1); 1764cabdff1aSopenharmony_ci res_vt6 += (res_hz6 * coeff_vt_vec1); 1765cabdff1aSopenharmony_ci res_vt7 += (res_hz7 * coeff_vt_vec1); 1766cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6); 1767cabdff1aSopenharmony_ci SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6); 1768cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7); 1769cabdff1aSopenharmony_ci SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7); 1770cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1); 1771cabdff1aSopenharmony_ci PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3); 1772cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 1773cabdff1aSopenharmony_ci AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3); 1774cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1775cabdff1aSopenharmony_ci} 1776cabdff1aSopenharmony_ci 1777cabdff1aSopenharmony_cistatic void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst, 1778cabdff1aSopenharmony_ci int32_t stride, 1779cabdff1aSopenharmony_ci uint32_t coef_hor0, 1780cabdff1aSopenharmony_ci uint32_t coef_hor1, 
1781cabdff1aSopenharmony_ci uint32_t coef_ver0, 1782cabdff1aSopenharmony_ci uint32_t coef_ver1, 1783cabdff1aSopenharmony_ci int32_t height) 1784cabdff1aSopenharmony_ci{ 1785cabdff1aSopenharmony_ci if (4 == height) { 1786cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0, 1787cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1788cabdff1aSopenharmony_ci } else if (8 == height) { 1789cabdff1aSopenharmony_ci avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0, 1790cabdff1aSopenharmony_ci coef_hor1, coef_ver0, coef_ver1); 1791cabdff1aSopenharmony_ci } 1792cabdff1aSopenharmony_ci} 1793cabdff1aSopenharmony_ci 1794cabdff1aSopenharmony_cistatic void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 1795cabdff1aSopenharmony_ci int32_t height) 1796cabdff1aSopenharmony_ci{ 1797cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; 1798cabdff1aSopenharmony_ci 1799cabdff1aSopenharmony_ci if (8 == height) { 1800cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 1801cabdff1aSopenharmony_ci src += 4 * stride; 1802cabdff1aSopenharmony_ci LW4(src, stride, tp4, tp5, tp6, tp7); 1803cabdff1aSopenharmony_ci SW4(tp0, tp1, tp2, tp3, dst, stride); 1804cabdff1aSopenharmony_ci dst += 4 * stride; 1805cabdff1aSopenharmony_ci SW4(tp4, tp5, tp6, tp7, dst, stride); 1806cabdff1aSopenharmony_ci } else if (4 == height) { 1807cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 1808cabdff1aSopenharmony_ci SW4(tp0, tp1, tp2, tp3, dst, stride); 1809cabdff1aSopenharmony_ci } else if (2 == height) { 1810cabdff1aSopenharmony_ci LW2(src, stride, tp0, tp1); 1811cabdff1aSopenharmony_ci SW(tp0, dst); 1812cabdff1aSopenharmony_ci dst += stride; 1813cabdff1aSopenharmony_ci SW(tp1, dst); 1814cabdff1aSopenharmony_ci } 1815cabdff1aSopenharmony_ci} 1816cabdff1aSopenharmony_ci 1817cabdff1aSopenharmony_cistatic void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 1818cabdff1aSopenharmony_ci int32_t height) 
1819cabdff1aSopenharmony_ci{ 1820cabdff1aSopenharmony_ci uint64_t src0, src1, src2, src3, src4, src5, src6, src7; 1821cabdff1aSopenharmony_ci 1822cabdff1aSopenharmony_ci if (8 == height) { 1823cabdff1aSopenharmony_ci LD4(src, stride, src0, src1, src2, src3); 1824cabdff1aSopenharmony_ci src += 4 * stride; 1825cabdff1aSopenharmony_ci LD4(src, stride, src4, src5, src6, src7); 1826cabdff1aSopenharmony_ci SD4(src0, src1, src2, src3, dst, stride); 1827cabdff1aSopenharmony_ci dst += 4 * stride; 1828cabdff1aSopenharmony_ci SD4(src4, src5, src6, src7, dst, stride); 1829cabdff1aSopenharmony_ci } else if (4 == height) { 1830cabdff1aSopenharmony_ci LD4(src, stride, src0, src1, src2, src3); 1831cabdff1aSopenharmony_ci SD4(src0, src1, src2, src3, dst, stride); 1832cabdff1aSopenharmony_ci } 1833cabdff1aSopenharmony_ci} 1834cabdff1aSopenharmony_ci 1835cabdff1aSopenharmony_cistatic void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride, 1836cabdff1aSopenharmony_ci int32_t height) 1837cabdff1aSopenharmony_ci{ 1838cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 1839cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 }; 1840cabdff1aSopenharmony_ci 1841cabdff1aSopenharmony_ci if (8 == height) { 1842cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 1843cabdff1aSopenharmony_ci src += 4 * stride; 1844cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 1845cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 1846cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src1); 1847cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 1848cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 1849cabdff1aSopenharmony_ci LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 1850cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 1851cabdff1aSopenharmony_ci AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); 1852cabdff1aSopenharmony_ci ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 
1853cabdff1aSopenharmony_ci } else if (4 == height) { 1854cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 1855cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 1856cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 1857cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 1858cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(src0, dst0); 1859cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, stride); 1860cabdff1aSopenharmony_ci } else if (2 == height) { 1861cabdff1aSopenharmony_ci LW2(src, stride, tp0, tp1); 1862cabdff1aSopenharmony_ci INSERT_W2_UB(tp0, tp1, src0); 1863cabdff1aSopenharmony_ci LW2(dst, stride, tp0, tp1); 1864cabdff1aSopenharmony_ci INSERT_W2_UB(tp0, tp1, dst0); 1865cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(src0, dst0); 1866cabdff1aSopenharmony_ci ST_W2(dst0, 0, 1, dst, stride); 1867cabdff1aSopenharmony_ci } 1868cabdff1aSopenharmony_ci} 1869cabdff1aSopenharmony_ci 1870cabdff1aSopenharmony_cistatic void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride, 1871cabdff1aSopenharmony_ci int32_t height) 1872cabdff1aSopenharmony_ci{ 1873cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; 1874cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 1875cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 1876cabdff1aSopenharmony_ci 1877cabdff1aSopenharmony_ci if (8 == height) { 1878cabdff1aSopenharmony_ci LD4(src, stride, tp0, tp1, tp2, tp3); 1879cabdff1aSopenharmony_ci src += 4 * stride; 1880cabdff1aSopenharmony_ci LD4(src, stride, tp4, tp5, tp6, tp7); 1881cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 1882cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 1883cabdff1aSopenharmony_ci INSERT_D2_UB(tp4, tp5, src2); 1884cabdff1aSopenharmony_ci INSERT_D2_UB(tp6, tp7, src3); 1885cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 1886cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp4, tp5, tp6, 
tp7); 1887cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1888cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1889cabdff1aSopenharmony_ci INSERT_D2_UB(tp4, tp5, dst2); 1890cabdff1aSopenharmony_ci INSERT_D2_UB(tp6, tp7, dst3); 1891cabdff1aSopenharmony_ci AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, 1892cabdff1aSopenharmony_ci dst2, dst3); 1893cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1894cabdff1aSopenharmony_ci } else if (4 == height) { 1895cabdff1aSopenharmony_ci LD4(src, stride, tp0, tp1, tp2, tp3); 1896cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 1897cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 1898cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 1899cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 1900cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 1901cabdff1aSopenharmony_ci AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); 1902cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 1903cabdff1aSopenharmony_ci } 1904cabdff1aSopenharmony_ci} 1905cabdff1aSopenharmony_ci 1906cabdff1aSopenharmony_civoid ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, 1907cabdff1aSopenharmony_ci ptrdiff_t stride, int height, int x, int y) 1908cabdff1aSopenharmony_ci{ 1909cabdff1aSopenharmony_ci av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 1910cabdff1aSopenharmony_ci 1911cabdff1aSopenharmony_ci if (x && y) { 1912cabdff1aSopenharmony_ci avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height); 1913cabdff1aSopenharmony_ci } else if (x) { 1914cabdff1aSopenharmony_ci avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height); 1915cabdff1aSopenharmony_ci } else if (y) { 1916cabdff1aSopenharmony_ci avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height); 1917cabdff1aSopenharmony_ci } else { 1918cabdff1aSopenharmony_ci copy_width8_msa(src, dst, stride, height); 1919cabdff1aSopenharmony_ci } 1920cabdff1aSopenharmony_ci} 
/*
 * Public "put" entry point for 4-pixel-wide chroma blocks.
 *
 * x and y must lie in [0, 7] (checked with av_assert2).  Each non-zero
 * offset is handed to the kernels as the weight pair (offset, 8 - offset);
 * when both are zero the block is copied unchanged.
 */
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width4_msa(src, dst, stride, height);
    }
}

/*
 * Public "put" entry point for 2-pixel-wide chroma blocks.
 *
 * x and y must lie in [0, 7] (checked with av_assert2).  Each non-zero
 * offset is handed to the kernels as the weight pair (offset, 8 - offset).
 * When both are zero, 'height' rows of two bytes are copied from src to dst.
 */
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            /*
             * Copy the two bytes individually instead of through a
             * (uint16_t *) cast: neither pointer is known to be 2-byte
             * aligned, and reading uint8_t storage through a uint16_t
             * lvalue is undefined behaviour.  Both bytes are read before
             * either is written, like the single 16-bit access they replace.
             */
            const uint8_t p0 = src[0];
            const uint8_t p1 = src[1];

            dst[0] = p0;
            dst[1] = p1;

            src += stride;
            dst += stride;
        }
    }
}

/*
 * Public "avg" entry point for 8-pixel-wide chroma blocks: same dispatch as
 * the "put" variant, but every path averages its result with the pixels
 * already in dst.
 *
 * x and y must lie in [0, 7] (checked with av_assert2).
 */
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        avg_width8_msa(src, dst, stride, height);
    }
}

/*
 * Public "avg" entry point for 4-pixel-wide chroma blocks: same dispatch as
 * the "put" variant, but every path averages its result with the pixels
 * already in dst.
 *
 * x and y must lie in [0, 7] (checked with av_assert2).
 */
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        avg_width4_msa(src, dst, stride, height);
    }
}

/*
 * Public "avg" entry point for 2-pixel-wide chroma blocks.
 *
 * x and y must lie in [0, 7] (checked with av_assert2).  When both are zero
 * each of the two bytes per row is replaced by the rounded-up mean of the
 * source and destination byte, (dst + src + 1) >> 1.
 */
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            dst[0] = (dst[0] + src[0] + 1) >> 1;
            dst[1] = (dst[1] + src[1] + 1) >> 1;

            src += stride;
            dst += stride;
        }
    }
}