1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h" 23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { 26cabdff1aSopenharmony_ci /* 8 width cases */ 27cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 29cabdff1aSopenharmony_ci}; 30cabdff1aSopenharmony_ci 31cabdff1aSopenharmony_ci#define HEVC_BIW_RND_CLIP2(in0, in1, 
vec0, vec1, wgt, rnd, offset, \ 32cabdff1aSopenharmony_ci out0, out1) \ 33cabdff1aSopenharmony_ci{ \ 34cabdff1aSopenharmony_ci v4i32 out0_r, out1_r, out0_l, out1_l; \ 35cabdff1aSopenharmony_ci \ 36cabdff1aSopenharmony_ci ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \ 37cabdff1aSopenharmony_ci ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \ 38cabdff1aSopenharmony_ci \ 39cabdff1aSopenharmony_ci out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \ 40cabdff1aSopenharmony_ci out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \ 41cabdff1aSopenharmony_ci out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \ 42cabdff1aSopenharmony_ci out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \ 43cabdff1aSopenharmony_ci \ 44cabdff1aSopenharmony_ci SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ 45cabdff1aSopenharmony_ci PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \ 46cabdff1aSopenharmony_ci CLIP_SH2_0_255(out0, out1); \ 47cabdff1aSopenharmony_ci} 48cabdff1aSopenharmony_ci 49cabdff1aSopenharmony_ci#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \ 50cabdff1aSopenharmony_ci wgt, rnd, offset, out0, out1, out2, out3) \ 51cabdff1aSopenharmony_ci{ \ 52cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \ 53cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \ 54cabdff1aSopenharmony_ci} 55cabdff1aSopenharmony_ci 56cabdff1aSopenharmony_ci#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, \ 57cabdff1aSopenharmony_ci offset, out0, out1) \ 58cabdff1aSopenharmony_ci{ \ 59cabdff1aSopenharmony_ci v4i32 out0_r, out1_r, out0_l, out1_l; \ 60cabdff1aSopenharmony_ci \ 61cabdff1aSopenharmony_ci ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \ 62cabdff1aSopenharmony_ci ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \ 63cabdff1aSopenharmony_ci out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) 
wgt); \ 64cabdff1aSopenharmony_ci out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \ 65cabdff1aSopenharmony_ci out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \ 66cabdff1aSopenharmony_ci out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \ 67cabdff1aSopenharmony_ci SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ 68cabdff1aSopenharmony_ci PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \ 69cabdff1aSopenharmony_ci CLIP_SH2_0_255(out0, out1); \ 70cabdff1aSopenharmony_ci} 71cabdff1aSopenharmony_ci 72cabdff1aSopenharmony_ci#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ 73cabdff1aSopenharmony_ci vec3, wgt, rnd, offset, out0, out1, \ 74cabdff1aSopenharmony_ci out2, out3) \ 75cabdff1aSopenharmony_ci{ \ 76cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset, \ 77cabdff1aSopenharmony_ci out0, out1); \ 78cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset, \ 79cabdff1aSopenharmony_ci out2, out3); \ 80cabdff1aSopenharmony_ci} 81cabdff1aSopenharmony_ci 82cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, 83cabdff1aSopenharmony_ci int32_t src_stride, 84cabdff1aSopenharmony_ci int16_t *src1_ptr, 85cabdff1aSopenharmony_ci int32_t src2_stride, 86cabdff1aSopenharmony_ci uint8_t *dst, 87cabdff1aSopenharmony_ci int32_t dst_stride, 88cabdff1aSopenharmony_ci int32_t height, 89cabdff1aSopenharmony_ci int32_t weight0, 90cabdff1aSopenharmony_ci int32_t weight1, 91cabdff1aSopenharmony_ci int32_t offset0, 92cabdff1aSopenharmony_ci int32_t offset1, 93cabdff1aSopenharmony_ci int32_t rnd_val) 94cabdff1aSopenharmony_ci{ 95cabdff1aSopenharmony_ci uint32_t loop_cnt, tp0, tp1, tp2, tp3; 96cabdff1aSopenharmony_ci uint64_t tpd0, tpd1, tpd2, tpd3; 97cabdff1aSopenharmony_ci int32_t offset, weight; 98cabdff1aSopenharmony_ci v16u8 out0, out1; 99cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 100cabdff1aSopenharmony_ci v16i8 
src0 = { 0 }, src1 = { 0 }; 101cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 102cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, weight_vec; 103cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, offset_vec, rnd_vec; 104cabdff1aSopenharmony_ci 105cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 106cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 107cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 108cabdff1aSopenharmony_ci 109cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 110cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 111cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 112cabdff1aSopenharmony_ci 113cabdff1aSopenharmony_ci if (2 == height) { 114cabdff1aSopenharmony_ci LW2(src0_ptr, src_stride, tp0, tp1); 115cabdff1aSopenharmony_ci INSERT_W2_SB(tp0, tp1, src0); 116cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tpd0, tpd1); 117cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in0); 118cabdff1aSopenharmony_ci 119cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_ilvr_b(zero, src0); 120cabdff1aSopenharmony_ci dst0 <<= 6; 121cabdff1aSopenharmony_ci 122cabdff1aSopenharmony_ci ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); 123cabdff1aSopenharmony_ci dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec); 124cabdff1aSopenharmony_ci dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec); 125cabdff1aSopenharmony_ci SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 126cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 127cabdff1aSopenharmony_ci CLIP_SH_0_255(dst0); 128cabdff1aSopenharmony_ci out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); 129cabdff1aSopenharmony_ci ST_W2(out0, 0, 1, dst, dst_stride); 130cabdff1aSopenharmony_ci } else if (4 == height) { 131cabdff1aSopenharmony_ci LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 132cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); 133cabdff1aSopenharmony_ci 
LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); 134cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in0); 135cabdff1aSopenharmony_ci INSERT_D2_SH(tpd2, tpd3, in1); 136cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 137cabdff1aSopenharmony_ci SLLI_2V(dst0, dst1, 6); 138cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec, 139cabdff1aSopenharmony_ci offset_vec, dst0, dst1); 140cabdff1aSopenharmony_ci out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 141cabdff1aSopenharmony_ci ST_W4(out0, 0, 1, 2, 3, dst, dst_stride); 142cabdff1aSopenharmony_ci } else if (0 == height % 8) { 143cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 144cabdff1aSopenharmony_ci LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 145cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 146cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); 147cabdff1aSopenharmony_ci LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 148cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 149cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src1); 150cabdff1aSopenharmony_ci LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); 151cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 152cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in0); 153cabdff1aSopenharmony_ci INSERT_D2_SH(tpd2, tpd3, in1); 154cabdff1aSopenharmony_ci LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); 155cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 156cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in2); 157cabdff1aSopenharmony_ci INSERT_D2_SH(tpd2, tpd3, in3); 158cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 159cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 160cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 161cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, 162cabdff1aSopenharmony_ci in3, weight_vec, rnd_vec, offset_vec, 163cabdff1aSopenharmony_ci dst0, 
dst1, dst2, dst3); 164cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 165cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 166cabdff1aSopenharmony_ci dst += (8 * dst_stride); 167cabdff1aSopenharmony_ci } 168cabdff1aSopenharmony_ci } 169cabdff1aSopenharmony_ci} 170cabdff1aSopenharmony_ci 171cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, 172cabdff1aSopenharmony_ci int32_t src_stride, 173cabdff1aSopenharmony_ci int16_t *src1_ptr, 174cabdff1aSopenharmony_ci int32_t src2_stride, 175cabdff1aSopenharmony_ci uint8_t *dst, 176cabdff1aSopenharmony_ci int32_t dst_stride, 177cabdff1aSopenharmony_ci int32_t height, 178cabdff1aSopenharmony_ci int32_t weight0, 179cabdff1aSopenharmony_ci int32_t weight1, 180cabdff1aSopenharmony_ci int32_t offset0, 181cabdff1aSopenharmony_ci int32_t offset1, 182cabdff1aSopenharmony_ci int32_t rnd_val) 183cabdff1aSopenharmony_ci{ 184cabdff1aSopenharmony_ci uint32_t loop_cnt; 185cabdff1aSopenharmony_ci int32_t offset, weight; 186cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 187cabdff1aSopenharmony_ci v16u8 out0, out1; 188cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 189cabdff1aSopenharmony_ci v16i8 src0 = { 0 }, src1 = { 0 }; 190cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 191cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 192cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 193cabdff1aSopenharmony_ci 194cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 195cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 196cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 197cabdff1aSopenharmony_ci 198cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 199cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 200cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 201cabdff1aSopenharmony_ci 202cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 203cabdff1aSopenharmony_ci 
LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 204cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 205cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 206cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 207cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 208cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 209cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 210cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 211cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 212cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, 213cabdff1aSopenharmony_ci in0, in1, in2, in3, 214cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 215cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 216cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 217cabdff1aSopenharmony_ci ST_W2(out0, 0, 2, dst, dst_stride); 218cabdff1aSopenharmony_ci ST_H2(out0, 2, 6, dst + 4, dst_stride); 219cabdff1aSopenharmony_ci ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 220cabdff1aSopenharmony_ci ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 221cabdff1aSopenharmony_ci dst += (4 * dst_stride); 222cabdff1aSopenharmony_ci } 223cabdff1aSopenharmony_ci} 224cabdff1aSopenharmony_ci 225cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, 226cabdff1aSopenharmony_ci int32_t src_stride, 227cabdff1aSopenharmony_ci int16_t *src1_ptr, 228cabdff1aSopenharmony_ci int32_t src2_stride, 229cabdff1aSopenharmony_ci uint8_t *dst, 230cabdff1aSopenharmony_ci int32_t dst_stride, 231cabdff1aSopenharmony_ci int32_t height, 232cabdff1aSopenharmony_ci int32_t weight0, 233cabdff1aSopenharmony_ci int32_t weight1, 234cabdff1aSopenharmony_ci int32_t offset0, 235cabdff1aSopenharmony_ci int32_t offset1, 236cabdff1aSopenharmony_ci int32_t rnd_val) 237cabdff1aSopenharmony_ci{ 238cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 239cabdff1aSopenharmony_ci int32_t offset, weight; 
240cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 241cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 242cabdff1aSopenharmony_ci v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }; 243cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 244cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 245cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 246cabdff1aSopenharmony_ci 247cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 248cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 249cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 250cabdff1aSopenharmony_ci 251cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 252cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 253cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_ci if (2 == height) { 256cabdff1aSopenharmony_ci LD2(src0_ptr, src_stride, tp0, tp1); 257cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 258cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 259cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 260cabdff1aSopenharmony_ci SLLI_2V(dst0, dst1, 6); 261cabdff1aSopenharmony_ci 262cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, 263cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 264cabdff1aSopenharmony_ci dst0, dst1); 265cabdff1aSopenharmony_ci 266cabdff1aSopenharmony_ci out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 267cabdff1aSopenharmony_ci ST_D2(out0, 0, 1, dst, dst_stride); 268cabdff1aSopenharmony_ci } else if (6 == height) { 269cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 270cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 271cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 272cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 273cabdff1aSopenharmony_ci LD2(src0_ptr, src_stride, tp0, tp1); 274cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src2); 275cabdff1aSopenharmony_ci 
ILVRL_B2_SH(zero, src0, dst0, dst1); 276cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 277cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 278cabdff1aSopenharmony_ci LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 279cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 280cabdff1aSopenharmony_ci SLLI_2V(dst4, dst5, 6); 281cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3, 282cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, dst0, dst1, 283cabdff1aSopenharmony_ci dst2, dst3); 284cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, 285cabdff1aSopenharmony_ci offset_vec, dst4, dst5); 286cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 287cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 288cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 289cabdff1aSopenharmony_ci } else if (0 == height % 4) { 290cabdff1aSopenharmony_ci uint32_t loop_cnt; 291cabdff1aSopenharmony_ci 292cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 293cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 294cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 295cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 296cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 297cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 298cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 299cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 300cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 301cabdff1aSopenharmony_ci 302cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 303cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, 304cabdff1aSopenharmony_ci in3, weight_vec, rnd_vec, offset_vec, 305cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 
306cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 307cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 308cabdff1aSopenharmony_ci dst += (4 * dst_stride); 309cabdff1aSopenharmony_ci } 310cabdff1aSopenharmony_ci } 311cabdff1aSopenharmony_ci} 312cabdff1aSopenharmony_ci 313cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, 314cabdff1aSopenharmony_ci int32_t src_stride, 315cabdff1aSopenharmony_ci int16_t *src1_ptr, 316cabdff1aSopenharmony_ci int32_t src2_stride, 317cabdff1aSopenharmony_ci uint8_t *dst, 318cabdff1aSopenharmony_ci int32_t dst_stride, 319cabdff1aSopenharmony_ci int32_t height, 320cabdff1aSopenharmony_ci int32_t weight0, 321cabdff1aSopenharmony_ci int32_t weight1, 322cabdff1aSopenharmony_ci int32_t offset0, 323cabdff1aSopenharmony_ci int32_t offset1, 324cabdff1aSopenharmony_ci int32_t rnd_val) 325cabdff1aSopenharmony_ci{ 326cabdff1aSopenharmony_ci uint32_t loop_cnt; 327cabdff1aSopenharmony_ci int32_t offset, weight; 328cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 329cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 330cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 331cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 332cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 333cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 334cabdff1aSopenharmony_ci 335cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 336cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 337cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 338cabdff1aSopenharmony_ci 339cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 340cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 341cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 342cabdff1aSopenharmony_ci 343cabdff1aSopenharmony_ci for (loop_cnt = (16 >> 2); loop_cnt--;) { 344cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 
345cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 346cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 347cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 348cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 349cabdff1aSopenharmony_ci 350cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 351cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 352cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 353cabdff1aSopenharmony_ci 354cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 355cabdff1aSopenharmony_ci ILVL_W2_SB(src1, src0, src3, src2, src0, src1); 356cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5); 357cabdff1aSopenharmony_ci 358cabdff1aSopenharmony_ci dst4 <<= 6; 359cabdff1aSopenharmony_ci dst5 <<= 6; 360cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3, 361cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, dst0, dst1, 362cabdff1aSopenharmony_ci dst2, dst3); 363cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, 364cabdff1aSopenharmony_ci offset_vec, dst4, dst5); 365cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 366cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 367cabdff1aSopenharmony_ci ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 368cabdff1aSopenharmony_ci dst += (4 * dst_stride); 369cabdff1aSopenharmony_ci } 370cabdff1aSopenharmony_ci} 371cabdff1aSopenharmony_ci 372cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr, 373cabdff1aSopenharmony_ci int32_t src_stride, 374cabdff1aSopenharmony_ci int16_t *src1_ptr, 375cabdff1aSopenharmony_ci int32_t src2_stride, 376cabdff1aSopenharmony_ci uint8_t *dst, 377cabdff1aSopenharmony_ci int32_t dst_stride, 378cabdff1aSopenharmony_ci int32_t height, 379cabdff1aSopenharmony_ci int32_t weight0, 
380cabdff1aSopenharmony_ci int32_t weight1, 381cabdff1aSopenharmony_ci int32_t offset0, 382cabdff1aSopenharmony_ci int32_t offset1, 383cabdff1aSopenharmony_ci int32_t rnd_val) 384cabdff1aSopenharmony_ci{ 385cabdff1aSopenharmony_ci uint32_t loop_cnt; 386cabdff1aSopenharmony_ci int32_t offset, weight; 387cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 388cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 389cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 390cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 391cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 392cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 393cabdff1aSopenharmony_ci 394cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 395cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 396cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 397cabdff1aSopenharmony_ci 398cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 399cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 400cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 401cabdff1aSopenharmony_ci 402cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 403cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 404cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 405cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 406cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 407cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 408cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1, 409cabdff1aSopenharmony_ci tmp2, tmp3); 410cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5, 411cabdff1aSopenharmony_ci tmp6, tmp7); 412cabdff1aSopenharmony_ci SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); 413cabdff1aSopenharmony_ci SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); 414cabdff1aSopenharmony_ci 
HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5, 415cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, tmp0, tmp1, 416cabdff1aSopenharmony_ci tmp4, tmp5); 417cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7, 418cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, tmp2, tmp3, 419cabdff1aSopenharmony_ci tmp6, tmp7); 420cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1); 421cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3); 422cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, dst_stride); 423cabdff1aSopenharmony_ci dst += (4 * dst_stride); 424cabdff1aSopenharmony_ci } 425cabdff1aSopenharmony_ci} 426cabdff1aSopenharmony_ci 427cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, 428cabdff1aSopenharmony_ci int32_t src_stride, 429cabdff1aSopenharmony_ci int16_t *src1_ptr, 430cabdff1aSopenharmony_ci int32_t src2_stride, 431cabdff1aSopenharmony_ci uint8_t *dst, 432cabdff1aSopenharmony_ci int32_t dst_stride, 433cabdff1aSopenharmony_ci int32_t height, 434cabdff1aSopenharmony_ci int32_t weight0, 435cabdff1aSopenharmony_ci int32_t weight1, 436cabdff1aSopenharmony_ci int32_t offset0, 437cabdff1aSopenharmony_ci int32_t offset1, 438cabdff1aSopenharmony_ci int32_t rnd_val) 439cabdff1aSopenharmony_ci{ 440cabdff1aSopenharmony_ci uint32_t loop_cnt; 441cabdff1aSopenharmony_ci int32_t offset, weight; 442cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5; 443cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 }; 444cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10; 445cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11; 446cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 447cabdff1aSopenharmony_ci 448cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 
449cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 450cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 451cabdff1aSopenharmony_ci 452cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 453cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 454cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 455cabdff1aSopenharmony_ci 456cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 457cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5); 458cabdff1aSopenharmony_ci LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7); 459cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 460cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 461cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 462cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11); 463cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 464cabdff1aSopenharmony_ci 465cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 466cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 467cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5); 468cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src4, dst6, dst7); 469cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src5, dst8, dst9); 470cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11); 471cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 472cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 473cabdff1aSopenharmony_ci SLLI_4V(dst8, dst9, dst10, dst11, 6); 474cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5, 475cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, dst0, dst1, 476cabdff1aSopenharmony_ci dst2, dst3); 477cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6, 478cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, dst4, dst5, 479cabdff1aSopenharmony_ci dst6, dst7); 
480cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10, 481cabdff1aSopenharmony_ci in11, weight_vec, rnd_vec, offset_vec, 482cabdff1aSopenharmony_ci dst8, dst9, dst10, dst11); 483cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 484cabdff1aSopenharmony_ci PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); 485cabdff1aSopenharmony_ci ST_UB4(out0, out1, out3, out4, dst, dst_stride); 486cabdff1aSopenharmony_ci ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride); 487cabdff1aSopenharmony_ci dst += (4 * dst_stride); 488cabdff1aSopenharmony_ci } 489cabdff1aSopenharmony_ci} 490cabdff1aSopenharmony_ci 491cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr, 492cabdff1aSopenharmony_ci int32_t src_stride, 493cabdff1aSopenharmony_ci int16_t *src1_ptr, 494cabdff1aSopenharmony_ci int32_t src2_stride, 495cabdff1aSopenharmony_ci uint8_t *dst, 496cabdff1aSopenharmony_ci int32_t dst_stride, 497cabdff1aSopenharmony_ci int32_t height, 498cabdff1aSopenharmony_ci int32_t weight0, 499cabdff1aSopenharmony_ci int32_t weight1, 500cabdff1aSopenharmony_ci int32_t offset0, 501cabdff1aSopenharmony_ci int32_t offset1, 502cabdff1aSopenharmony_ci int32_t rnd_val) 503cabdff1aSopenharmony_ci{ 504cabdff1aSopenharmony_ci uint32_t loop_cnt; 505cabdff1aSopenharmony_ci int32_t offset, weight; 506cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 507cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 508cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 509cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 510cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 511cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 512cabdff1aSopenharmony_ci 513cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 514cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 515cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 
516cabdff1aSopenharmony_ci 517cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 518cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 519cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 520cabdff1aSopenharmony_ci 521cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 522cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 523cabdff1aSopenharmony_ci src0_ptr += src_stride; 524cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src2, src3); 525cabdff1aSopenharmony_ci src0_ptr += src_stride; 526cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 527cabdff1aSopenharmony_ci src1_ptr += src2_stride; 528cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in4, in5, in6, in7); 529cabdff1aSopenharmony_ci src1_ptr += src2_stride; 530cabdff1aSopenharmony_ci 531cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, tmp0, tmp4); 532cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, tmp1, tmp5); 533cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, tmp2, tmp6); 534cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, tmp3, tmp7); 535cabdff1aSopenharmony_ci SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); 536cabdff1aSopenharmony_ci SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); 537cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3, 538cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, tmp0, tmp4, 539cabdff1aSopenharmony_ci tmp1, tmp5); 540cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7, 541cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, tmp2, tmp6, 542cabdff1aSopenharmony_ci tmp3, tmp7); 543cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1); 544cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3); 545cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 546cabdff1aSopenharmony_ci dst += dst_stride; 547cabdff1aSopenharmony_ci ST_UB2(out2, out3, dst, 16); 548cabdff1aSopenharmony_ci dst += dst_stride; 549cabdff1aSopenharmony_ci } 
550cabdff1aSopenharmony_ci} 551cabdff1aSopenharmony_ci 552cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr, 553cabdff1aSopenharmony_ci int32_t src_stride, 554cabdff1aSopenharmony_ci int16_t *src1_ptr, 555cabdff1aSopenharmony_ci int32_t src2_stride, 556cabdff1aSopenharmony_ci uint8_t *dst, 557cabdff1aSopenharmony_ci int32_t dst_stride, 558cabdff1aSopenharmony_ci int32_t height, 559cabdff1aSopenharmony_ci int32_t weight0, 560cabdff1aSopenharmony_ci int32_t weight1, 561cabdff1aSopenharmony_ci int32_t offset0, 562cabdff1aSopenharmony_ci int32_t offset1, 563cabdff1aSopenharmony_ci int32_t rnd_val) 564cabdff1aSopenharmony_ci{ 565cabdff1aSopenharmony_ci uint32_t loop_cnt; 566cabdff1aSopenharmony_ci int32_t offset, weight; 567cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 568cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 569cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 570cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5; 571cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 572cabdff1aSopenharmony_ci 573cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 574cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 575cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 576cabdff1aSopenharmony_ci 577cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 578cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 579cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 580cabdff1aSopenharmony_ci 581cabdff1aSopenharmony_ci for (loop_cnt = 64; loop_cnt--;) { 582cabdff1aSopenharmony_ci LD_SB3(src0_ptr, 16, src0, src1, src2); 583cabdff1aSopenharmony_ci src0_ptr += src_stride; 584cabdff1aSopenharmony_ci LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5); 585cabdff1aSopenharmony_ci src1_ptr += src2_stride; 586cabdff1aSopenharmony_ci 587cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 588cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 
589cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 590cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 591cabdff1aSopenharmony_ci SLLI_2V(dst4, dst5, 6); 592cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3, 593cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, dst0, dst1, 594cabdff1aSopenharmony_ci dst2, dst3); 595cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, 596cabdff1aSopenharmony_ci offset_vec, dst4, dst5); 597cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 598cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 599cabdff1aSopenharmony_ci ST_UB(out2, dst + 32); 600cabdff1aSopenharmony_ci dst += dst_stride; 601cabdff1aSopenharmony_ci } 602cabdff1aSopenharmony_ci} 603cabdff1aSopenharmony_ci 604cabdff1aSopenharmony_cistatic void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr, 605cabdff1aSopenharmony_ci int32_t src_stride, 606cabdff1aSopenharmony_ci int16_t *src1_ptr, 607cabdff1aSopenharmony_ci int32_t src2_stride, 608cabdff1aSopenharmony_ci uint8_t *dst, 609cabdff1aSopenharmony_ci int32_t dst_stride, 610cabdff1aSopenharmony_ci int32_t height, 611cabdff1aSopenharmony_ci int32_t weight0, 612cabdff1aSopenharmony_ci int32_t weight1, 613cabdff1aSopenharmony_ci int32_t offset0, 614cabdff1aSopenharmony_ci int32_t offset1, 615cabdff1aSopenharmony_ci int32_t rnd_val) 616cabdff1aSopenharmony_ci{ 617cabdff1aSopenharmony_ci uint32_t loop_cnt; 618cabdff1aSopenharmony_ci int32_t offset, weight; 619cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 620cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 621cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 622cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 623cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 624cabdff1aSopenharmony_ci v4i32 offset_vec, weight_vec, rnd_vec; 625cabdff1aSopenharmony_ci 
626cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 627cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 628cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 629cabdff1aSopenharmony_ci 630cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 631cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 632cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 633cabdff1aSopenharmony_ci 634cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 635cabdff1aSopenharmony_ci LD_SB4(src0_ptr, 16, src0, src1, src2, src3); 636cabdff1aSopenharmony_ci src0_ptr += src_stride; 637cabdff1aSopenharmony_ci LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7); 638cabdff1aSopenharmony_ci src1_ptr += src2_stride; 639cabdff1aSopenharmony_ci 640cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1, 641cabdff1aSopenharmony_ci tmp2, tmp3); 642cabdff1aSopenharmony_ci ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5, 643cabdff1aSopenharmony_ci tmp6, tmp7); 644cabdff1aSopenharmony_ci SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6); 645cabdff1aSopenharmony_ci SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6); 646cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3, 647cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, tmp0, tmp4, 648cabdff1aSopenharmony_ci tmp1, tmp5); 649cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7, 650cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, tmp2, tmp6, 651cabdff1aSopenharmony_ci tmp3, tmp7); 652cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1); 653cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3); 654cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, 16); 655cabdff1aSopenharmony_ci dst += dst_stride; 656cabdff1aSopenharmony_ci } 657cabdff1aSopenharmony_ci} 658cabdff1aSopenharmony_ci 659cabdff1aSopenharmony_cistatic 
void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, 660cabdff1aSopenharmony_ci int32_t src_stride, 661cabdff1aSopenharmony_ci int16_t *src1_ptr, 662cabdff1aSopenharmony_ci int32_t src2_stride, 663cabdff1aSopenharmony_ci uint8_t *dst, 664cabdff1aSopenharmony_ci int32_t dst_stride, 665cabdff1aSopenharmony_ci const int8_t *filter, 666cabdff1aSopenharmony_ci int32_t height, 667cabdff1aSopenharmony_ci int32_t weight0, 668cabdff1aSopenharmony_ci int32_t weight1, 669cabdff1aSopenharmony_ci int32_t offset0, 670cabdff1aSopenharmony_ci int32_t offset1, 671cabdff1aSopenharmony_ci int32_t rnd_val) 672cabdff1aSopenharmony_ci{ 673cabdff1aSopenharmony_ci uint32_t loop_cnt; 674cabdff1aSopenharmony_ci int32_t offset, weight, constant; 675cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 676cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 677cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 678cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 679cabdff1aSopenharmony_ci v8i16 dst0, dst1; 680cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 681cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1; 682cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 683cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 684cabdff1aSopenharmony_ci 685cabdff1aSopenharmony_ci src0_ptr -= 3; 686cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 687cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 688cabdff1aSopenharmony_ci 689cabdff1aSopenharmony_ci mask1 = mask0 + 2; 690cabdff1aSopenharmony_ci mask2 = mask0 + 4; 691cabdff1aSopenharmony_ci mask3 = mask0 + 6; 692cabdff1aSopenharmony_ci 693cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 694cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 695cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 696cabdff1aSopenharmony_ci constant = 128 * weight1; 697cabdff1aSopenharmony_ci constant <<= 6; 698cabdff1aSopenharmony_ci offset += constant; 
699cabdff1aSopenharmony_ci 700cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 701cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 702cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 703cabdff1aSopenharmony_ci 704cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 705cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 706cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 707cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 708cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 709cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 710cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 711cabdff1aSopenharmony_ci 712cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, 713cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 714cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 715cabdff1aSopenharmony_ci filt3); 716cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, 717cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 718cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 719cabdff1aSopenharmony_ci filt3); 720cabdff1aSopenharmony_ci 721cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, 722cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 723cabdff1aSopenharmony_ci out0, out1); 724cabdff1aSopenharmony_ci 725cabdff1aSopenharmony_ci out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); 726cabdff1aSopenharmony_ci ST_W4(out0, 0, 1, 2, 3, dst, dst_stride); 727cabdff1aSopenharmony_ci dst += (4 * dst_stride); 728cabdff1aSopenharmony_ci } 729cabdff1aSopenharmony_ci} 730cabdff1aSopenharmony_ci 731cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr, 732cabdff1aSopenharmony_ci int32_t src_stride, 733cabdff1aSopenharmony_ci int16_t *src1_ptr, 734cabdff1aSopenharmony_ci int32_t 
src2_stride, 735cabdff1aSopenharmony_ci uint8_t *dst, 736cabdff1aSopenharmony_ci int32_t dst_stride, 737cabdff1aSopenharmony_ci const int8_t *filter, 738cabdff1aSopenharmony_ci int32_t height, 739cabdff1aSopenharmony_ci int32_t weight0, 740cabdff1aSopenharmony_ci int32_t weight1, 741cabdff1aSopenharmony_ci int32_t offset0, 742cabdff1aSopenharmony_ci int32_t offset1, 743cabdff1aSopenharmony_ci int32_t rnd_val) 744cabdff1aSopenharmony_ci{ 745cabdff1aSopenharmony_ci uint32_t loop_cnt; 746cabdff1aSopenharmony_ci int32_t offset, weight, constant; 747cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 748cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 749cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 750cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 751cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 752cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 753cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2, out3; 754cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 755cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 756cabdff1aSopenharmony_ci 757cabdff1aSopenharmony_ci src0_ptr -= 3; 758cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 759cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 760cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 761cabdff1aSopenharmony_ci constant = 128 * weight1; 762cabdff1aSopenharmony_ci constant <<= 6; 763cabdff1aSopenharmony_ci offset += constant; 764cabdff1aSopenharmony_ci 765cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 766cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 767cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 768cabdff1aSopenharmony_ci 769cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 770cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 771cabdff1aSopenharmony_ci 772cabdff1aSopenharmony_ci mask1 = mask0 + 2; 773cabdff1aSopenharmony_ci mask2 = mask0 + 
4; 774cabdff1aSopenharmony_ci mask3 = mask0 + 6; 775cabdff1aSopenharmony_ci 776cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 777cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 778cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 779cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 780cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 781cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 782cabdff1aSopenharmony_ci 783cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 784cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 785cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 786cabdff1aSopenharmony_ci filt3); 787cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 788cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 789cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 790cabdff1aSopenharmony_ci filt3); 791cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 792cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 793cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 794cabdff1aSopenharmony_ci filt3); 795cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 796cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 797cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 798cabdff1aSopenharmony_ci filt3); 799cabdff1aSopenharmony_ci 800cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 801cabdff1aSopenharmony_ci in0, in1, in2, in3, 802cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 803cabdff1aSopenharmony_ci out0, out1, out2, out3); 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 806cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 
807cabdff1aSopenharmony_ci dst += (4 * dst_stride); 808cabdff1aSopenharmony_ci } 809cabdff1aSopenharmony_ci} 810cabdff1aSopenharmony_ci 811cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr, 812cabdff1aSopenharmony_ci int32_t src_stride, 813cabdff1aSopenharmony_ci int16_t *src1_ptr, 814cabdff1aSopenharmony_ci int32_t src2_stride, 815cabdff1aSopenharmony_ci uint8_t *dst, 816cabdff1aSopenharmony_ci int32_t dst_stride, 817cabdff1aSopenharmony_ci const int8_t *filter, 818cabdff1aSopenharmony_ci int32_t height, 819cabdff1aSopenharmony_ci int32_t weight0, 820cabdff1aSopenharmony_ci int32_t weight1, 821cabdff1aSopenharmony_ci int32_t offset0, 822cabdff1aSopenharmony_ci int32_t offset1, 823cabdff1aSopenharmony_ci int32_t rnd_val) 824cabdff1aSopenharmony_ci{ 825cabdff1aSopenharmony_ci uint32_t loop_cnt; 826cabdff1aSopenharmony_ci int32_t offset, weight, constant; 827cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3; 828cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 829cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3; 830cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec; 831cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 832cabdff1aSopenharmony_ci 833cabdff1aSopenharmony_ci src0_ptr -= 3; 834cabdff1aSopenharmony_ci 835cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 836cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 837cabdff1aSopenharmony_ci constant = 128 * weight1; 838cabdff1aSopenharmony_ci constant <<= 6; 839cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 840cabdff1aSopenharmony_ci offset += constant; 841cabdff1aSopenharmony_ci 842cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 843cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 844cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 845cabdff1aSopenharmony_ci 
846cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 847cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 848cabdff1aSopenharmony_ci 849cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 850cabdff1aSopenharmony_ci mask1 = mask0 + 2; 851cabdff1aSopenharmony_ci mask2 = mask0 + 4; 852cabdff1aSopenharmony_ci mask3 = mask0 + 6; 853cabdff1aSopenharmony_ci mask4 = LD_SB(&ff_hevc_mask_arr[16]); 854cabdff1aSopenharmony_ci mask5 = mask4 + 2; 855cabdff1aSopenharmony_ci mask6 = mask4 + 4; 856cabdff1aSopenharmony_ci mask7 = mask4 + 6; 857cabdff1aSopenharmony_ci 858cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 859cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 860cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 861cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 862cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 863cabdff1aSopenharmony_ci vec3); 864cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 865cabdff1aSopenharmony_ci filt3); 866cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 867cabdff1aSopenharmony_ci vec3); 868cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 869cabdff1aSopenharmony_ci filt3); 870cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 871cabdff1aSopenharmony_ci vec3); 872cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 873cabdff1aSopenharmony_ci filt3); 874cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 875cabdff1aSopenharmony_ci vec3); 876cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 877cabdff1aSopenharmony_ci filt3); 878cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, 
dst2, dst3, in0, in1, in2, in3, 879cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, out0, out1, out2, 880cabdff1aSopenharmony_ci out3); 881cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 882cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 883cabdff1aSopenharmony_ci 884cabdff1aSopenharmony_ci LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3); 885cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 886cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3); 887cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 888cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 889cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 890cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 891cabdff1aSopenharmony_ci vec3); 892cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 893cabdff1aSopenharmony_ci filt3); 894cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 895cabdff1aSopenharmony_ci vec3); 896cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 897cabdff1aSopenharmony_ci filt3); 898cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, 899cabdff1aSopenharmony_ci offset_vec, out0, out1); 900cabdff1aSopenharmony_ci out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); 901cabdff1aSopenharmony_ci ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride); 902cabdff1aSopenharmony_ci dst += (4 * dst_stride); 903cabdff1aSopenharmony_ci } 904cabdff1aSopenharmony_ci} 905cabdff1aSopenharmony_ci 906cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, 907cabdff1aSopenharmony_ci int32_t src_stride, 908cabdff1aSopenharmony_ci int16_t *src1_ptr, 909cabdff1aSopenharmony_ci int32_t src2_stride, 910cabdff1aSopenharmony_ci uint8_t *dst, 911cabdff1aSopenharmony_ci int32_t 
dst_stride, 912cabdff1aSopenharmony_ci const int8_t *filter, 913cabdff1aSopenharmony_ci int32_t height, 914cabdff1aSopenharmony_ci int32_t weight0, 915cabdff1aSopenharmony_ci int32_t weight1, 916cabdff1aSopenharmony_ci int32_t offset0, 917cabdff1aSopenharmony_ci int32_t offset1, 918cabdff1aSopenharmony_ci int32_t rnd_val) 919cabdff1aSopenharmony_ci{ 920cabdff1aSopenharmony_ci uint32_t loop_cnt; 921cabdff1aSopenharmony_ci int32_t offset, weight, constant; 922cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 923cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 924cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 925cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 926cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2, out3; 927cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 928cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 929cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 930cabdff1aSopenharmony_ci v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 931cabdff1aSopenharmony_ci 932cabdff1aSopenharmony_ci src0_ptr -= 3; 933cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 934cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 935cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 936cabdff1aSopenharmony_ci constant = 128 * weight1; 937cabdff1aSopenharmony_ci constant <<= 6; 938cabdff1aSopenharmony_ci offset += constant; 939cabdff1aSopenharmony_ci 940cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 941cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 942cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 943cabdff1aSopenharmony_ci 944cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 945cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 946cabdff1aSopenharmony_ci 947cabdff1aSopenharmony_ci mask1 = mask0 + 2; 948cabdff1aSopenharmony_ci mask2 = mask0 + 4; 949cabdff1aSopenharmony_ci mask3 = mask0 + 6; 
950cabdff1aSopenharmony_ci 951cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 952cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 8, src0, src1); 953cabdff1aSopenharmony_ci src0_ptr += src_stride; 954cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 8, src2, src3); 955cabdff1aSopenharmony_ci src0_ptr += src_stride; 956cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in0, in1); 957cabdff1aSopenharmony_ci src1_ptr += src2_stride; 958cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in2, in3); 959cabdff1aSopenharmony_ci src1_ptr += src2_stride; 960cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 961cabdff1aSopenharmony_ci 962cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 963cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 964cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 965cabdff1aSopenharmony_ci filt3); 966cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 967cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 968cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 969cabdff1aSopenharmony_ci filt3); 970cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 971cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 972cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 973cabdff1aSopenharmony_ci filt3); 974cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 975cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 976cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 977cabdff1aSopenharmony_ci filt3); 978cabdff1aSopenharmony_ci 979cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 980cabdff1aSopenharmony_ci in0, in1, in2, in3, 981cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 982cabdff1aSopenharmony_ci out0, out1, out2, out3); 983cabdff1aSopenharmony_ci 984cabdff1aSopenharmony_ci 
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 985cabdff1aSopenharmony_ci ST_SH2(out0, out1, dst, dst_stride); 986cabdff1aSopenharmony_ci dst += (2 * dst_stride); 987cabdff1aSopenharmony_ci } 988cabdff1aSopenharmony_ci} 989cabdff1aSopenharmony_ci 990cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, 991cabdff1aSopenharmony_ci int32_t src_stride, 992cabdff1aSopenharmony_ci int16_t *src1_ptr, 993cabdff1aSopenharmony_ci int32_t src2_stride, 994cabdff1aSopenharmony_ci uint8_t *dst, 995cabdff1aSopenharmony_ci int32_t dst_stride, 996cabdff1aSopenharmony_ci const int8_t *filter, 997cabdff1aSopenharmony_ci int32_t height, 998cabdff1aSopenharmony_ci int32_t weight0, 999cabdff1aSopenharmony_ci int32_t weight1, 1000cabdff1aSopenharmony_ci int32_t offset0, 1001cabdff1aSopenharmony_ci int32_t offset1, 1002cabdff1aSopenharmony_ci int32_t rnd_val) 1003cabdff1aSopenharmony_ci{ 1004cabdff1aSopenharmony_ci uint32_t loop_cnt; 1005cabdff1aSopenharmony_ci uint64_t dst_val0; 1006cabdff1aSopenharmony_ci int32_t offset, weight, constant; 1007cabdff1aSopenharmony_ci v16i8 src0, src1; 1008cabdff1aSopenharmony_ci v8i16 in0, in1, in2; 1009cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1010cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1011cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 1012cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2; 1013cabdff1aSopenharmony_ci v4i32 dst2_r, dst2_l; 1014cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2; 1015cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 1016cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1017cabdff1aSopenharmony_ci 1018cabdff1aSopenharmony_ci src0_ptr = src0_ptr - 3; 1019cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1020cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1021cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1022cabdff1aSopenharmony_ci constant = 128 * weight1; 
1023cabdff1aSopenharmony_ci constant <<= 6; 1024cabdff1aSopenharmony_ci offset += constant; 1025cabdff1aSopenharmony_ci 1026cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1027cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1028cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1029cabdff1aSopenharmony_ci 1030cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1031cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1032cabdff1aSopenharmony_ci 1033cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1034cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1035cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1036cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1037cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1038cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1039cabdff1aSopenharmony_ci mask7 = mask0 + 14; 1040cabdff1aSopenharmony_ci 1041cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 1042cabdff1aSopenharmony_ci src0_ptr += src_stride; 1043cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in0, in1); 1044cabdff1aSopenharmony_ci in2 = LD_SH(src1_ptr + 16); 1045cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1046cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 1047cabdff1aSopenharmony_ci 1048cabdff1aSopenharmony_ci for (loop_cnt = 31; loop_cnt--;) { 1049cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1050cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1051cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1052cabdff1aSopenharmony_ci filt3); 1053cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 1054cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1055cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1056cabdff1aSopenharmony_ci filt3); 1057cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1058cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1059cabdff1aSopenharmony_ci 
dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1060cabdff1aSopenharmony_ci filt3); 1061cabdff1aSopenharmony_ci 1062cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, 1063cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1064cabdff1aSopenharmony_ci out0, out1); 1065cabdff1aSopenharmony_ci 1066cabdff1aSopenharmony_ci ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l); 1067cabdff1aSopenharmony_ci dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, 1068cabdff1aSopenharmony_ci (v8i16) weight_vec); 1069cabdff1aSopenharmony_ci dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, 1070cabdff1aSopenharmony_ci (v8i16) weight_vec); 1071cabdff1aSopenharmony_ci SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); 1072cabdff1aSopenharmony_ci out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); 1073cabdff1aSopenharmony_ci CLIP_SH_0_255(out2); 1074cabdff1aSopenharmony_ci 1075cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 1076cabdff1aSopenharmony_ci src0_ptr += src_stride; 1077cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in0, in1); 1078cabdff1aSopenharmony_ci in2 = LD_SH(src1_ptr + 16); 1079cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1080cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 1081cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); 1082cabdff1aSopenharmony_ci dst_val0 = __msa_copy_u_d((v2i64) out2, 0); 1083cabdff1aSopenharmony_ci ST_SH(out0, dst); 1084cabdff1aSopenharmony_ci SD(dst_val0, dst + 16); 1085cabdff1aSopenharmony_ci dst += dst_stride; 1086cabdff1aSopenharmony_ci } 1087cabdff1aSopenharmony_ci 1088cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1089cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1090cabdff1aSopenharmony_ci filt3); 1091cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 1092cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, 
filt0, filt1, filt2, 1093cabdff1aSopenharmony_ci filt3); 1094cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1095cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1096cabdff1aSopenharmony_ci filt3); 1097cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec, 1098cabdff1aSopenharmony_ci out0, out1); 1099cabdff1aSopenharmony_ci ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l); 1100cabdff1aSopenharmony_ci dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec); 1101cabdff1aSopenharmony_ci dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec); 1102cabdff1aSopenharmony_ci SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); 1103cabdff1aSopenharmony_ci out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); 1104cabdff1aSopenharmony_ci CLIP_SH_0_255(out2); 1105cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); 1106cabdff1aSopenharmony_ci dst_val0 = __msa_copy_u_d((v2i64) out2, 0); 1107cabdff1aSopenharmony_ci ST_SH(out0, dst); 1108cabdff1aSopenharmony_ci SD(dst_val0, dst + 16); 1109cabdff1aSopenharmony_ci dst += dst_stride; 1110cabdff1aSopenharmony_ci} 1111cabdff1aSopenharmony_ci 1112cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, 1113cabdff1aSopenharmony_ci int32_t src_stride, 1114cabdff1aSopenharmony_ci int16_t *src1_ptr, 1115cabdff1aSopenharmony_ci int32_t src2_stride, 1116cabdff1aSopenharmony_ci uint8_t *dst, 1117cabdff1aSopenharmony_ci int32_t dst_stride, 1118cabdff1aSopenharmony_ci const int8_t *filter, 1119cabdff1aSopenharmony_ci int32_t height, 1120cabdff1aSopenharmony_ci int32_t weight0, 1121cabdff1aSopenharmony_ci int32_t weight1, 1122cabdff1aSopenharmony_ci int32_t offset0, 1123cabdff1aSopenharmony_ci int32_t offset1, 1124cabdff1aSopenharmony_ci int32_t rnd_val) 1125cabdff1aSopenharmony_ci{ 1126cabdff1aSopenharmony_ci uint32_t loop_cnt; 1127cabdff1aSopenharmony_ci 
int32_t offset, weight, constant; 1128cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 1129cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1130cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1131cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1132cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1133cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 1134cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 1135cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2, out3; 1136cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 1137cabdff1aSopenharmony_ci 1138cabdff1aSopenharmony_ci src0_ptr -= 3; 1139cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1140cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1141cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1142cabdff1aSopenharmony_ci constant = 128 * weight1; 1143cabdff1aSopenharmony_ci constant <<= 6; 1144cabdff1aSopenharmony_ci offset += constant; 1145cabdff1aSopenharmony_ci 1146cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1147cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1148cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1149cabdff1aSopenharmony_ci 1150cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1151cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1152cabdff1aSopenharmony_ci 1153cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1154cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1155cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1156cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1157cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1158cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1159cabdff1aSopenharmony_ci mask7 = mask0 + 14; 1160cabdff1aSopenharmony_ci 1161cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1162cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 1163cabdff1aSopenharmony_ci src2 = LD_SB(src0_ptr + 24); 
1164cabdff1aSopenharmony_ci src0_ptr += src_stride; 1165cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 1166cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1167cabdff1aSopenharmony_ci 1168cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1169cabdff1aSopenharmony_ci 1170cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1171cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1172cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1173cabdff1aSopenharmony_ci filt3); 1174cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 1175cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1176cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1177cabdff1aSopenharmony_ci filt3); 1178cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1179cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1180cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1181cabdff1aSopenharmony_ci filt3); 1182cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1183cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1184cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1185cabdff1aSopenharmony_ci filt3); 1186cabdff1aSopenharmony_ci 1187cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 1188cabdff1aSopenharmony_ci in0, in1, in2, in3, 1189cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1190cabdff1aSopenharmony_ci out0, out1, out2, out3); 1191cabdff1aSopenharmony_ci 1192cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 1193cabdff1aSopenharmony_ci ST_SH2(out0, out1, dst, 16); 1194cabdff1aSopenharmony_ci dst += dst_stride; 1195cabdff1aSopenharmony_ci } 1196cabdff1aSopenharmony_ci} 1197cabdff1aSopenharmony_ci 1198cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr, 
1199cabdff1aSopenharmony_ci int32_t src_stride, 1200cabdff1aSopenharmony_ci int16_t *src1_ptr, 1201cabdff1aSopenharmony_ci int32_t src2_stride, 1202cabdff1aSopenharmony_ci uint8_t *dst, 1203cabdff1aSopenharmony_ci int32_t dst_stride, 1204cabdff1aSopenharmony_ci const int8_t *filter, 1205cabdff1aSopenharmony_ci int32_t height, 1206cabdff1aSopenharmony_ci int32_t weight0, 1207cabdff1aSopenharmony_ci int32_t weight1, 1208cabdff1aSopenharmony_ci int32_t offset0, 1209cabdff1aSopenharmony_ci int32_t offset1, 1210cabdff1aSopenharmony_ci int32_t rnd_val) 1211cabdff1aSopenharmony_ci{ 1212cabdff1aSopenharmony_ci uint32_t loop_cnt; 1213cabdff1aSopenharmony_ci int32_t offset, weight, constant; 1214cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 1215cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1216cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1217cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1218cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1219cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 1220cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 1221cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2, out3; 1222cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 1223cabdff1aSopenharmony_ci 1224cabdff1aSopenharmony_ci src0_ptr -= 3; 1225cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1226cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1227cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1228cabdff1aSopenharmony_ci constant = 128 * weight1; 1229cabdff1aSopenharmony_ci constant <<= 6; 1230cabdff1aSopenharmony_ci offset += constant; 1231cabdff1aSopenharmony_ci 1232cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1233cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1234cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1235cabdff1aSopenharmony_ci 1236cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 
1237cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1238cabdff1aSopenharmony_ci 1239cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1240cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1241cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1242cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1243cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1244cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1245cabdff1aSopenharmony_ci mask7 = mask0 + 14; 1246cabdff1aSopenharmony_ci 1247cabdff1aSopenharmony_ci for (loop_cnt = 64; loop_cnt--;) { 1248cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 1249cabdff1aSopenharmony_ci src2 = LD_SB(src0_ptr + 24); 1250cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 1251cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1252cabdff1aSopenharmony_ci LD_SB2(src0_ptr + 32, 8, src3, src4); 1253cabdff1aSopenharmony_ci src0_ptr += src_stride; 1254cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 1255cabdff1aSopenharmony_ci 1256cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1257cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1258cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1259cabdff1aSopenharmony_ci filt3); 1260cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 1261cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1262cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1263cabdff1aSopenharmony_ci filt3); 1264cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1265cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1266cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1267cabdff1aSopenharmony_ci filt3); 1268cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1269cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1270cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, 
vec3, filt0, filt1, filt2, 1271cabdff1aSopenharmony_ci filt3); 1272cabdff1aSopenharmony_ci 1273cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3, 1274cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1275cabdff1aSopenharmony_ci out0, out1, out2, out3); 1276cabdff1aSopenharmony_ci 1277cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 1278cabdff1aSopenharmony_ci ST_SH2(out0, out1, dst, 16); 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 32, 8, in2, in3); 1281cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1282cabdff1aSopenharmony_ci 1283cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1284cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1285cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1286cabdff1aSopenharmony_ci filt3); 1287cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1288cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1289cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1290cabdff1aSopenharmony_ci filt3); 1291cabdff1aSopenharmony_ci 1292cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3, 1293cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1294cabdff1aSopenharmony_ci out0, out1); 1295cabdff1aSopenharmony_ci 1296cabdff1aSopenharmony_ci out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); 1297cabdff1aSopenharmony_ci ST_SH(out0, dst + 32); 1298cabdff1aSopenharmony_ci dst += dst_stride; 1299cabdff1aSopenharmony_ci } 1300cabdff1aSopenharmony_ci} 1301cabdff1aSopenharmony_ci 1302cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr, 1303cabdff1aSopenharmony_ci int32_t src_stride, 1304cabdff1aSopenharmony_ci int16_t *src1_ptr, 1305cabdff1aSopenharmony_ci int32_t src2_stride, 1306cabdff1aSopenharmony_ci uint8_t *dst, 1307cabdff1aSopenharmony_ci int32_t dst_stride, 
1308cabdff1aSopenharmony_ci const int8_t *filter, 1309cabdff1aSopenharmony_ci int32_t height, 1310cabdff1aSopenharmony_ci int32_t weight0, 1311cabdff1aSopenharmony_ci int32_t weight1, 1312cabdff1aSopenharmony_ci int32_t offset0, 1313cabdff1aSopenharmony_ci int32_t offset1, 1314cabdff1aSopenharmony_ci int32_t rnd_val) 1315cabdff1aSopenharmony_ci{ 1316cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 1317cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1318cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 1319cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1320cabdff1aSopenharmony_ci int32_t offset, weight, constant; 1321cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 1322cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1323cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1324cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1325cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1326cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 1327cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 1328cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2, out3; 1329cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 1330cabdff1aSopenharmony_ci 1331cabdff1aSopenharmony_ci src0_ptr -= 3; 1332cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1333cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1334cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1335cabdff1aSopenharmony_ci constant = 128 * weight1; 1336cabdff1aSopenharmony_ci constant <<= 6; 1337cabdff1aSopenharmony_ci offset += constant; 1338cabdff1aSopenharmony_ci 1339cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1340cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1341cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1342cabdff1aSopenharmony_ci 1343cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1344cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 
1345cabdff1aSopenharmony_ci 1346cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1347cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1348cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1349cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1350cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1351cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1352cabdff1aSopenharmony_ci mask7 = mask0 + 14; 1353cabdff1aSopenharmony_ci 1354cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1355cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 1356cabdff1aSopenharmony_ci dst_tmp = dst; 1357cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 1358cabdff1aSopenharmony_ci 1359cabdff1aSopenharmony_ci for (cnt = 2; cnt--;) { 1360cabdff1aSopenharmony_ci LD_SB2(src0_ptr_tmp, 16, src0, src1); 1361cabdff1aSopenharmony_ci src2 = LD_SB(src0_ptr_tmp + 24); 1362cabdff1aSopenharmony_ci src0_ptr_tmp += 32; 1363cabdff1aSopenharmony_ci LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3); 1364cabdff1aSopenharmony_ci src1_ptr_tmp += 32; 1365cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1366cabdff1aSopenharmony_ci 1367cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1368cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1369cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1370cabdff1aSopenharmony_ci filt2, filt3); 1371cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 1372cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1373cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1374cabdff1aSopenharmony_ci filt2, filt3); 1375cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1376cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1377cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1378cabdff1aSopenharmony_ci filt2, filt3); 1379cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1380cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 
1381cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1382cabdff1aSopenharmony_ci filt2, filt3); 1383cabdff1aSopenharmony_ci 1384cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 1385cabdff1aSopenharmony_ci in0, in1, in2, in3, 1386cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1387cabdff1aSopenharmony_ci out0, out1, out2, out3); 1388cabdff1aSopenharmony_ci 1389cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 1390cabdff1aSopenharmony_ci ST_SH2(out0, out1, dst_tmp, 16); 1391cabdff1aSopenharmony_ci dst_tmp += 32; 1392cabdff1aSopenharmony_ci } 1393cabdff1aSopenharmony_ci 1394cabdff1aSopenharmony_ci src0_ptr += src_stride; 1395cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1396cabdff1aSopenharmony_ci dst += dst_stride; 1397cabdff1aSopenharmony_ci 1398cabdff1aSopenharmony_ci } 1399cabdff1aSopenharmony_ci} 1400cabdff1aSopenharmony_ci 1401cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr, 1402cabdff1aSopenharmony_ci int32_t src_stride, 1403cabdff1aSopenharmony_ci int16_t *src1_ptr, 1404cabdff1aSopenharmony_ci int32_t src2_stride, 1405cabdff1aSopenharmony_ci uint8_t *dst, 1406cabdff1aSopenharmony_ci int32_t dst_stride, 1407cabdff1aSopenharmony_ci const int8_t *filter, 1408cabdff1aSopenharmony_ci int32_t height, 1409cabdff1aSopenharmony_ci int32_t weight0, 1410cabdff1aSopenharmony_ci int32_t weight1, 1411cabdff1aSopenharmony_ci int32_t offset0, 1412cabdff1aSopenharmony_ci int32_t offset1, 1413cabdff1aSopenharmony_ci int32_t rnd_val) 1414cabdff1aSopenharmony_ci{ 1415cabdff1aSopenharmony_ci uint32_t loop_cnt; 1416cabdff1aSopenharmony_ci int32_t offset, weight; 1417cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1418cabdff1aSopenharmony_ci v16i8 src11, src12, src13, src14; 1419cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 1420cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, 
src98_r; 1421cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1422cabdff1aSopenharmony_ci v16i8 src1110_r, src1211_r, src1312_r, src1413_r; 1423cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776, src10998; 1424cabdff1aSopenharmony_ci v16i8 src12111110, src14131312; 1425cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76; 1426cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1427cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2, out3; 1428cabdff1aSopenharmony_ci v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec; 1429cabdff1aSopenharmony_ci 1430cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1431cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1432cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1433cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1434cabdff1aSopenharmony_ci 1435cabdff1aSopenharmony_ci const_vec = __msa_ldi_w(128); 1436cabdff1aSopenharmony_ci const_vec <<= 6; 1437cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1438cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1439cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1440cabdff1aSopenharmony_ci weight1_vec = __msa_fill_w(weight1); 1441cabdff1aSopenharmony_ci offset_vec += const_vec * weight1_vec; 1442cabdff1aSopenharmony_ci 1443cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1444cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1445cabdff1aSopenharmony_ci 1446cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1447cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1448cabdff1aSopenharmony_ci 1449cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1450cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1451cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1452cabdff1aSopenharmony_ci ILVR_D3_SB(src21_r, 
src10_r, src43_r, src32_r, src65_r, src54_r, 1453cabdff1aSopenharmony_ci src2110, src4332, src6554); 1454cabdff1aSopenharmony_ci XORI_B3_128_SB(src2110, src4332, src6554); 1455cabdff1aSopenharmony_ci 1456cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 1457cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, 1458cabdff1aSopenharmony_ci src7, src8, src9, src10, src11, src12, src13, src14); 1459cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 1460cabdff1aSopenharmony_ci LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); 1461cabdff1aSopenharmony_ci src1_ptr += (8 * src2_stride); 1462cabdff1aSopenharmony_ci 1463cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 1464cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 1465cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1466cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1467cabdff1aSopenharmony_ci ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, 1468cabdff1aSopenharmony_ci src1110_r, src1211_r, src1312_r, src1413_r); 1469cabdff1aSopenharmony_ci ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r, 1470cabdff1aSopenharmony_ci src1413_r, src1312_r, 1471cabdff1aSopenharmony_ci src8776, src10998, src12111110, src14131312); 1472cabdff1aSopenharmony_ci XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); 1473cabdff1aSopenharmony_ci 1474cabdff1aSopenharmony_ci DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, 1475cabdff1aSopenharmony_ci filt0, dst10, dst32, dst54, dst76); 1476cabdff1aSopenharmony_ci DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1, 1477cabdff1aSopenharmony_ci filt1, dst10, dst32, dst54, dst76); 1478cabdff1aSopenharmony_ci DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2, 1479cabdff1aSopenharmony_ci filt2, filt2, dst10, dst32, dst54, dst76); 1480cabdff1aSopenharmony_ci DPADD_SB4_SH(src8776, 
src10998, src12111110, src14131312, filt3, filt3, 1481cabdff1aSopenharmony_ci filt3, filt3, dst10, dst32, dst54, dst76); 1482cabdff1aSopenharmony_ci 1483cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76, 1484cabdff1aSopenharmony_ci in0, in1, in2, in3, 1485cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1486cabdff1aSopenharmony_ci out0, out1, out2, out3); 1487cabdff1aSopenharmony_ci 1488cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 1489cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1490cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1491cabdff1aSopenharmony_ci 1492cabdff1aSopenharmony_ci src2110 = src10998; 1493cabdff1aSopenharmony_ci src4332 = src12111110; 1494cabdff1aSopenharmony_ci src6554 = src14131312; 1495cabdff1aSopenharmony_ci src6 = src14; 1496cabdff1aSopenharmony_ci } 1497cabdff1aSopenharmony_ci} 1498cabdff1aSopenharmony_ci 1499cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr, 1500cabdff1aSopenharmony_ci int32_t src_stride, 1501cabdff1aSopenharmony_ci int16_t *src1_ptr, 1502cabdff1aSopenharmony_ci int32_t src2_stride, 1503cabdff1aSopenharmony_ci uint8_t *dst, 1504cabdff1aSopenharmony_ci int32_t dst_stride, 1505cabdff1aSopenharmony_ci const int8_t *filter, 1506cabdff1aSopenharmony_ci int32_t height, 1507cabdff1aSopenharmony_ci int32_t weight0, 1508cabdff1aSopenharmony_ci int32_t weight1, 1509cabdff1aSopenharmony_ci int32_t offset0, 1510cabdff1aSopenharmony_ci int32_t offset1, 1511cabdff1aSopenharmony_ci int32_t rnd_val) 1512cabdff1aSopenharmony_ci{ 1513cabdff1aSopenharmony_ci uint32_t loop_cnt; 1514cabdff1aSopenharmony_ci int32_t offset, weight; 1515cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 1516cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10; 1517cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1518cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1519cabdff1aSopenharmony_ci 
v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1520cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 1521cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1522cabdff1aSopenharmony_ci v8i16 filter_vec, out0, out1, out2, out3; 1523cabdff1aSopenharmony_ci v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec; 1524cabdff1aSopenharmony_ci 1525cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1526cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1527cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1528cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1529cabdff1aSopenharmony_ci 1530cabdff1aSopenharmony_ci const_vec = __msa_ldi_w(128); 1531cabdff1aSopenharmony_ci const_vec <<= 6; 1532cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1533cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1534cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1535cabdff1aSopenharmony_ci weight1_vec = __msa_fill_w(weight1); 1536cabdff1aSopenharmony_ci offset_vec += const_vec * weight1_vec; 1537cabdff1aSopenharmony_ci 1538cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1539cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1540cabdff1aSopenharmony_ci 1541cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1542cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1543cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1544cabdff1aSopenharmony_ci 1545cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1546cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1547cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1548cabdff1aSopenharmony_ci 1549cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1550cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 1551cabdff1aSopenharmony_ci src0_ptr 
+= (4 * src_stride); 1552cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 1553cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 1554cabdff1aSopenharmony_ci 1555cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1556cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1557cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1558cabdff1aSopenharmony_ci 1559cabdff1aSopenharmony_ci DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0, 1560cabdff1aSopenharmony_ci filt0, tmp0, tmp1, tmp2, tmp3); 1561cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1, 1562cabdff1aSopenharmony_ci filt1, tmp0, tmp1, tmp2, tmp3); 1563cabdff1aSopenharmony_ci DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2, 1564cabdff1aSopenharmony_ci filt2, tmp0, tmp1, tmp2, tmp3); 1565cabdff1aSopenharmony_ci DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3, 1566cabdff1aSopenharmony_ci filt3, tmp0, tmp1, tmp2, tmp3); 1567cabdff1aSopenharmony_ci 1568cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 1569cabdff1aSopenharmony_ci in0, in1, in2, in3, 1570cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1571cabdff1aSopenharmony_ci out0, out1, out2, out3); 1572cabdff1aSopenharmony_ci 1573cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); 1574cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1575cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1576cabdff1aSopenharmony_ci 1577cabdff1aSopenharmony_ci src10_r = src54_r; 1578cabdff1aSopenharmony_ci src32_r = src76_r; 1579cabdff1aSopenharmony_ci src54_r = src98_r; 1580cabdff1aSopenharmony_ci src21_r = src65_r; 1581cabdff1aSopenharmony_ci src43_r = src87_r; 1582cabdff1aSopenharmony_ci src65_r = src109_r; 1583cabdff1aSopenharmony_ci src6 = src10; 1584cabdff1aSopenharmony_ci } 1585cabdff1aSopenharmony_ci} 
1586cabdff1aSopenharmony_ci 1587cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr, 1588cabdff1aSopenharmony_ci int32_t src_stride, 1589cabdff1aSopenharmony_ci int16_t *src1_ptr, 1590cabdff1aSopenharmony_ci int32_t src2_stride, 1591cabdff1aSopenharmony_ci uint8_t *dst, 1592cabdff1aSopenharmony_ci int32_t dst_stride, 1593cabdff1aSopenharmony_ci const int8_t *filter, 1594cabdff1aSopenharmony_ci int32_t height, 1595cabdff1aSopenharmony_ci int32_t weight0, 1596cabdff1aSopenharmony_ci int32_t weight1, 1597cabdff1aSopenharmony_ci int32_t offset0, 1598cabdff1aSopenharmony_ci int32_t offset1, 1599cabdff1aSopenharmony_ci int32_t rnd_val) 1600cabdff1aSopenharmony_ci{ 1601cabdff1aSopenharmony_ci uint32_t loop_cnt; 1602cabdff1aSopenharmony_ci int32_t offset, weight; 1603cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1604cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1605cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r; 1606cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r; 1607cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2; 1608cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l; 1609cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l; 1610cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776; 1611cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1612cabdff1aSopenharmony_ci v8i16 out0, out1, out2, filter_vec; 1613cabdff1aSopenharmony_ci v4i32 dst2_r, dst2_l; 1614cabdff1aSopenharmony_ci v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec; 1615cabdff1aSopenharmony_ci 1616cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1617cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1618cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1619cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1620cabdff1aSopenharmony_ci 1621cabdff1aSopenharmony_ci const_vec = __msa_ldi_w(128); 1622cabdff1aSopenharmony_ci 
const_vec <<= 6; 1623cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1624cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1625cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1626cabdff1aSopenharmony_ci weight1_vec = __msa_fill_w(weight1); 1627cabdff1aSopenharmony_ci offset_vec += const_vec * weight1_vec; 1628cabdff1aSopenharmony_ci 1629cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1630cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1631cabdff1aSopenharmony_ci 1632cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1633cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1634cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1635cabdff1aSopenharmony_ci 1636cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1637cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1638cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1639cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1640cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1641cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1642cabdff1aSopenharmony_ci ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, 1643cabdff1aSopenharmony_ci src2110, src4332, src6554); 1644cabdff1aSopenharmony_ci 1645cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 1646cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src7, src8); 1647cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 1648cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 1649cabdff1aSopenharmony_ci LD_SH2((src1_ptr + 8), src2_stride, in2, in3); 1650cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 1651cabdff1aSopenharmony_ci in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2); 1652cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 
1653cabdff1aSopenharmony_ci 1654cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 1655cabdff1aSopenharmony_ci ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 1656cabdff1aSopenharmony_ci src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l); 1657cabdff1aSopenharmony_ci 1658cabdff1aSopenharmony_ci DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0, 1659cabdff1aSopenharmony_ci tmp0, tmp1, tmp2); 1660cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1); 1661cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1); 1662cabdff1aSopenharmony_ci DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1); 1663cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2); 1664cabdff1aSopenharmony_ci DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1); 1665cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3); 1666cabdff1aSopenharmony_ci 1667cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1, 1668cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1669cabdff1aSopenharmony_ci out0, out1); 1670cabdff1aSopenharmony_ci 1671cabdff1aSopenharmony_ci ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l); 1672cabdff1aSopenharmony_ci dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, 1673cabdff1aSopenharmony_ci (v8i16) weight_vec); 1674cabdff1aSopenharmony_ci dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, 1675cabdff1aSopenharmony_ci (v8i16) weight_vec); 1676cabdff1aSopenharmony_ci SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); 1677cabdff1aSopenharmony_ci out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); 1678cabdff1aSopenharmony_ci CLIP_SH_0_255(out2); 1679cabdff1aSopenharmony_ci PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); 1680cabdff1aSopenharmony_ci ST_D2(out0, 0, 1, dst, dst_stride); 1681cabdff1aSopenharmony_ci ST_W2(out2, 0, 1, dst + 8, dst_stride); 1682cabdff1aSopenharmony_ci dst += (2 * dst_stride); 1683cabdff1aSopenharmony_ci 
1684cabdff1aSopenharmony_ci src10_r = src32_r; 1685cabdff1aSopenharmony_ci src32_r = src54_r; 1686cabdff1aSopenharmony_ci src54_r = src76_r; 1687cabdff1aSopenharmony_ci src21_r = src43_r; 1688cabdff1aSopenharmony_ci src43_r = src65_r; 1689cabdff1aSopenharmony_ci src65_r = src87_r; 1690cabdff1aSopenharmony_ci src2110 = src4332; 1691cabdff1aSopenharmony_ci src4332 = src6554; 1692cabdff1aSopenharmony_ci src6554 = src8776; 1693cabdff1aSopenharmony_ci src6 = src8; 1694cabdff1aSopenharmony_ci } 1695cabdff1aSopenharmony_ci} 1696cabdff1aSopenharmony_ci 1697cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr, 1698cabdff1aSopenharmony_ci int32_t src_stride, 1699cabdff1aSopenharmony_ci int16_t *src1_ptr, 1700cabdff1aSopenharmony_ci int32_t src2_stride, 1701cabdff1aSopenharmony_ci uint8_t *dst, 1702cabdff1aSopenharmony_ci int32_t dst_stride, 1703cabdff1aSopenharmony_ci const int8_t *filter, 1704cabdff1aSopenharmony_ci int32_t height, 1705cabdff1aSopenharmony_ci int32_t weight0, 1706cabdff1aSopenharmony_ci int32_t weight1, 1707cabdff1aSopenharmony_ci int32_t offset0, 1708cabdff1aSopenharmony_ci int32_t offset1, 1709cabdff1aSopenharmony_ci int32_t rnd_val, 1710cabdff1aSopenharmony_ci int32_t width) 1711cabdff1aSopenharmony_ci{ 1712cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 1713cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 1714cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1715cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1716cabdff1aSopenharmony_ci int32_t offset, weight; 1717cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1718cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1719cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r; 1720cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r; 1721cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l; 1722cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l; 1723cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 
1724cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1725cabdff1aSopenharmony_ci v8i16 filter_vec; 1726cabdff1aSopenharmony_ci v8i16 out0, out1, out2, out3; 1727cabdff1aSopenharmony_ci v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec; 1728cabdff1aSopenharmony_ci 1729cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1730cabdff1aSopenharmony_ci 1731cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1732cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1733cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1734cabdff1aSopenharmony_ci 1735cabdff1aSopenharmony_ci const_vec = __msa_ldi_w(128); 1736cabdff1aSopenharmony_ci const_vec <<= 6; 1737cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1738cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1739cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1740cabdff1aSopenharmony_ci weight1_vec = __msa_fill_w(weight1); 1741cabdff1aSopenharmony_ci offset_vec += const_vec * weight1_vec; 1742cabdff1aSopenharmony_ci 1743cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1744cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1745cabdff1aSopenharmony_ci 1746cabdff1aSopenharmony_ci for (cnt = (width >> 4); cnt--;) { 1747cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 1748cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 1749cabdff1aSopenharmony_ci dst_tmp = dst; 1750cabdff1aSopenharmony_ci 1751cabdff1aSopenharmony_ci LD_SB7(src0_ptr_tmp, src_stride, 1752cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6); 1753cabdff1aSopenharmony_ci src0_ptr_tmp += (7 * src_stride); 1754cabdff1aSopenharmony_ci 1755cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1756cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1757cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1758cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, 
src65_r); 1759cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1760cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1761cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1762cabdff1aSopenharmony_ci 1763cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 1764cabdff1aSopenharmony_ci LD_SB2(src0_ptr_tmp, src_stride, src7, src8); 1765cabdff1aSopenharmony_ci src0_ptr_tmp += (2 * src_stride); 1766cabdff1aSopenharmony_ci LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); 1767cabdff1aSopenharmony_ci LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3); 1768cabdff1aSopenharmony_ci src1_ptr_tmp += (2 * src2_stride); 1769cabdff1aSopenharmony_ci 1770cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 1771cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 1772cabdff1aSopenharmony_ci ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 1773cabdff1aSopenharmony_ci 1774cabdff1aSopenharmony_ci DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0, 1775cabdff1aSopenharmony_ci filt0, filt0, tmp0, tmp1, tmp2, tmp3); 1776cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1, 1777cabdff1aSopenharmony_ci filt1, filt1, tmp0, tmp1, tmp2, tmp3); 1778cabdff1aSopenharmony_ci DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2, 1779cabdff1aSopenharmony_ci filt2, filt2, tmp0, tmp1, tmp2, tmp3); 1780cabdff1aSopenharmony_ci DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3, 1781cabdff1aSopenharmony_ci filt3, filt3, tmp0, tmp1, tmp2, tmp3); 1782cabdff1aSopenharmony_ci 1783cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 1784cabdff1aSopenharmony_ci in0, in1, in2, in3, 1785cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 1786cabdff1aSopenharmony_ci out0, out1, out2, out3); 1787cabdff1aSopenharmony_ci 1788cabdff1aSopenharmony_ci PCKEV_B2_SH(out2, out0, out3, out1, out0, out1); 1789cabdff1aSopenharmony_ci 
ST_SH2(out0, out1, dst_tmp, dst_stride); 1790cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 1791cabdff1aSopenharmony_ci 1792cabdff1aSopenharmony_ci src10_r = src32_r; 1793cabdff1aSopenharmony_ci src32_r = src54_r; 1794cabdff1aSopenharmony_ci src54_r = src76_r; 1795cabdff1aSopenharmony_ci src21_r = src43_r; 1796cabdff1aSopenharmony_ci src43_r = src65_r; 1797cabdff1aSopenharmony_ci src65_r = src87_r; 1798cabdff1aSopenharmony_ci src10_l = src32_l; 1799cabdff1aSopenharmony_ci src32_l = src54_l; 1800cabdff1aSopenharmony_ci src54_l = src76_l; 1801cabdff1aSopenharmony_ci src21_l = src43_l; 1802cabdff1aSopenharmony_ci src43_l = src65_l; 1803cabdff1aSopenharmony_ci src65_l = src87_l; 1804cabdff1aSopenharmony_ci src6 = src8; 1805cabdff1aSopenharmony_ci } 1806cabdff1aSopenharmony_ci 1807cabdff1aSopenharmony_ci src0_ptr += 16; 1808cabdff1aSopenharmony_ci src1_ptr += 16; 1809cabdff1aSopenharmony_ci dst += 16; 1810cabdff1aSopenharmony_ci } 1811cabdff1aSopenharmony_ci} 1812cabdff1aSopenharmony_ci /* 16-column entry point for the vertical 8-tap bi-weighted path: passes every argument through unchanged to hevc_vt_biwgt_8t_16multx2mult_msa and supplies 16 as the trailing argument. No return value. */ 1813cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr, 1814cabdff1aSopenharmony_ci int32_t src_stride, 1815cabdff1aSopenharmony_ci int16_t *src1_ptr, 1816cabdff1aSopenharmony_ci int32_t src2_stride, 1817cabdff1aSopenharmony_ci uint8_t *dst, 1818cabdff1aSopenharmony_ci int32_t dst_stride, 1819cabdff1aSopenharmony_ci const int8_t *filter, 1820cabdff1aSopenharmony_ci int32_t height, 1821cabdff1aSopenharmony_ci int32_t weight0, 1822cabdff1aSopenharmony_ci int32_t weight1, 1823cabdff1aSopenharmony_ci int32_t offset0, 1824cabdff1aSopenharmony_ci int32_t offset1, 1825cabdff1aSopenharmony_ci int32_t rnd_val) 1826cabdff1aSopenharmony_ci{ 1827cabdff1aSopenharmony_ci hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1828cabdff1aSopenharmony_ci src1_ptr, src2_stride, 1829cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 1830cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 1831cabdff1aSopenharmony_ci rnd_val, 16); 1832cabdff1aSopenharmony_ci}
1833cabdff1aSopenharmony_ci /* 24-column entry point for the vertical 8-tap bi-weighted path: first calls hevc_vt_biwgt_8t_16multx2mult_msa with a trailing argument of 16, then calls hevc_vt_biwgt_8t_8w_msa with src0_ptr, src1_ptr and dst each advanced by 16 and the same strides, filter, height, weights, offsets and rnd_val. No return value. */ 1834cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr, 1835cabdff1aSopenharmony_ci int32_t src_stride, 1836cabdff1aSopenharmony_ci int16_t *src1_ptr, 1837cabdff1aSopenharmony_ci int32_t src2_stride, 1838cabdff1aSopenharmony_ci uint8_t *dst, 1839cabdff1aSopenharmony_ci int32_t dst_stride, 1840cabdff1aSopenharmony_ci const int8_t *filter, 1841cabdff1aSopenharmony_ci int32_t height, 1842cabdff1aSopenharmony_ci int32_t weight0, 1843cabdff1aSopenharmony_ci int32_t weight1, 1844cabdff1aSopenharmony_ci int32_t offset0, 1845cabdff1aSopenharmony_ci int32_t offset1, 1846cabdff1aSopenharmony_ci int32_t rnd_val) 1847cabdff1aSopenharmony_ci{ 1848cabdff1aSopenharmony_ci hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1849cabdff1aSopenharmony_ci src1_ptr, src2_stride, 1850cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 1851cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 1852cabdff1aSopenharmony_ci rnd_val, 16); /* Second call handles the part to the right: all three pointers are advanced by 16 elements. */ 1853cabdff1aSopenharmony_ci hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride, 1854cabdff1aSopenharmony_ci src1_ptr + 16, src2_stride, 1855cabdff1aSopenharmony_ci dst + 16, dst_stride, filter, height, 1856cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 1857cabdff1aSopenharmony_ci} 1858cabdff1aSopenharmony_ci /* 32-column entry point: forwards every argument unchanged to hevc_vt_biwgt_8t_16multx2mult_msa with a trailing argument of 32. No return value. */ 1859cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr, 1860cabdff1aSopenharmony_ci int32_t src_stride, 1861cabdff1aSopenharmony_ci int16_t *src1_ptr, 1862cabdff1aSopenharmony_ci int32_t src2_stride, 1863cabdff1aSopenharmony_ci uint8_t *dst, 1864cabdff1aSopenharmony_ci int32_t dst_stride, 1865cabdff1aSopenharmony_ci const int8_t *filter, 1866cabdff1aSopenharmony_ci int32_t height, 1867cabdff1aSopenharmony_ci int32_t weight0, 1868cabdff1aSopenharmony_ci int32_t weight1, 1869cabdff1aSopenharmony_ci int32_t offset0, 1870cabdff1aSopenharmony_ci int32_t offset1, 1871cabdff1aSopenharmony_ci int32_t rnd_val) 1872cabdff1aSopenharmony_ci{
1873cabdff1aSopenharmony_ci hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1874cabdff1aSopenharmony_ci src1_ptr, src2_stride, 1875cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 1876cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 1877cabdff1aSopenharmony_ci rnd_val, 32); 1878cabdff1aSopenharmony_ci} 1879cabdff1aSopenharmony_ci /* 48-column entry point: forwards every argument unchanged to hevc_vt_biwgt_8t_16multx2mult_msa with a trailing argument of 48. No return value. */ 1880cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr, 1881cabdff1aSopenharmony_ci int32_t src_stride, 1882cabdff1aSopenharmony_ci int16_t *src1_ptr, 1883cabdff1aSopenharmony_ci int32_t src2_stride, 1884cabdff1aSopenharmony_ci uint8_t *dst, 1885cabdff1aSopenharmony_ci int32_t dst_stride, 1886cabdff1aSopenharmony_ci const int8_t *filter, 1887cabdff1aSopenharmony_ci int32_t height, 1888cabdff1aSopenharmony_ci int32_t weight0, 1889cabdff1aSopenharmony_ci int32_t weight1, 1890cabdff1aSopenharmony_ci int32_t offset0, 1891cabdff1aSopenharmony_ci int32_t offset1, 1892cabdff1aSopenharmony_ci int32_t rnd_val) 1893cabdff1aSopenharmony_ci{ 1894cabdff1aSopenharmony_ci hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1895cabdff1aSopenharmony_ci src1_ptr, src2_stride, 1896cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 1897cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 1898cabdff1aSopenharmony_ci rnd_val, 48); 1899cabdff1aSopenharmony_ci} 1900cabdff1aSopenharmony_ci /* 64-column entry point: forwards every argument unchanged to hevc_vt_biwgt_8t_16multx2mult_msa with a trailing argument of 64. No return value. */ 1901cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr, 1902cabdff1aSopenharmony_ci int32_t src_stride, 1903cabdff1aSopenharmony_ci int16_t *src1_ptr, 1904cabdff1aSopenharmony_ci int32_t src2_stride, 1905cabdff1aSopenharmony_ci uint8_t *dst, 1906cabdff1aSopenharmony_ci int32_t dst_stride, 1907cabdff1aSopenharmony_ci const int8_t *filter, 1908cabdff1aSopenharmony_ci int32_t height, 1909cabdff1aSopenharmony_ci int32_t weight0, 1910cabdff1aSopenharmony_ci int32_t weight1, 1911cabdff1aSopenharmony_ci int32_t offset0, 1912cabdff1aSopenharmony_ci int32_t offset1,
1913cabdff1aSopenharmony_ci int32_t rnd_val) 1914cabdff1aSopenharmony_ci{ 1915cabdff1aSopenharmony_ci hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, 1916cabdff1aSopenharmony_ci src1_ptr, src2_stride, 1917cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 1918cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 1919cabdff1aSopenharmony_ci rnd_val, 64); 1920cabdff1aSopenharmony_ci} 1921cabdff1aSopenharmony_ci /* Bi-weighted 8-tap filter that applies filter_x and then filter_y, for the 4-column case. src0_ptr/src_stride: 8-bit source rows; the pointer is rewound by 3 rows and 3 bytes before the first load. src1_ptr/src2_stride: 16-bit second input; four rows are fetched per loop pass (two LD2 calls, each followed by a 2 * src2_stride advance). dst/dst_stride: 8-bit output; each pass ends with ST_W4 using indices 0..3 and a 4 * dst_stride advance. height: consumed in groups of four (height >> 2 passes). weight0/weight1: packed into one 32-bit word, weight0 in the low 16 bits and weight1 in the high 16 bits. offset0/offset1: summed and shifted left by rnd_val. rnd_val: the final SRAR_W4_SW uses rnd_val + 1. No return value. */ 1922cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, 1923cabdff1aSopenharmony_ci int32_t src_stride, 1924cabdff1aSopenharmony_ci int16_t *src1_ptr, 1925cabdff1aSopenharmony_ci int32_t src2_stride, 1926cabdff1aSopenharmony_ci uint8_t *dst, 1927cabdff1aSopenharmony_ci int32_t dst_stride, 1928cabdff1aSopenharmony_ci const int8_t *filter_x, 1929cabdff1aSopenharmony_ci const int8_t *filter_y, 1930cabdff1aSopenharmony_ci int32_t height, 1931cabdff1aSopenharmony_ci int32_t weight0, 1932cabdff1aSopenharmony_ci int32_t weight1, 1933cabdff1aSopenharmony_ci int32_t offset0, 1934cabdff1aSopenharmony_ci int32_t offset1, 1935cabdff1aSopenharmony_ci int32_t rnd_val) 1936cabdff1aSopenharmony_ci{ 1937cabdff1aSopenharmony_ci uint32_t loop_cnt; 1938cabdff1aSopenharmony_ci uint64_t tp0, tp1; 1939cabdff1aSopenharmony_ci int32_t offset, weight; 1940cabdff1aSopenharmony_ci v16u8 out; 1941cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1942cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }; 1943cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1944cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1945cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1946cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec; 1947cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1948cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1949cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1950cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 1951cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76; 1952cabdff1aSopenharmony_ci v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98; 1953cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3; /* mask0 comes from the second 16-byte row of ff_hevc_mask_arr, whose upper eight indices are 16..20 (i.e. they select from the second shuffle operand). */ 1954cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1955cabdff1aSopenharmony_ci /* Step back 3 rows and 3 bytes so the loads below start ahead of the block origin. */ 1956cabdff1aSopenharmony_ci src0_ptr -= ((3 * src_stride) + 3); 1957cabdff1aSopenharmony_ci /* filt0..filt3 are built from halfword elements 0..3 of the filter_x load; filt_h0..filt_h3 are built from filter_y after UNPCK_R_SB_SH (presumably a byte-to-halfword widening - confirm against the macro header). */ 1958cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1959cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1960cabdff1aSopenharmony_ci 1961cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1962cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1963cabdff1aSopenharmony_ci 1964cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1965cabdff1aSopenharmony_ci /* mask1..mask3 are mask0 with 2, 4 and 6 added to every byte index. */ 1966cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1967cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1968cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1969cabdff1aSopenharmony_ci /* Combined offset is pre-shifted by rnd_val; both weights share one 32-bit word so a single dot-product applies them together. */ 1970cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 1971cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 1972cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 1973cabdff1aSopenharmony_ci /* (128 * weight1) << 6 is folded into offset_vec. NOTE(review): looks like compensation for the XOR-by-128 applied to the source bytes below - confirm. */ 1974cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 1975cabdff1aSopenharmony_ci const_vec <<= 6; 1976cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1977cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 1978cabdff1aSopenharmony_ci offset_vec += const_vec; 1979cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 1980cabdff1aSopenharmony_ci /* Prime the filter with the first 7 source rows (src0..src6). */ 1981cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1982cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1983cabdff1aSopenharmony_ci 1984cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1985cabdff1aSopenharmony_ci /* Each shuffle takes two rows three apart (0/3, 1/4, 2/5, 3/6), so each filter result below carries data for two rows - hence the paired names dst30, dst41, dst52, dst63. */ 1986cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1987cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1988cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1989cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1990cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1991cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1992cabdff1aSopenharmony_ci 1993cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1994cabdff1aSopenharmony_ci filt3); 1995cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1996cabdff1aSopenharmony_ci filt3); 1997cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1998cabdff1aSopenharmony_ci filt3); 1999cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 2000cabdff1aSopenharmony_ci filt3); 2001cabdff1aSopenharmony_ci 2002cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 2003cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 2004cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 2005cabdff1aSopenharmony_ci 2006cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 2007cabdff1aSopenharmony_ci /* Main loop: each pass loads 4 new source rows and 4 rows of src1_ptr. */ 2008cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 2009cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 2010cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2011cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 2012cabdff1aSopenharmony_ci 2013cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 2014cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 2015cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 2016cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride,
tp0, tp1); 2017cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 2018cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 2019cabdff1aSopenharmony_ci 2020cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3, 2021cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 2022cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3, 2023cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 2024cabdff1aSopenharmony_ci dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2025cabdff1aSopenharmony_ci filt3); 2026cabdff1aSopenharmony_ci dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2027cabdff1aSopenharmony_ci filt3); 2028cabdff1aSopenharmony_ci 2029cabdff1aSopenharmony_ci dst76 = __msa_ilvr_h(dst97, dst66); 2030cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87, dst109); 2031cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 2032cabdff1aSopenharmony_ci dst98 = __msa_ilvr_h(dst66, dst108); 2033cabdff1aSopenharmony_ci /* Second filter pass with the filter_y taps (filt_h0..filt_h3), producing four 32-bit result vectors. */ 2034cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1, 2035cabdff1aSopenharmony_ci filt_h2, filt_h3); 2036cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1, 2037cabdff1aSopenharmony_ci filt_h2, filt_h3); 2038cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1, 2039cabdff1aSopenharmony_ci filt_h2, filt_h3); 2040cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1, 2041cabdff1aSopenharmony_ci filt_h2, filt_h3); /* Shift the four results by 6 (SRA_4V), pack them to halfwords, then interleave with in0/in1 so each dpadd below combines one filtered sample and one src1 sample with the packed weights on top of offset_vec. */ 2042cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 2043cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 2044cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2045cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2046cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2047cabdff1aSopenharmony_ci dst1 =
__msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2048cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2049cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); /* Shift by rnd_vec (rnd_val + 1) via SRAR_W4_SW, apply CLIP_SW4_0_255, pack down to bytes and store four rows. */ 2050cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 2051cabdff1aSopenharmony_ci CLIP_SW4_0_255(dst0, dst1, dst2, dst3); 2052cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 2053cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2054cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2055cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2056cabdff1aSopenharmony_ci /* Carry the interleaved row pairs forward so the next pass only has to filter its four new rows. */ 2057cabdff1aSopenharmony_ci dst10 = dst54; 2058cabdff1aSopenharmony_ci dst32 = dst76; 2059cabdff1aSopenharmony_ci dst54 = dst98; 2060cabdff1aSopenharmony_ci dst21 = dst65; 2061cabdff1aSopenharmony_ci dst43 = dst87; 2062cabdff1aSopenharmony_ci dst65 = dst109; 2063cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 2064cabdff1aSopenharmony_ci } 2065cabdff1aSopenharmony_ci} 2066cabdff1aSopenharmony_ci /* Bi-weighted 8-tap filter that applies filter_x and then filter_y, processed in strips of 8 columns. width8mult: number of strips; after each strip src0_ptr, src1_ptr and dst advance by 8. height: consumed two rows at a time (height >> 1 passes per strip); each pass loads two new source rows and two src1 rows and ends with ST_D2 plus a 2 * dst_stride advance. src0_ptr is rewound by 3 rows and 3 bytes before the first load. weight0/weight1 are packed into one 32-bit word (weight0 low 16 bits, weight1 high 16 bits); offset0/offset1 are summed and shifted left by rnd_val; the final SRAR_W4_SW uses rnd_val + 1. No return value. */ 2067cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, 2068cabdff1aSopenharmony_ci int32_t src_stride, 2069cabdff1aSopenharmony_ci int16_t *src1_ptr, 2070cabdff1aSopenharmony_ci int32_t src2_stride, 2071cabdff1aSopenharmony_ci uint8_t *dst, 2072cabdff1aSopenharmony_ci int32_t dst_stride, 2073cabdff1aSopenharmony_ci const int8_t *filter_x, 2074cabdff1aSopenharmony_ci const int8_t *filter_y, 2075cabdff1aSopenharmony_ci int32_t height, 2076cabdff1aSopenharmony_ci int32_t weight0, 2077cabdff1aSopenharmony_ci int32_t weight1, 2078cabdff1aSopenharmony_ci int32_t offset0, 2079cabdff1aSopenharmony_ci int32_t offset1, 2080cabdff1aSopenharmony_ci int32_t rnd_val, 2081cabdff1aSopenharmony_ci int32_t width8mult) 2082cabdff1aSopenharmony_ci{ 2083cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 2084cabdff1aSopenharmony_ci int32_t offset, weight; 2085cabdff1aSopenharmony_ci
uint8_t *src0_ptr_tmp; 2086cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 2087cabdff1aSopenharmony_ci uint8_t *dst_tmp; 2088cabdff1aSopenharmony_ci v16u8 out; 2089cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2090cabdff1aSopenharmony_ci v8i16 in0, in1; 2091cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 2092cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 2093cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2094cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 2095cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec; 2096cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2097cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 2098cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 2099cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 2100cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 2101cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 2102cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 2103cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 2104cabdff1aSopenharmony_ci v8i16 dst21_l, dst43_l, dst65_l, dst87_l; 2105cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec; 2106cabdff1aSopenharmony_ci /* Step back 3 rows and 3 bytes so the loads below start ahead of the block origin. */ 2107cabdff1aSopenharmony_ci src0_ptr -= ((3 * src_stride) + 3); 2108cabdff1aSopenharmony_ci /* Combined offset is pre-shifted by rnd_val; both weights share one 32-bit word. */ 2109cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 2110cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 2111cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 2112cabdff1aSopenharmony_ci /* (128 * weight1) << 6 is folded into offset_vec. NOTE(review): looks like compensation for the XOR-by-128 applied to the source bytes - confirm. */ 2113cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 2114cabdff1aSopenharmony_ci const_vec <<= 6; 2115cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 2116cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 2117cabdff1aSopenharmony_ci offset_vec += const_vec; 2118cabdff1aSopenharmony_ci
weight_vec = (v8i16) __msa_fill_w(weight); 2119cabdff1aSopenharmony_ci 2120cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 2121cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 2122cabdff1aSopenharmony_ci 2123cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 2124cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 2125cabdff1aSopenharmony_ci 2126cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 2127cabdff1aSopenharmony_ci /* mask1..mask3 are mask0 with 2, 4 and 6 added to every byte index. */ 2128cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2129cabdff1aSopenharmony_ci mask2 = mask0 + 4; 2130cabdff1aSopenharmony_ci mask3 = mask0 + 6; 2131cabdff1aSopenharmony_ci /* One pass per 8-column strip; the *_tmp pointers walk down the strip while src0_ptr, src1_ptr and dst keep its top. */ 2132cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 2133cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 2134cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 2135cabdff1aSopenharmony_ci dst_tmp = dst; 2136cabdff1aSopenharmony_ci /* Prime the filter with the first 7 source rows of this strip. */ 2137cabdff1aSopenharmony_ci LD_SB7(src0_ptr_tmp, src_stride, 2138cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6); 2139cabdff1aSopenharmony_ci src0_ptr_tmp += (7 * src_stride); 2140cabdff1aSopenharmony_ci 2141cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2142cabdff1aSopenharmony_ci 2143cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 2144cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 2145cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 2146cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 2147cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 2148cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 2149cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 2150cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 2151cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 2152cabdff1aSopenharmony_ci 2153cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2154cabdff1aSopenharmony_ci filt3); 2155cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2156cabdff1aSopenharmony_ci filt3); 2157cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2158cabdff1aSopenharmony_ci filt3); 2159cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 2160cabdff1aSopenharmony_ci filt2, filt3); 2161cabdff1aSopenharmony_ci 2162cabdff1aSopenharmony_ci /* row 4 row 5 row 6 */ 2163cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 2164cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 2165cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 2166cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 2167cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 2168cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 2169cabdff1aSopenharmony_ci 2170cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2171cabdff1aSopenharmony_ci filt3); 2172cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2173cabdff1aSopenharmony_ci filt3); 2174cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2175cabdff1aSopenharmony_ci filt3); 2176cabdff1aSopenharmony_ci /* Two new source rows (src7, src8) and two src1 rows (in0, in1) per pass. */ 2177cabdff1aSopenharmony_ci for (loop_cnt = height >> 1; loop_cnt--;) { 2178cabdff1aSopenharmony_ci LD_SB2(src0_ptr_tmp, src_stride, src7, src8); 2179cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 2180cabdff1aSopenharmony_ci src0_ptr_tmp += 2 * src_stride; 2181cabdff1aSopenharmony_ci 2182cabdff1aSopenharmony_ci LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); 2183cabdff1aSopenharmony_ci src1_ptr_tmp += (2 * src2_stride); 2184cabdff1aSopenharmony_ci 2185cabdff1aSopenharmony_ci ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r, 2186cabdff1aSopenharmony_ci dst32_r, dst54_r, dst21_r);
2187cabdff1aSopenharmony_ci ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l, 2188cabdff1aSopenharmony_ci dst32_l, dst54_l, dst21_l); 2189cabdff1aSopenharmony_ci ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); 2190cabdff1aSopenharmony_ci ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); 2191cabdff1aSopenharmony_ci 2192cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 2193cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 2194cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 2195cabdff1aSopenharmony_ci filt2, filt3); 2196cabdff1aSopenharmony_ci /* First output row: filter_y taps over row pairs 10/32/54/76, right and left halves separately, then shifted right by 6. */ 2197cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 2198cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 2199cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2200cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 2201cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2202cabdff1aSopenharmony_ci 2203cabdff1aSopenharmony_ci dst0_r >>= 6; 2204cabdff1aSopenharmony_ci dst0_l >>= 6; 2205cabdff1aSopenharmony_ci 2206cabdff1aSopenharmony_ci /* row 8 */ 2207cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, 2208cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 2209cabdff1aSopenharmony_ci dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 2210cabdff1aSopenharmony_ci filt2, filt3); 2211cabdff1aSopenharmony_ci /* Second output row: same taps over row pairs 21/43/65/87. */ 2212cabdff1aSopenharmony_ci ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 2213cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 2214cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2215cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, 2216cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2217cabdff1aSopenharmony_ci 2218cabdff1aSopenharmony_ci dst1_r >>= 6; 2219cabdff1aSopenharmony_ci dst1_l >>= 6;
2220cabdff1aSopenharmony_ci /* Pack the filtered rows to halfwords and interleave them with in0/in1 so each dpadd combines one filtered sample and one src1 sample with the packed weights on top of offset_vec; then shift by rnd_vec, apply CLIP_SW4_0_255 and pack to bytes. */ 2221cabdff1aSopenharmony_ci PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3); 2222cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2223cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2224cabdff1aSopenharmony_ci dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2225cabdff1aSopenharmony_ci dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2226cabdff1aSopenharmony_ci dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2227cabdff1aSopenharmony_ci dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 2228cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec); 2229cabdff1aSopenharmony_ci CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r); 2230cabdff1aSopenharmony_ci PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); 2231cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2232cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst_tmp, dst_stride); 2233cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 2234cabdff1aSopenharmony_ci /* Slide the retained rows by two: dst0..dst6 take the values of dst2..dst8. */ 2235cabdff1aSopenharmony_ci dst0 = dst2; 2236cabdff1aSopenharmony_ci dst1 = dst3; 2237cabdff1aSopenharmony_ci dst2 = dst4; 2238cabdff1aSopenharmony_ci dst3 = dst5; 2239cabdff1aSopenharmony_ci dst4 = dst6; 2240cabdff1aSopenharmony_ci dst5 = dst7; 2241cabdff1aSopenharmony_ci dst6 = dst8; 2242cabdff1aSopenharmony_ci } 2243cabdff1aSopenharmony_ci /* Move all three base pointers to the next 8-column strip. */ 2244cabdff1aSopenharmony_ci src0_ptr += 8; 2245cabdff1aSopenharmony_ci src1_ptr += 8; 2246cabdff1aSopenharmony_ci dst += 8; 2247cabdff1aSopenharmony_ci } 2248cabdff1aSopenharmony_ci} 2249cabdff1aSopenharmony_ci /* 8-column entry point: forwards every argument unchanged to hevc_hv_biwgt_8t_8multx2mult_msa with a trailing width8mult argument of 1. No return value. */ 2250cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, 2251cabdff1aSopenharmony_ci int32_t src_stride, 2252cabdff1aSopenharmony_ci int16_t *src1_ptr, 2253cabdff1aSopenharmony_ci int32_t src2_stride, 2254cabdff1aSopenharmony_ci uint8_t *dst, 2255cabdff1aSopenharmony_ci int32_t dst_stride, 2256cabdff1aSopenharmony_ci const int8_t *filter_x,
2257cabdff1aSopenharmony_ci const int8_t *filter_y, 2258cabdff1aSopenharmony_ci int32_t height, 2259cabdff1aSopenharmony_ci int32_t weight0, 2260cabdff1aSopenharmony_ci int32_t weight1, 2261cabdff1aSopenharmony_ci int32_t offset0, 2262cabdff1aSopenharmony_ci int32_t offset1, 2263cabdff1aSopenharmony_ci int32_t rnd_val) 2264cabdff1aSopenharmony_ci{ /* Single 8-column strip: every argument is passed through and the trailing width8mult argument is 1. */ 2265cabdff1aSopenharmony_ci hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2266cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2267cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2268cabdff1aSopenharmony_ci height, weight0, weight1, offset0, 2269cabdff1aSopenharmony_ci offset1, rnd_val, 1); 2270cabdff1aSopenharmony_ci} 2271cabdff1aSopenharmony_ci 2272cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, 2273cabdff1aSopenharmony_ci int32_t src_stride, 2274cabdff1aSopenharmony_ci int16_t *src1_ptr, 2275cabdff1aSopenharmony_ci int32_t src2_stride, 2276cabdff1aSopenharmony_ci uint8_t *dst, 2277cabdff1aSopenharmony_ci int32_t dst_stride, 2278cabdff1aSopenharmony_ci const int8_t *filter_x, 2279cabdff1aSopenharmony_ci const int8_t *filter_y, 2280cabdff1aSopenharmony_ci int32_t height, 2281cabdff1aSopenharmony_ci int32_t weight0, 2282cabdff1aSopenharmony_ci int32_t weight1, 2283cabdff1aSopenharmony_ci int32_t offset0, 2284cabdff1aSopenharmony_ci int32_t offset1, 2285cabdff1aSopenharmony_ci int32_t rnd_val) 2286cabdff1aSopenharmony_ci{ 2287cabdff1aSopenharmony_ci uint32_t loop_cnt; 2288cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp, *dst_tmp; 2289cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 2290cabdff1aSopenharmony_ci int32_t offset, weight; 2291cabdff1aSopenharmony_ci uint64_t tp0, tp1; 2292cabdff1aSopenharmony_ci v16u8 out; 2293cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2294cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2295cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14,
vec15; 2296cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 2297cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }; 2298cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3; 2299cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 2300cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8; 2301cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r; 2302cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l; 2303cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76; 2304cabdff1aSopenharmony_ci v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l; 2305cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3; 2306cabdff1aSopenharmony_ci 2307cabdff1aSopenharmony_ci src0_ptr -= ((3 * src_stride) + 3); 2308cabdff1aSopenharmony_ci 2309cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 2310cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 2311cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 2312cabdff1aSopenharmony_ci 2313cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 2314cabdff1aSopenharmony_ci const_vec <<= 6; 2315cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 2316cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 2317cabdff1aSopenharmony_ci offset_vec += const_vec; 2318cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 2319cabdff1aSopenharmony_ci 2320cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 2321cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 2322cabdff1aSopenharmony_ci 2323cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 2324cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 2325cabdff1aSopenharmony_ci 
2326cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 2327cabdff1aSopenharmony_ci 2328cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 2329cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2330cabdff1aSopenharmony_ci mask2 = mask0 + 4; 2331cabdff1aSopenharmony_ci mask3 = mask0 + 6; 2332cabdff1aSopenharmony_ci 2333cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 2334cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 2335cabdff1aSopenharmony_ci dst_tmp = dst; 2336cabdff1aSopenharmony_ci 2337cabdff1aSopenharmony_ci LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 2338cabdff1aSopenharmony_ci src0_ptr_tmp += (7 * src_stride); 2339cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2340cabdff1aSopenharmony_ci 2341cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2342cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 2343cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2344cabdff1aSopenharmony_ci vec11); 2345cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 2346cabdff1aSopenharmony_ci vec15); 2347cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2348cabdff1aSopenharmony_ci filt3); 2349cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2350cabdff1aSopenharmony_ci filt3); 2351cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2352cabdff1aSopenharmony_ci filt3); 2353cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 2354cabdff1aSopenharmony_ci filt2, filt3); 2355cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2356cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, 
mask2, mask3, vec4, vec5, vec6, vec7); 2357cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2358cabdff1aSopenharmony_ci vec11); 2359cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2360cabdff1aSopenharmony_ci filt3); 2361cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2362cabdff1aSopenharmony_ci filt3); 2363cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2364cabdff1aSopenharmony_ci filt3); 2365cabdff1aSopenharmony_ci 2366cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 2367cabdff1aSopenharmony_ci LD_SB2(src0_ptr_tmp, src_stride, src7, src8); 2368cabdff1aSopenharmony_ci src0_ptr_tmp += (2 * src_stride); 2369cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 2370cabdff1aSopenharmony_ci 2371cabdff1aSopenharmony_ci LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); 2372cabdff1aSopenharmony_ci src1_ptr_tmp += (2 * src2_stride); 2373cabdff1aSopenharmony_ci 2374cabdff1aSopenharmony_ci ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1, 2375cabdff1aSopenharmony_ci dst10_r, dst32_r, dst54_r, dst21_r); 2376cabdff1aSopenharmony_ci ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1, 2377cabdff1aSopenharmony_ci dst10_l, dst32_l, dst54_l, dst21_l); 2378cabdff1aSopenharmony_ci ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r); 2379cabdff1aSopenharmony_ci ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l); 2380cabdff1aSopenharmony_ci 2381cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 2382cabdff1aSopenharmony_ci vec3); 2383cabdff1aSopenharmony_ci dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2384cabdff1aSopenharmony_ci filt3); 2385cabdff1aSopenharmony_ci 2386cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 2387cabdff1aSopenharmony_ci dst0 = 
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 2388cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2389cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0, 2390cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2391cabdff1aSopenharmony_ci dst0 >>= 6; 2392cabdff1aSopenharmony_ci dst1 >>= 6; 2393cabdff1aSopenharmony_ci 2394cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 2395cabdff1aSopenharmony_ci vec3); 2396cabdff1aSopenharmony_ci dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2397cabdff1aSopenharmony_ci filt3); 2398cabdff1aSopenharmony_ci 2399cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 2400cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 2401cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2402cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0, 2403cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2404cabdff1aSopenharmony_ci dst2 >>= 6; 2405cabdff1aSopenharmony_ci dst3 >>= 6; 2406cabdff1aSopenharmony_ci 2407cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 2408cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2409cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2410cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2411cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2412cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2413cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 2414cabdff1aSopenharmony_ci SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec); 2415cabdff1aSopenharmony_ci CLIP_SW4_0_255(dst1, dst0, dst3, dst2); 2416cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 2417cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 
2418cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst_tmp, dst_stride); 2419cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 2420cabdff1aSopenharmony_ci 2421cabdff1aSopenharmony_ci dsth0 = dsth2; 2422cabdff1aSopenharmony_ci dsth1 = dsth3; 2423cabdff1aSopenharmony_ci dsth2 = dsth4; 2424cabdff1aSopenharmony_ci dsth3 = dsth5; 2425cabdff1aSopenharmony_ci dsth4 = dsth6; 2426cabdff1aSopenharmony_ci dsth5 = dsth7; 2427cabdff1aSopenharmony_ci dsth6 = dsth8; 2428cabdff1aSopenharmony_ci } 2429cabdff1aSopenharmony_ci 2430cabdff1aSopenharmony_ci src0_ptr += 8; 2431cabdff1aSopenharmony_ci src1_ptr += 8; 2432cabdff1aSopenharmony_ci dst += 8; 2433cabdff1aSopenharmony_ci 2434cabdff1aSopenharmony_ci mask4 = LD_SB(ff_hevc_mask_arr + 16); 2435cabdff1aSopenharmony_ci mask5 = mask4 + 2; 2436cabdff1aSopenharmony_ci mask6 = mask4 + 4; 2437cabdff1aSopenharmony_ci mask7 = mask4 + 6; 2438cabdff1aSopenharmony_ci 2439cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 2440cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 2441cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2442cabdff1aSopenharmony_ci 2443cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 2444cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 2445cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 2446cabdff1aSopenharmony_ci vec11); 2447cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14, 2448cabdff1aSopenharmony_ci vec15); 2449cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2450cabdff1aSopenharmony_ci filt3); 2451cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2452cabdff1aSopenharmony_ci filt3); 2453cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, 
filt0, filt1, filt2, 2454cabdff1aSopenharmony_ci filt3); 2455cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 2456cabdff1aSopenharmony_ci filt3); 2457cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 2458cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 2459cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 2460cabdff1aSopenharmony_ci 2461cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 2462cabdff1aSopenharmony_ci 2463cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2464cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 2465cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2466cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 2467cabdff1aSopenharmony_ci 2468cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 2469cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 2470cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 2471cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 2472cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 2473cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 2474cabdff1aSopenharmony_ci 2475cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 2476cabdff1aSopenharmony_ci vec3); 2477cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 2478cabdff1aSopenharmony_ci vec7); 2479cabdff1aSopenharmony_ci dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2480cabdff1aSopenharmony_ci filt3); 2481cabdff1aSopenharmony_ci dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2482cabdff1aSopenharmony_ci filt3); 2483cabdff1aSopenharmony_ci 2484cabdff1aSopenharmony_ci dst76 = __msa_ilvr_h(dst97, dst66); 2485cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87, dst109); 2486cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) 
dst97, 1); 2487cabdff1aSopenharmony_ci dst98 = __msa_ilvr_h(dst66, dst108); 2488cabdff1aSopenharmony_ci 2489cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1, 2490cabdff1aSopenharmony_ci filt_h2, filt_h3); 2491cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1, 2492cabdff1aSopenharmony_ci filt_h2, filt_h3); 2493cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1, 2494cabdff1aSopenharmony_ci filt_h2, filt_h3); 2495cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1, 2496cabdff1aSopenharmony_ci filt_h2, filt_h3); 2497cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 2498cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 2499cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 2500cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 2501cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 2502cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 2503cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 2504cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 2505cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 2506cabdff1aSopenharmony_ci CLIP_SW4_0_255(dst0, dst1, dst2, dst3); 2507cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 2508cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2509cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2510cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2511cabdff1aSopenharmony_ci 2512cabdff1aSopenharmony_ci dst10 = dst54; 2513cabdff1aSopenharmony_ci dst32 = dst76; 2514cabdff1aSopenharmony_ci dst54 = dst98; 2515cabdff1aSopenharmony_ci dst21 = dst65; 2516cabdff1aSopenharmony_ci dst43 = dst87; 2517cabdff1aSopenharmony_ci dst65 = dst109; 
2518cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 2519cabdff1aSopenharmony_ci } 2520cabdff1aSopenharmony_ci} 2521cabdff1aSopenharmony_ci 2522cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, 2523cabdff1aSopenharmony_ci int32_t src_stride, 2524cabdff1aSopenharmony_ci int16_t *src1_ptr, 2525cabdff1aSopenharmony_ci int32_t src2_stride, 2526cabdff1aSopenharmony_ci uint8_t *dst, 2527cabdff1aSopenharmony_ci int32_t dst_stride, 2528cabdff1aSopenharmony_ci const int8_t *filter_x, 2529cabdff1aSopenharmony_ci const int8_t *filter_y, 2530cabdff1aSopenharmony_ci int32_t height, 2531cabdff1aSopenharmony_ci int32_t weight0, 2532cabdff1aSopenharmony_ci int32_t weight1, 2533cabdff1aSopenharmony_ci int32_t offset0, 2534cabdff1aSopenharmony_ci int32_t offset1, 2535cabdff1aSopenharmony_ci int32_t rnd_val) 2536cabdff1aSopenharmony_ci{ 2537cabdff1aSopenharmony_ci hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2538cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2539cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2540cabdff1aSopenharmony_ci height, weight0, weight1, offset0, 2541cabdff1aSopenharmony_ci offset1, rnd_val, 2); 2542cabdff1aSopenharmony_ci} 2543cabdff1aSopenharmony_ci 2544cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, 2545cabdff1aSopenharmony_ci int32_t src_stride, 2546cabdff1aSopenharmony_ci int16_t *src1_ptr, 2547cabdff1aSopenharmony_ci int32_t src2_stride, 2548cabdff1aSopenharmony_ci uint8_t *dst, 2549cabdff1aSopenharmony_ci int32_t dst_stride, 2550cabdff1aSopenharmony_ci const int8_t *filter_x, 2551cabdff1aSopenharmony_ci const int8_t *filter_y, 2552cabdff1aSopenharmony_ci int32_t height, 2553cabdff1aSopenharmony_ci int32_t weight0, 2554cabdff1aSopenharmony_ci int32_t weight1, 2555cabdff1aSopenharmony_ci int32_t offset0, 2556cabdff1aSopenharmony_ci int32_t offset1, 2557cabdff1aSopenharmony_ci int32_t rnd_val) 2558cabdff1aSopenharmony_ci{ 
2559cabdff1aSopenharmony_ci hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2560cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2561cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2562cabdff1aSopenharmony_ci height, weight0, weight1, offset0, 2563cabdff1aSopenharmony_ci offset1, rnd_val, 3); 2564cabdff1aSopenharmony_ci} 2565cabdff1aSopenharmony_ci 2566cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, 2567cabdff1aSopenharmony_ci int32_t src_stride, 2568cabdff1aSopenharmony_ci int16_t *src1_ptr, 2569cabdff1aSopenharmony_ci int32_t src2_stride, 2570cabdff1aSopenharmony_ci uint8_t *dst, 2571cabdff1aSopenharmony_ci int32_t dst_stride, 2572cabdff1aSopenharmony_ci const int8_t *filter_x, 2573cabdff1aSopenharmony_ci const int8_t *filter_y, 2574cabdff1aSopenharmony_ci int32_t height, 2575cabdff1aSopenharmony_ci int32_t weight0, 2576cabdff1aSopenharmony_ci int32_t weight1, 2577cabdff1aSopenharmony_ci int32_t offset0, 2578cabdff1aSopenharmony_ci int32_t offset1, 2579cabdff1aSopenharmony_ci int32_t rnd_val) 2580cabdff1aSopenharmony_ci{ 2581cabdff1aSopenharmony_ci hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2582cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2583cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2584cabdff1aSopenharmony_ci height, weight0, weight1, offset0, 2585cabdff1aSopenharmony_ci offset1, rnd_val, 4); 2586cabdff1aSopenharmony_ci} 2587cabdff1aSopenharmony_ci 2588cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, 2589cabdff1aSopenharmony_ci int32_t src_stride, 2590cabdff1aSopenharmony_ci int16_t *src1_ptr, 2591cabdff1aSopenharmony_ci int32_t src2_stride, 2592cabdff1aSopenharmony_ci uint8_t *dst, 2593cabdff1aSopenharmony_ci int32_t dst_stride, 2594cabdff1aSopenharmony_ci const int8_t *filter_x, 2595cabdff1aSopenharmony_ci const int8_t *filter_y, 2596cabdff1aSopenharmony_ci int32_t height, 2597cabdff1aSopenharmony_ci int32_t weight0, 2598cabdff1aSopenharmony_ci 
int32_t weight1, 2599cabdff1aSopenharmony_ci int32_t offset0, 2600cabdff1aSopenharmony_ci int32_t offset1, 2601cabdff1aSopenharmony_ci int32_t rnd_val) 2602cabdff1aSopenharmony_ci{ 2603cabdff1aSopenharmony_ci hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2604cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2605cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2606cabdff1aSopenharmony_ci height, weight0, weight1, offset0, 2607cabdff1aSopenharmony_ci offset1, rnd_val, 6); 2608cabdff1aSopenharmony_ci} 2609cabdff1aSopenharmony_ci 2610cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, 2611cabdff1aSopenharmony_ci int32_t src_stride, 2612cabdff1aSopenharmony_ci int16_t *src1_ptr, 2613cabdff1aSopenharmony_ci int32_t src2_stride, 2614cabdff1aSopenharmony_ci uint8_t *dst, 2615cabdff1aSopenharmony_ci int32_t dst_stride, 2616cabdff1aSopenharmony_ci const int8_t *filter_x, 2617cabdff1aSopenharmony_ci const int8_t *filter_y, 2618cabdff1aSopenharmony_ci int32_t height, 2619cabdff1aSopenharmony_ci int32_t weight0, 2620cabdff1aSopenharmony_ci int32_t weight1, 2621cabdff1aSopenharmony_ci int32_t offset0, 2622cabdff1aSopenharmony_ci int32_t offset1, 2623cabdff1aSopenharmony_ci int32_t rnd_val) 2624cabdff1aSopenharmony_ci{ 2625cabdff1aSopenharmony_ci hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, 2626cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2627cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2628cabdff1aSopenharmony_ci height, weight0, weight1, offset0, 2629cabdff1aSopenharmony_ci offset1, rnd_val, 8); 2630cabdff1aSopenharmony_ci} 2631cabdff1aSopenharmony_ci 2632cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, 2633cabdff1aSopenharmony_ci int32_t src_stride, 2634cabdff1aSopenharmony_ci int16_t *src1_ptr, 2635cabdff1aSopenharmony_ci int32_t src2_stride, 2636cabdff1aSopenharmony_ci uint8_t *dst, 2637cabdff1aSopenharmony_ci int32_t dst_stride, 2638cabdff1aSopenharmony_ci const int8_t 
*filter, 2639cabdff1aSopenharmony_ci int32_t weight0, 2640cabdff1aSopenharmony_ci int32_t weight1, 2641cabdff1aSopenharmony_ci int32_t offset0, 2642cabdff1aSopenharmony_ci int32_t offset1, 2643cabdff1aSopenharmony_ci int32_t rnd_val) 2644cabdff1aSopenharmony_ci{ 2645cabdff1aSopenharmony_ci int32_t offset, weight, constant; 2646cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2647cabdff1aSopenharmony_ci v16i8 src0, src1; 2648cabdff1aSopenharmony_ci v8i16 in0, in1; 2649cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2650cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1; 2651cabdff1aSopenharmony_ci v8i16 dst0; 2652cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l; 2653cabdff1aSopenharmony_ci v8i16 out0, filter_vec; 2654cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 2655cabdff1aSopenharmony_ci 2656cabdff1aSopenharmony_ci src0_ptr -= 1; 2657cabdff1aSopenharmony_ci 2658cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2659cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2660cabdff1aSopenharmony_ci 2661cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2662cabdff1aSopenharmony_ci 2663cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 2664cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 2665cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 2666cabdff1aSopenharmony_ci constant = 128 * weight1; 2667cabdff1aSopenharmony_ci constant <<= 6; 2668cabdff1aSopenharmony_ci offset += constant; 2669cabdff1aSopenharmony_ci 2670cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 2671cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2672cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 2673cabdff1aSopenharmony_ci 2674cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src0, src1); 2675cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 2676cabdff1aSopenharmony_ci in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); 2677cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 
2678cabdff1aSopenharmony_ci 2679cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2680cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2681cabdff1aSopenharmony_ci 2682cabdff1aSopenharmony_ci ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l); 2683cabdff1aSopenharmony_ci dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); 2684cabdff1aSopenharmony_ci dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); 2685cabdff1aSopenharmony_ci SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 2686cabdff1aSopenharmony_ci out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 2687cabdff1aSopenharmony_ci CLIP_SH_0_255(out0); 2688cabdff1aSopenharmony_ci out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0); 2689cabdff1aSopenharmony_ci ST_W2(out0, 0, 1, dst, dst_stride); 2690cabdff1aSopenharmony_ci} 2691cabdff1aSopenharmony_ci 2692cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, 2693cabdff1aSopenharmony_ci int32_t src_stride, 2694cabdff1aSopenharmony_ci int16_t *src1_ptr, 2695cabdff1aSopenharmony_ci int32_t src2_stride, 2696cabdff1aSopenharmony_ci uint8_t *dst, 2697cabdff1aSopenharmony_ci int32_t dst_stride, 2698cabdff1aSopenharmony_ci const int8_t *filter, 2699cabdff1aSopenharmony_ci int32_t weight0, 2700cabdff1aSopenharmony_ci int32_t weight1, 2701cabdff1aSopenharmony_ci int32_t offset0, 2702cabdff1aSopenharmony_ci int32_t offset1, 2703cabdff1aSopenharmony_ci int32_t rnd_val) 2704cabdff1aSopenharmony_ci{ 2705cabdff1aSopenharmony_ci int32_t offset, weight, constant; 2706cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2707cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2708cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2709cabdff1aSopenharmony_ci v16i8 mask1; 2710cabdff1aSopenharmony_ci v8i16 dst0, dst1; 2711cabdff1aSopenharmony_ci v16i8 vec0, vec1; 2712cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 2713cabdff1aSopenharmony_ci v8i16 filter_vec; 
2714cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 2715cabdff1aSopenharmony_ci 2716cabdff1aSopenharmony_ci src0_ptr -= 1; 2717cabdff1aSopenharmony_ci 2718cabdff1aSopenharmony_ci /* rearranging filter */ 2719cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2720cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2721cabdff1aSopenharmony_ci 2722cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2723cabdff1aSopenharmony_ci 2724cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 2725cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 2726cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 2727cabdff1aSopenharmony_ci constant = 128 * weight1; 2728cabdff1aSopenharmony_ci constant <<= 6; 2729cabdff1aSopenharmony_ci offset += constant; 2730cabdff1aSopenharmony_ci 2731cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 2732cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2733cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 2734cabdff1aSopenharmony_ci 2735cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2736cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2737cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2738cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 2739cabdff1aSopenharmony_ci 2740cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2741cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2742cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 2743cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2744cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, 2745cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 2746cabdff1aSopenharmony_ci dst0, dst1); 2747cabdff1aSopenharmony_ci 2748cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 
2749cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); 2750cabdff1aSopenharmony_ci} 2751cabdff1aSopenharmony_ci 2752cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, 2753cabdff1aSopenharmony_ci int32_t src_stride, 2754cabdff1aSopenharmony_ci int16_t *src1_ptr, 2755cabdff1aSopenharmony_ci int32_t src2_stride, 2756cabdff1aSopenharmony_ci uint8_t *dst, 2757cabdff1aSopenharmony_ci int32_t dst_stride, 2758cabdff1aSopenharmony_ci const int8_t *filter, 2759cabdff1aSopenharmony_ci int32_t height, 2760cabdff1aSopenharmony_ci int32_t weight0, 2761cabdff1aSopenharmony_ci int32_t weight1, 2762cabdff1aSopenharmony_ci int32_t offset0, 2763cabdff1aSopenharmony_ci int32_t offset1, 2764cabdff1aSopenharmony_ci int32_t rnd_val) 2765cabdff1aSopenharmony_ci{ 2766cabdff1aSopenharmony_ci uint32_t loop_cnt; 2767cabdff1aSopenharmony_ci int32_t weight, offset, constant; 2768cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2769cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2770cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2771cabdff1aSopenharmony_ci v16i8 mask1; 2772cabdff1aSopenharmony_ci v16i8 vec0, vec1; 2773cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2774cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 2775cabdff1aSopenharmony_ci v8i16 filter_vec; 2776cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 2777cabdff1aSopenharmony_ci 2778cabdff1aSopenharmony_ci src0_ptr -= 1; 2779cabdff1aSopenharmony_ci 2780cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2781cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2782cabdff1aSopenharmony_ci 2783cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 2784cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 2785cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 2786cabdff1aSopenharmony_ci constant = 128 * weight1; 2787cabdff1aSopenharmony_ci constant <<= 6; 
2788cabdff1aSopenharmony_ci offset += constant; 2789cabdff1aSopenharmony_ci 2790cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 2791cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2792cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 2793cabdff1aSopenharmony_ci 2794cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2795cabdff1aSopenharmony_ci 2796cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 2797cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, 2798cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6, src7); 2799cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 2800cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2801cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2802cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7); 2803cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2804cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 2805cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 2806cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2807cabdff1aSopenharmony_ci 2808cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2809cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2810cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 2811cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2812cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); 2813cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2814cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); 2815cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2816cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 2817cabdff1aSopenharmony_ci in0, in1, in2, in3, 2818cabdff1aSopenharmony_ci 
weight_vec, rnd_vec, offset_vec, 2819cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2820cabdff1aSopenharmony_ci 2821cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2822cabdff1aSopenharmony_ci ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 2823cabdff1aSopenharmony_ci dst += (8 * dst_stride); 2824cabdff1aSopenharmony_ci } 2825cabdff1aSopenharmony_ci} 2826cabdff1aSopenharmony_ci 2827cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr, 2828cabdff1aSopenharmony_ci int32_t src_stride, 2829cabdff1aSopenharmony_ci int16_t *src1_ptr, 2830cabdff1aSopenharmony_ci int32_t src2_stride, 2831cabdff1aSopenharmony_ci uint8_t *dst, 2832cabdff1aSopenharmony_ci int32_t dst_stride, 2833cabdff1aSopenharmony_ci const int8_t *filter, 2834cabdff1aSopenharmony_ci int32_t height, 2835cabdff1aSopenharmony_ci int32_t weight0, 2836cabdff1aSopenharmony_ci int32_t weight1, 2837cabdff1aSopenharmony_ci int32_t offset0, 2838cabdff1aSopenharmony_ci int32_t offset1, 2839cabdff1aSopenharmony_ci int32_t rnd_val) 2840cabdff1aSopenharmony_ci{ 2841cabdff1aSopenharmony_ci if (2 == height) { 2842cabdff1aSopenharmony_ci hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2843cabdff1aSopenharmony_ci dst, dst_stride, filter, 2844cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 2845cabdff1aSopenharmony_ci } else if (4 == height) { 2846cabdff1aSopenharmony_ci hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2847cabdff1aSopenharmony_ci dst, dst_stride, filter, 2848cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 2849cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 2850cabdff1aSopenharmony_ci hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride, 2851cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2852cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 2853cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 2854cabdff1aSopenharmony_ci rnd_val); 
2855cabdff1aSopenharmony_ci } 2856cabdff1aSopenharmony_ci} 2857cabdff1aSopenharmony_ci 2858cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr, 2859cabdff1aSopenharmony_ci int32_t src_stride, 2860cabdff1aSopenharmony_ci int16_t *src1_ptr, 2861cabdff1aSopenharmony_ci int32_t src2_stride, 2862cabdff1aSopenharmony_ci uint8_t *dst, 2863cabdff1aSopenharmony_ci int32_t dst_stride, 2864cabdff1aSopenharmony_ci const int8_t *filter, 2865cabdff1aSopenharmony_ci int32_t height, 2866cabdff1aSopenharmony_ci int32_t weight0, 2867cabdff1aSopenharmony_ci int32_t weight1, 2868cabdff1aSopenharmony_ci int32_t offset0, 2869cabdff1aSopenharmony_ci int32_t offset1, 2870cabdff1aSopenharmony_ci int32_t rnd_val) 2871cabdff1aSopenharmony_ci{ 2872cabdff1aSopenharmony_ci uint32_t loop_cnt; 2873cabdff1aSopenharmony_ci int32_t offset, weight, constant; 2874cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2875cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2876cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2877cabdff1aSopenharmony_ci v16i8 mask1; 2878cabdff1aSopenharmony_ci v16i8 vec0, vec1; 2879cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 2880cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2881cabdff1aSopenharmony_ci v8i16 filter_vec; 2882cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 2883cabdff1aSopenharmony_ci 2884cabdff1aSopenharmony_ci src0_ptr -= 1; 2885cabdff1aSopenharmony_ci 2886cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2887cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2888cabdff1aSopenharmony_ci 2889cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 2890cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 2891cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 2892cabdff1aSopenharmony_ci constant = 128 * weight1; 2893cabdff1aSopenharmony_ci constant <<= 6; 2894cabdff1aSopenharmony_ci offset += constant; 2895cabdff1aSopenharmony_ci 
2896cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 2897cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2898cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 2899cabdff1aSopenharmony_ci 2900cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2901cabdff1aSopenharmony_ci 2902cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 2903cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2904cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2905cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2906cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2907cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2908cabdff1aSopenharmony_ci 2909cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2910cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2911cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2912cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2913cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2914cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2915cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2916cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2917cabdff1aSopenharmony_ci 2918cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 2919cabdff1aSopenharmony_ci in0, in1, in2, in3, 2920cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 2921cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2922cabdff1aSopenharmony_ci 2923cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2924cabdff1aSopenharmony_ci ST_W2(dst0, 0, 2, dst, dst_stride); 2925cabdff1aSopenharmony_ci ST_H2(dst0, 2, 6, dst + 4, dst_stride); 2926cabdff1aSopenharmony_ci ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride); 
2927cabdff1aSopenharmony_ci ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2928cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2929cabdff1aSopenharmony_ci } 2930cabdff1aSopenharmony_ci} 2931cabdff1aSopenharmony_ci 2932cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr, 2933cabdff1aSopenharmony_ci int32_t src_stride, 2934cabdff1aSopenharmony_ci int16_t *src1_ptr, 2935cabdff1aSopenharmony_ci int32_t src2_stride, 2936cabdff1aSopenharmony_ci uint8_t *dst, 2937cabdff1aSopenharmony_ci int32_t dst_stride, 2938cabdff1aSopenharmony_ci const int8_t *filter, 2939cabdff1aSopenharmony_ci int32_t weight0, 2940cabdff1aSopenharmony_ci int32_t weight1, 2941cabdff1aSopenharmony_ci int32_t offset0, 2942cabdff1aSopenharmony_ci int32_t offset1, 2943cabdff1aSopenharmony_ci int32_t rnd_val) 2944cabdff1aSopenharmony_ci{ 2945cabdff1aSopenharmony_ci int32_t offset, weight, constant; 2946cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2947cabdff1aSopenharmony_ci v16i8 src0, src1; 2948cabdff1aSopenharmony_ci v8i16 in0, in1; 2949cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2950cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1; 2951cabdff1aSopenharmony_ci v8i16 dst0, dst1; 2952cabdff1aSopenharmony_ci v8i16 filter_vec; 2953cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 2954cabdff1aSopenharmony_ci 2955cabdff1aSopenharmony_ci src0_ptr -= 1; 2956cabdff1aSopenharmony_ci 2957cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2958cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2959cabdff1aSopenharmony_ci 2960cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 2961cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 2962cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 2963cabdff1aSopenharmony_ci constant = 128 * weight1; 2964cabdff1aSopenharmony_ci constant <<= 6; 2965cabdff1aSopenharmony_ci offset += constant; 2966cabdff1aSopenharmony_ci 2967cabdff1aSopenharmony_ci offset_vec = 
__msa_fill_w(offset); 2968cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2969cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 2970cabdff1aSopenharmony_ci 2971cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2972cabdff1aSopenharmony_ci 2973cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src0, src1); 2974cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 2975cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 2976cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2977cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2978cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2979cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2980cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, 2981cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 2982cabdff1aSopenharmony_ci dst0, dst1); 2983cabdff1aSopenharmony_ci 2984cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 2985cabdff1aSopenharmony_ci ST_D2(dst0, 0, 1, dst, dst_stride); 2986cabdff1aSopenharmony_ci} 2987cabdff1aSopenharmony_ci 2988cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, 2989cabdff1aSopenharmony_ci int32_t src_stride, 2990cabdff1aSopenharmony_ci int16_t *src1_ptr, 2991cabdff1aSopenharmony_ci int32_t src2_stride, 2992cabdff1aSopenharmony_ci uint8_t *dst, 2993cabdff1aSopenharmony_ci int32_t dst_stride, 2994cabdff1aSopenharmony_ci const int8_t *filter, 2995cabdff1aSopenharmony_ci int32_t weight0, 2996cabdff1aSopenharmony_ci int32_t weight1, 2997cabdff1aSopenharmony_ci int32_t offset0, 2998cabdff1aSopenharmony_ci int32_t offset1, 2999cabdff1aSopenharmony_ci int32_t rnd_val) 3000cabdff1aSopenharmony_ci{ 3001cabdff1aSopenharmony_ci int32_t weight, offset, constant; 3002cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3003cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 
3004cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 3005cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3006cabdff1aSopenharmony_ci v16i8 mask1; 3007cabdff1aSopenharmony_ci v16i8 vec0, vec1; 3008cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 3009cabdff1aSopenharmony_ci v8i16 filter_vec; 3010cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3011cabdff1aSopenharmony_ci 3012cabdff1aSopenharmony_ci src0_ptr -= 1; 3013cabdff1aSopenharmony_ci 3014cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3015cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3016cabdff1aSopenharmony_ci 3017cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3018cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3019cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3020cabdff1aSopenharmony_ci constant = 128 * weight1; 3021cabdff1aSopenharmony_ci constant <<= 6; 3022cabdff1aSopenharmony_ci offset += constant; 3023cabdff1aSopenharmony_ci 3024cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3025cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3026cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3027cabdff1aSopenharmony_ci 3028cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3029cabdff1aSopenharmony_ci 3030cabdff1aSopenharmony_ci LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); 3031cabdff1aSopenharmony_ci 3032cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3033cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 3034cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in4, in5); 3035cabdff1aSopenharmony_ci XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); 3036cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3037cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3038cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 
3039cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3040cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3041cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3042cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3043cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3044cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 3045cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3046cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 3047cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3048cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3049cabdff1aSopenharmony_ci in0, in1, in2, in3, 3050cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3051cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3052cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, 3053cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3054cabdff1aSopenharmony_ci dst4, dst5); 3055cabdff1aSopenharmony_ci 3056cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3057cabdff1aSopenharmony_ci dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 3058cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 3059cabdff1aSopenharmony_ci ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride); 3060cabdff1aSopenharmony_ci} 3061cabdff1aSopenharmony_ci 3062cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, 3063cabdff1aSopenharmony_ci int32_t src_stride, 3064cabdff1aSopenharmony_ci int16_t *src1_ptr, 3065cabdff1aSopenharmony_ci int32_t src2_stride, 3066cabdff1aSopenharmony_ci uint8_t *dst, 3067cabdff1aSopenharmony_ci int32_t dst_stride, 3068cabdff1aSopenharmony_ci const int8_t *filter, 3069cabdff1aSopenharmony_ci int32_t height, 
3070cabdff1aSopenharmony_ci int32_t weight0, 3071cabdff1aSopenharmony_ci int32_t weight1, 3072cabdff1aSopenharmony_ci int32_t offset0, 3073cabdff1aSopenharmony_ci int32_t offset1, 3074cabdff1aSopenharmony_ci int32_t rnd_val) 3075cabdff1aSopenharmony_ci{ 3076cabdff1aSopenharmony_ci uint32_t loop_cnt; 3077cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3078cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3079cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 3080cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 3081cabdff1aSopenharmony_ci v16i8 mask1; 3082cabdff1aSopenharmony_ci v16i8 vec0, vec1; 3083cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 3084cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 3085cabdff1aSopenharmony_ci v8i16 filter_vec; 3086cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3087cabdff1aSopenharmony_ci 3088cabdff1aSopenharmony_ci src0_ptr -= 1; 3089cabdff1aSopenharmony_ci 3090cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3091cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3092cabdff1aSopenharmony_ci 3093cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3094cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3095cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3096cabdff1aSopenharmony_ci constant = 128 * weight1; 3097cabdff1aSopenharmony_ci constant <<= 6; 3098cabdff1aSopenharmony_ci offset += constant; 3099cabdff1aSopenharmony_ci 3100cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3101cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3102cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3103cabdff1aSopenharmony_ci 3104cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3105cabdff1aSopenharmony_ci 3106cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3107cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 3108cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 
3109cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3110cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 3111cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 3112cabdff1aSopenharmony_ci 3113cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3114cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3115cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3116cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3117cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3118cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3119cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3120cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3121cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3122cabdff1aSopenharmony_ci in0, in1, in2, in3, 3123cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3124cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3125cabdff1aSopenharmony_ci 3126cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3127cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 3128cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3129cabdff1aSopenharmony_ci } 3130cabdff1aSopenharmony_ci} 3131cabdff1aSopenharmony_ci 3132cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr, 3133cabdff1aSopenharmony_ci int32_t src_stride, 3134cabdff1aSopenharmony_ci int16_t *src1_ptr, 3135cabdff1aSopenharmony_ci int32_t src2_stride, 3136cabdff1aSopenharmony_ci uint8_t *dst, 3137cabdff1aSopenharmony_ci int32_t dst_stride, 3138cabdff1aSopenharmony_ci const int8_t *filter, 3139cabdff1aSopenharmony_ci int32_t height, 3140cabdff1aSopenharmony_ci int32_t weight0, 3141cabdff1aSopenharmony_ci int32_t weight1, 3142cabdff1aSopenharmony_ci 
int32_t offset0, 3143cabdff1aSopenharmony_ci int32_t offset1, 3144cabdff1aSopenharmony_ci int32_t rnd_val) 3145cabdff1aSopenharmony_ci{ 3146cabdff1aSopenharmony_ci if (2 == height) { 3147cabdff1aSopenharmony_ci hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3148cabdff1aSopenharmony_ci dst, dst_stride, filter, 3149cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 3150cabdff1aSopenharmony_ci } else if (6 == height) { 3151cabdff1aSopenharmony_ci hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3152cabdff1aSopenharmony_ci dst, dst_stride, filter, 3153cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 3154cabdff1aSopenharmony_ci } else if (0 == (height % 4)) { 3155cabdff1aSopenharmony_ci hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride, 3156cabdff1aSopenharmony_ci src1_ptr, src2_stride, 3157cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 3158cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 3159cabdff1aSopenharmony_ci rnd_val); 3160cabdff1aSopenharmony_ci } 3161cabdff1aSopenharmony_ci} 3162cabdff1aSopenharmony_ci 3163cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr, 3164cabdff1aSopenharmony_ci int32_t src_stride, 3165cabdff1aSopenharmony_ci int16_t *src1_ptr, 3166cabdff1aSopenharmony_ci int32_t src2_stride, 3167cabdff1aSopenharmony_ci uint8_t *dst, 3168cabdff1aSopenharmony_ci int32_t dst_stride, 3169cabdff1aSopenharmony_ci const int8_t *filter, 3170cabdff1aSopenharmony_ci int32_t height, 3171cabdff1aSopenharmony_ci int32_t weight0, 3172cabdff1aSopenharmony_ci int32_t weight1, 3173cabdff1aSopenharmony_ci int32_t offset0, 3174cabdff1aSopenharmony_ci int32_t offset1, 3175cabdff1aSopenharmony_ci int32_t rnd_val) 3176cabdff1aSopenharmony_ci{ 3177cabdff1aSopenharmony_ci uint32_t loop_cnt; 3178cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3179cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3180cabdff1aSopenharmony_ci v16i8 src0, src1, 
src2, src3; 3181cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3182cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3183cabdff1aSopenharmony_ci v16i8 mask2 = { 3184cabdff1aSopenharmony_ci 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 3185cabdff1aSopenharmony_ci }; 3186cabdff1aSopenharmony_ci v16i8 mask1, mask3; 3187cabdff1aSopenharmony_ci v16i8 vec0, vec1; 3188cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 3189cabdff1aSopenharmony_ci v8i16 filter_vec; 3190cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3191cabdff1aSopenharmony_ci 3192cabdff1aSopenharmony_ci src0_ptr -= 1; 3193cabdff1aSopenharmony_ci 3194cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3195cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3196cabdff1aSopenharmony_ci 3197cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3198cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3199cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3200cabdff1aSopenharmony_ci constant = 128 * weight1; 3201cabdff1aSopenharmony_ci constant <<= 6; 3202cabdff1aSopenharmony_ci offset += constant; 3203cabdff1aSopenharmony_ci 3204cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3205cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3206cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3207cabdff1aSopenharmony_ci 3208cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3209cabdff1aSopenharmony_ci mask3 = mask2 + 2; 3210cabdff1aSopenharmony_ci 3211cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 3212cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 3213cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 3214cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3215cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 3216cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 
3217cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 3218cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 3219cabdff1aSopenharmony_ci 3220cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3221cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3222cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3223cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3224cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3225cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3226cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3227cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3228cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 3229cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3230cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); 3231cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3232cabdff1aSopenharmony_ci 3233cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3234cabdff1aSopenharmony_ci in0, in1, in2, in3, 3235cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3236cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3237cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, 3238cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3239cabdff1aSopenharmony_ci dst4, dst5); 3240cabdff1aSopenharmony_ci 3241cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3242cabdff1aSopenharmony_ci dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 3243cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 3244cabdff1aSopenharmony_ci ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride); 3245cabdff1aSopenharmony_ci dst += (4 * 
dst_stride); 3246cabdff1aSopenharmony_ci } 3247cabdff1aSopenharmony_ci} 3248cabdff1aSopenharmony_ci 3249cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr, 3250cabdff1aSopenharmony_ci int32_t src_stride, 3251cabdff1aSopenharmony_ci int16_t *src1_ptr, 3252cabdff1aSopenharmony_ci int32_t src2_stride, 3253cabdff1aSopenharmony_ci uint8_t *dst, 3254cabdff1aSopenharmony_ci int32_t dst_stride, 3255cabdff1aSopenharmony_ci const int8_t *filter, 3256cabdff1aSopenharmony_ci int32_t height, 3257cabdff1aSopenharmony_ci int32_t weight0, 3258cabdff1aSopenharmony_ci int32_t weight1, 3259cabdff1aSopenharmony_ci int32_t offset0, 3260cabdff1aSopenharmony_ci int32_t offset1, 3261cabdff1aSopenharmony_ci int32_t rnd_val) 3262cabdff1aSopenharmony_ci{ 3263cabdff1aSopenharmony_ci uint32_t loop_cnt; 3264cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3265cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 3266cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3267cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3268cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3269cabdff1aSopenharmony_ci v16i8 mask1; 3270cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3271cabdff1aSopenharmony_ci v16i8 vec0, vec1; 3272cabdff1aSopenharmony_ci v8i16 filter_vec; 3273cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3274cabdff1aSopenharmony_ci 3275cabdff1aSopenharmony_ci src0_ptr -= 1; 3276cabdff1aSopenharmony_ci 3277cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3278cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3279cabdff1aSopenharmony_ci 3280cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3281cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3282cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3283cabdff1aSopenharmony_ci constant = 128 * weight1; 3284cabdff1aSopenharmony_ci constant <<= 6; 
3285cabdff1aSopenharmony_ci offset += constant; 3286cabdff1aSopenharmony_ci 3287cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3288cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3289cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3290cabdff1aSopenharmony_ci 3291cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3292cabdff1aSopenharmony_ci 3293cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3294cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6); 3295cabdff1aSopenharmony_ci LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7); 3296cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 3297cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6); 3298cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7); 3299cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 3300cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3301cabdff1aSopenharmony_ci 3302cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3303cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3304cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3305cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3306cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3307cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3308cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3309cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3310cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 3311cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3312cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 3313cabdff1aSopenharmony_ci dst5 = 
HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3314cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); 3315cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3316cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 3317cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3318cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3319cabdff1aSopenharmony_ci in0, in1, in2, in3, 3320cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3321cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3322cabdff1aSopenharmony_ci 3323cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3324cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, dst_stride); 3325cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3326cabdff1aSopenharmony_ci 3327cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7, 3328cabdff1aSopenharmony_ci in4, in5, in6, in7, 3329cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3330cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3331cabdff1aSopenharmony_ci 3332cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3333cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, dst_stride); 3334cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3335cabdff1aSopenharmony_ci } 3336cabdff1aSopenharmony_ci} 3337cabdff1aSopenharmony_ci 3338cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr, 3339cabdff1aSopenharmony_ci int32_t src_stride, 3340cabdff1aSopenharmony_ci int16_t *src1_ptr, 3341cabdff1aSopenharmony_ci int32_t src2_stride, 3342cabdff1aSopenharmony_ci uint8_t *dst, 3343cabdff1aSopenharmony_ci int32_t dst_stride, 3344cabdff1aSopenharmony_ci const int8_t *filter, 3345cabdff1aSopenharmony_ci int32_t height, 3346cabdff1aSopenharmony_ci int32_t weight0, 3347cabdff1aSopenharmony_ci int32_t weight1, 3348cabdff1aSopenharmony_ci int32_t offset0, 3349cabdff1aSopenharmony_ci int32_t 
offset1, 3350cabdff1aSopenharmony_ci int32_t rnd_val) 3351cabdff1aSopenharmony_ci{ 3352cabdff1aSopenharmony_ci uint32_t loop_cnt; 3353cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3354cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 3355cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3356cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3357cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 3358cabdff1aSopenharmony_ci v16i8 vec0, vec1; 3359cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 3360cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 3361cabdff1aSopenharmony_ci v8i16 filter_vec; 3362cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3363cabdff1aSopenharmony_ci 3364cabdff1aSopenharmony_ci src0_ptr -= 1; 3365cabdff1aSopenharmony_ci 3366cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3367cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3368cabdff1aSopenharmony_ci 3369cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3370cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3371cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3372cabdff1aSopenharmony_ci constant = 128 * weight1; 3373cabdff1aSopenharmony_ci constant <<= 6; 3374cabdff1aSopenharmony_ci offset += constant; 3375cabdff1aSopenharmony_ci 3376cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3377cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3378cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3379cabdff1aSopenharmony_ci 3380cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3381cabdff1aSopenharmony_ci mask2 = mask0 + 8; 3382cabdff1aSopenharmony_ci mask3 = mask0 + 10; 3383cabdff1aSopenharmony_ci 3384cabdff1aSopenharmony_ci for (loop_cnt = 16; loop_cnt--;) { 3385cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src0, src2); 3386cabdff1aSopenharmony_ci LD_SB2(src0_ptr + 16, src_stride, src1, src3); 3387cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 
3388cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in2); 3389cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 8, src2_stride, in1, in3); 3390cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 3391cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 3392cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 3393cabdff1aSopenharmony_ci 3394cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3395cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3396cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 3397cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3398cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3399cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3400cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); 3401cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3402cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3403cabdff1aSopenharmony_ci in0, in1, in2, in3, 3404cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3405cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3406cabdff1aSopenharmony_ci 3407cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3408cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, dst_stride); 3409cabdff1aSopenharmony_ci 3410cabdff1aSopenharmony_ci /* 8 width */ 3411cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3412cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3413cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3414cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3415cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5, 3416cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 
3417cabdff1aSopenharmony_ci dst0, dst1); 3418cabdff1aSopenharmony_ci 3419cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 3420cabdff1aSopenharmony_ci ST_D2(dst0, 0, 1, (dst + 16), dst_stride); 3421cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3422cabdff1aSopenharmony_ci } 3423cabdff1aSopenharmony_ci} 3424cabdff1aSopenharmony_ci 3425cabdff1aSopenharmony_cistatic void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr, 3426cabdff1aSopenharmony_ci int32_t src_stride, 3427cabdff1aSopenharmony_ci int16_t *src1_ptr, 3428cabdff1aSopenharmony_ci int32_t src2_stride, 3429cabdff1aSopenharmony_ci uint8_t *dst, 3430cabdff1aSopenharmony_ci int32_t dst_stride, 3431cabdff1aSopenharmony_ci const int8_t *filter, 3432cabdff1aSopenharmony_ci int32_t height, 3433cabdff1aSopenharmony_ci int32_t weight0, 3434cabdff1aSopenharmony_ci int32_t weight1, 3435cabdff1aSopenharmony_ci int32_t offset0, 3436cabdff1aSopenharmony_ci int32_t offset1, 3437cabdff1aSopenharmony_ci int32_t rnd_val) 3438cabdff1aSopenharmony_ci{ 3439cabdff1aSopenharmony_ci uint32_t loop_cnt; 3440cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3441cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 3442cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3443cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3444cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 3445cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 3446cabdff1aSopenharmony_ci v16i8 vec0, vec1; 3447cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 3448cabdff1aSopenharmony_ci v8i16 filter_vec; 3449cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3450cabdff1aSopenharmony_ci 3451cabdff1aSopenharmony_ci src0_ptr -= 1; 3452cabdff1aSopenharmony_ci 3453cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3454cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3455cabdff1aSopenharmony_ci 3456cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3457cabdff1aSopenharmony_ci weight0 
= weight0 & 0x0000FFFF; 3458cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3459cabdff1aSopenharmony_ci constant = 128 * weight1; 3460cabdff1aSopenharmony_ci constant <<= 6; 3461cabdff1aSopenharmony_ci offset += constant; 3462cabdff1aSopenharmony_ci 3463cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3464cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3465cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3466cabdff1aSopenharmony_ci 3467cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3468cabdff1aSopenharmony_ci mask2 = mask0 + 8; 3469cabdff1aSopenharmony_ci mask3 = mask0 + 10; 3470cabdff1aSopenharmony_ci 3471cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 3472cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 3473cabdff1aSopenharmony_ci src2 = LD_SB(src0_ptr + 24); 3474cabdff1aSopenharmony_ci src0_ptr += src_stride; 3475cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 3476cabdff1aSopenharmony_ci src1_ptr += src2_stride; 3477cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3478cabdff1aSopenharmony_ci 3479cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3480cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3481cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 3482cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3483cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3484cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3485cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 3486cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3487cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, 3488cabdff1aSopenharmony_ci in0, in1, in2, in3, 3489cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3490cabdff1aSopenharmony_ci dst0, 
dst1, dst2, dst3); 3491cabdff1aSopenharmony_ci 3492cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 3493cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, 16); 3494cabdff1aSopenharmony_ci dst += dst_stride; 3495cabdff1aSopenharmony_ci } 3496cabdff1aSopenharmony_ci} 3497cabdff1aSopenharmony_ci 3498cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, 3499cabdff1aSopenharmony_ci int32_t src_stride, 3500cabdff1aSopenharmony_ci int16_t *src1_ptr, 3501cabdff1aSopenharmony_ci int32_t src2_stride, 3502cabdff1aSopenharmony_ci uint8_t *dst, 3503cabdff1aSopenharmony_ci int32_t dst_stride, 3504cabdff1aSopenharmony_ci const int8_t *filter, 3505cabdff1aSopenharmony_ci int32_t weight0, 3506cabdff1aSopenharmony_ci int32_t weight1, 3507cabdff1aSopenharmony_ci int32_t offset0, 3508cabdff1aSopenharmony_ci int32_t offset1, 3509cabdff1aSopenharmony_ci int32_t rnd_val) 3510cabdff1aSopenharmony_ci{ 3511cabdff1aSopenharmony_ci int32_t weight, offset, constant; 3512cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3513cabdff1aSopenharmony_ci v8i16 in0, in1, dst10; 3514cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; 3515cabdff1aSopenharmony_ci v4i32 dst10_r, dst10_l; 3516cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3517cabdff1aSopenharmony_ci v8i16 filter_vec, out; 3518cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3519cabdff1aSopenharmony_ci 3520cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3521cabdff1aSopenharmony_ci 3522cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3523cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3524cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3525cabdff1aSopenharmony_ci constant = 128 * weight1; 3526cabdff1aSopenharmony_ci constant <<= 6; 3527cabdff1aSopenharmony_ci offset += constant; 3528cabdff1aSopenharmony_ci 3529cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3530cabdff1aSopenharmony_ci weight_vec = 
__msa_fill_w(weight); 3531cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3532cabdff1aSopenharmony_ci 3533cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3534cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3535cabdff1aSopenharmony_ci 3536cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3537cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3538cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3539cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 3540cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3541cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3542cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3543cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 3544cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 3545cabdff1aSopenharmony_ci 3546cabdff1aSopenharmony_ci in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); 3547cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3548cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); 3549cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); 3550cabdff1aSopenharmony_ci 3551cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3552cabdff1aSopenharmony_ci 3553cabdff1aSopenharmony_ci ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l); 3554cabdff1aSopenharmony_ci dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec); 3555cabdff1aSopenharmony_ci dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec); 3556cabdff1aSopenharmony_ci SRAR_W2_SW(dst10_r, dst10_l, rnd_vec); 3557cabdff1aSopenharmony_ci out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r); 3558cabdff1aSopenharmony_ci CLIP_SH_0_255(out); 3559cabdff1aSopenharmony_ci out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out); 
3560cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 3561cabdff1aSopenharmony_ci} 3562cabdff1aSopenharmony_ci 3563cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr, 3564cabdff1aSopenharmony_ci int32_t src_stride, 3565cabdff1aSopenharmony_ci int16_t *src1_ptr, 3566cabdff1aSopenharmony_ci int32_t src2_stride, 3567cabdff1aSopenharmony_ci uint8_t *dst, 3568cabdff1aSopenharmony_ci int32_t dst_stride, 3569cabdff1aSopenharmony_ci const int8_t *filter, 3570cabdff1aSopenharmony_ci int32_t weight0, 3571cabdff1aSopenharmony_ci int32_t weight1, 3572cabdff1aSopenharmony_ci int32_t offset0, 3573cabdff1aSopenharmony_ci int32_t offset1, 3574cabdff1aSopenharmony_ci int32_t rnd_val) 3575cabdff1aSopenharmony_ci{ 3576cabdff1aSopenharmony_ci int32_t weight, offset, constant; 3577cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 3578cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 3579cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 3580cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554; 3581cabdff1aSopenharmony_ci v8i16 dst10, dst32; 3582cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3583cabdff1aSopenharmony_ci v8i16 filter_vec; 3584cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3585cabdff1aSopenharmony_ci 3586cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3587cabdff1aSopenharmony_ci 3588cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3589cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3590cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3591cabdff1aSopenharmony_ci constant = 128 * weight1; 3592cabdff1aSopenharmony_ci constant <<= 6; 3593cabdff1aSopenharmony_ci offset += constant; 3594cabdff1aSopenharmony_ci 3595cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3596cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3597cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3598cabdff1aSopenharmony_ci 
3599cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3600cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3601cabdff1aSopenharmony_ci 3602cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3603cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3604cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3605cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 3606cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3607cabdff1aSopenharmony_ci 3608cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6); 3609cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 3610cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3611cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 3612cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 3613cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3614cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 3615cabdff1aSopenharmony_ci ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554); 3616cabdff1aSopenharmony_ci XORI_B2_128_SB(src4332, src6554); 3617cabdff1aSopenharmony_ci 3618cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3619cabdff1aSopenharmony_ci dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3620cabdff1aSopenharmony_ci 3621cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1, 3622cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3623cabdff1aSopenharmony_ci dst10, dst32); 3624cabdff1aSopenharmony_ci 3625cabdff1aSopenharmony_ci dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10); 3626cabdff1aSopenharmony_ci ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride); 3627cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3628cabdff1aSopenharmony_ci} 3629cabdff1aSopenharmony_ci 3630cabdff1aSopenharmony_cistatic void 
hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, 3631cabdff1aSopenharmony_ci int32_t src_stride, 3632cabdff1aSopenharmony_ci int16_t *src1_ptr, 3633cabdff1aSopenharmony_ci int32_t src2_stride, 3634cabdff1aSopenharmony_ci uint8_t *dst, 3635cabdff1aSopenharmony_ci int32_t dst_stride, 3636cabdff1aSopenharmony_ci const int8_t *filter, 3637cabdff1aSopenharmony_ci int32_t height, 3638cabdff1aSopenharmony_ci int32_t weight0, 3639cabdff1aSopenharmony_ci int32_t weight1, 3640cabdff1aSopenharmony_ci int32_t offset0, 3641cabdff1aSopenharmony_ci int32_t offset1, 3642cabdff1aSopenharmony_ci int32_t rnd_val) 3643cabdff1aSopenharmony_ci{ 3644cabdff1aSopenharmony_ci uint32_t loop_cnt; 3645cabdff1aSopenharmony_ci int32_t weight, offset, constant; 3646cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 3647cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3648cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 3649cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 3650cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776; 3651cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76; 3652cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3653cabdff1aSopenharmony_ci v8i16 filter_vec; 3654cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3655cabdff1aSopenharmony_ci 3656cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3657cabdff1aSopenharmony_ci 3658cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3659cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3660cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3661cabdff1aSopenharmony_ci constant = 128 * weight1; 3662cabdff1aSopenharmony_ci constant <<= 6; 3663cabdff1aSopenharmony_ci offset += constant; 3664cabdff1aSopenharmony_ci 3665cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3666cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3667cabdff1aSopenharmony_ci rnd_vec 
= __msa_fill_w(rnd_val + 1); 3668cabdff1aSopenharmony_ci 3669cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3670cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3671cabdff1aSopenharmony_ci 3672cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3673cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3674cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3675cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 3676cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3677cabdff1aSopenharmony_ci 3678cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 3679cabdff1aSopenharmony_ci LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); 3680cabdff1aSopenharmony_ci src0_ptr += (6 * src_stride); 3681cabdff1aSopenharmony_ci LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); 3682cabdff1aSopenharmony_ci src1_ptr += (8 * src2_stride); 3683cabdff1aSopenharmony_ci 3684cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 3685cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 3686cabdff1aSopenharmony_ci 3687cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3688cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 3689cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3690cabdff1aSopenharmony_ci ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, 3691cabdff1aSopenharmony_ci src4332, src6554, src8776); 3692cabdff1aSopenharmony_ci XORI_B3_128_SB(src4332, src6554, src8776); 3693cabdff1aSopenharmony_ci 3694cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3695cabdff1aSopenharmony_ci dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3696cabdff1aSopenharmony_ci dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1); 3697cabdff1aSopenharmony_ci 
3698cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src9, src2); 3699cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3700cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r); 3701cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r); 3702cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3703cabdff1aSopenharmony_ci 3704cabdff1aSopenharmony_ci dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1); 3705cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76, 3706cabdff1aSopenharmony_ci in0, in1, in2, in3, 3707cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3708cabdff1aSopenharmony_ci dst10, dst32, dst54, dst76); 3709cabdff1aSopenharmony_ci 3710cabdff1aSopenharmony_ci PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32); 3711cabdff1aSopenharmony_ci ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 3712cabdff1aSopenharmony_ci dst += (8 * dst_stride); 3713cabdff1aSopenharmony_ci } 3714cabdff1aSopenharmony_ci} 3715cabdff1aSopenharmony_ci 3716cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr, 3717cabdff1aSopenharmony_ci int32_t src_stride, 3718cabdff1aSopenharmony_ci int16_t *src1_ptr, 3719cabdff1aSopenharmony_ci int32_t src2_stride, 3720cabdff1aSopenharmony_ci uint8_t *dst, 3721cabdff1aSopenharmony_ci int32_t dst_stride, 3722cabdff1aSopenharmony_ci const int8_t *filter, 3723cabdff1aSopenharmony_ci int32_t height, 3724cabdff1aSopenharmony_ci int32_t weight0, 3725cabdff1aSopenharmony_ci int32_t weight1, 3726cabdff1aSopenharmony_ci int32_t offset0, 3727cabdff1aSopenharmony_ci int32_t offset1, 3728cabdff1aSopenharmony_ci int32_t rnd_val) 3729cabdff1aSopenharmony_ci{ 3730cabdff1aSopenharmony_ci if (2 == height) { 3731cabdff1aSopenharmony_ci hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3732cabdff1aSopenharmony_ci dst, dst_stride, filter, 3733cabdff1aSopenharmony_ci weight0, weight1, 
offset0, offset1, rnd_val); 3734cabdff1aSopenharmony_ci } else if (4 == height) { 3735cabdff1aSopenharmony_ci hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3736cabdff1aSopenharmony_ci dst, dst_stride, filter, 3737cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 3738cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 3739cabdff1aSopenharmony_ci hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride, 3740cabdff1aSopenharmony_ci src1_ptr, src2_stride, 3741cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 3742cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 3743cabdff1aSopenharmony_ci rnd_val); 3744cabdff1aSopenharmony_ci } 3745cabdff1aSopenharmony_ci} 3746cabdff1aSopenharmony_ci 3747cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr, 3748cabdff1aSopenharmony_ci int32_t src_stride, 3749cabdff1aSopenharmony_ci int16_t *src1_ptr, 3750cabdff1aSopenharmony_ci int32_t src2_stride, 3751cabdff1aSopenharmony_ci uint8_t *dst, 3752cabdff1aSopenharmony_ci int32_t dst_stride, 3753cabdff1aSopenharmony_ci const int8_t *filter, 3754cabdff1aSopenharmony_ci int32_t height, 3755cabdff1aSopenharmony_ci int32_t weight0, 3756cabdff1aSopenharmony_ci int32_t weight1, 3757cabdff1aSopenharmony_ci int32_t offset0, 3758cabdff1aSopenharmony_ci int32_t offset1, 3759cabdff1aSopenharmony_ci int32_t rnd_val) 3760cabdff1aSopenharmony_ci{ 3761cabdff1aSopenharmony_ci uint32_t loop_cnt; 3762cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3763cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3764cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 3765cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3766cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 3767cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3768cabdff1aSopenharmony_ci v8i16 filter_vec; 3769cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3770cabdff1aSopenharmony_ci 3771cabdff1aSopenharmony_ci src0_ptr -= 
src_stride; 3772cabdff1aSopenharmony_ci 3773cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3774cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3775cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3776cabdff1aSopenharmony_ci constant = 128 * weight1; 3777cabdff1aSopenharmony_ci constant <<= 6; 3778cabdff1aSopenharmony_ci offset += constant; 3779cabdff1aSopenharmony_ci 3780cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3781cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3782cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3783cabdff1aSopenharmony_ci 3784cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3785cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3786cabdff1aSopenharmony_ci 3787cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3788cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3789cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3790cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3791cabdff1aSopenharmony_ci 3792cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3793cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3794cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3795cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3796cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 3797cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3798cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3799cabdff1aSopenharmony_ci 3800cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3801cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3802cabdff1aSopenharmony_ci 3803cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src1, src2); 3804cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3805cabdff1aSopenharmony_ci XORI_B2_128_SB(src1, src2); 
3806cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 3807cabdff1aSopenharmony_ci 3808cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 3809cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 3810cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 3811cabdff1aSopenharmony_ci in0, in1, in2, in3, 3812cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3813cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 3814cabdff1aSopenharmony_ci 3815cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 3816cabdff1aSopenharmony_ci ST_W2(tmp0, 0, 2, dst, dst_stride); 3817cabdff1aSopenharmony_ci ST_H2(tmp0, 2, 6, dst + 4, dst_stride); 3818cabdff1aSopenharmony_ci ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride); 3819cabdff1aSopenharmony_ci ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 3820cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3821cabdff1aSopenharmony_ci } 3822cabdff1aSopenharmony_ci} 3823cabdff1aSopenharmony_ci 3824cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr, 3825cabdff1aSopenharmony_ci int32_t src_stride, 3826cabdff1aSopenharmony_ci int16_t *src1_ptr, 3827cabdff1aSopenharmony_ci int32_t src2_stride, 3828cabdff1aSopenharmony_ci uint8_t *dst, 3829cabdff1aSopenharmony_ci int32_t dst_stride, 3830cabdff1aSopenharmony_ci const int8_t *filter, 3831cabdff1aSopenharmony_ci int32_t weight0, 3832cabdff1aSopenharmony_ci int32_t weight1, 3833cabdff1aSopenharmony_ci int32_t offset0, 3834cabdff1aSopenharmony_ci int32_t offset1, 3835cabdff1aSopenharmony_ci int32_t rnd_val) 3836cabdff1aSopenharmony_ci{ 3837cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3838cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3839cabdff1aSopenharmony_ci v8i16 in0, in1, tmp0, tmp1; 3840cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3841cabdff1aSopenharmony_ci v8i16 filt0, filt1; 
3842cabdff1aSopenharmony_ci v8i16 filter_vec; 3843cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3844cabdff1aSopenharmony_ci 3845cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3846cabdff1aSopenharmony_ci 3847cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3848cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3849cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3850cabdff1aSopenharmony_ci constant = 128 * weight1; 3851cabdff1aSopenharmony_ci constant <<= 6; 3852cabdff1aSopenharmony_ci offset += constant; 3853cabdff1aSopenharmony_ci 3854cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3855cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3856cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3857cabdff1aSopenharmony_ci 3858cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3859cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3860cabdff1aSopenharmony_ci 3861cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3862cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3863cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3864cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3865cabdff1aSopenharmony_ci 3866cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3867cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 3868cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3869cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3870cabdff1aSopenharmony_ci 3871cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3872cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3873cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1, 3874cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3875cabdff1aSopenharmony_ci tmp0, tmp1); 3876cabdff1aSopenharmony_ci 3877cabdff1aSopenharmony_ci tmp0 = (v8i16) 
__msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 3878cabdff1aSopenharmony_ci ST_D2(tmp0, 0, 1, dst, dst_stride); 3879cabdff1aSopenharmony_ci} 3880cabdff1aSopenharmony_ci 3881cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, 3882cabdff1aSopenharmony_ci int32_t src_stride, 3883cabdff1aSopenharmony_ci int16_t *src1_ptr, 3884cabdff1aSopenharmony_ci int32_t src2_stride, 3885cabdff1aSopenharmony_ci uint8_t *dst, 3886cabdff1aSopenharmony_ci int32_t dst_stride, 3887cabdff1aSopenharmony_ci const int8_t *filter, 3888cabdff1aSopenharmony_ci int32_t weight0, 3889cabdff1aSopenharmony_ci int32_t weight1, 3890cabdff1aSopenharmony_ci int32_t offset0, 3891cabdff1aSopenharmony_ci int32_t offset1, 3892cabdff1aSopenharmony_ci int32_t rnd_val) 3893cabdff1aSopenharmony_ci{ 3894cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3895cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3896cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 3897cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r; 3898cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r; 3899cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 3900cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3901cabdff1aSopenharmony_ci v8i16 filter_vec; 3902cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3903cabdff1aSopenharmony_ci 3904cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3905cabdff1aSopenharmony_ci 3906cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3907cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3908cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3909cabdff1aSopenharmony_ci constant = 128 * weight1; 3910cabdff1aSopenharmony_ci constant <<= 6; 3911cabdff1aSopenharmony_ci offset += constant; 3912cabdff1aSopenharmony_ci 3913cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3914cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3915cabdff1aSopenharmony_ci 
rnd_vec = __msa_fill_w(rnd_val + 1); 3916cabdff1aSopenharmony_ci 3917cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3918cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3919cabdff1aSopenharmony_ci 3920cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3921cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3922cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3923cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3924cabdff1aSopenharmony_ci 3925cabdff1aSopenharmony_ci LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); 3926cabdff1aSopenharmony_ci LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 3927cabdff1aSopenharmony_ci XORI_B6_128_SB(src3, src4, src5, src6, src7, src8); 3928cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3929cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 3930cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3931cabdff1aSopenharmony_ci 3932cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3933cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3934cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3935cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3936cabdff1aSopenharmony_ci tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1); 3937cabdff1aSopenharmony_ci tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 3938cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 3939cabdff1aSopenharmony_ci in0, in1, in2, in3, 3940cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3941cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 3942cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5, 3943cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 3944cabdff1aSopenharmony_ci tmp4, tmp5); 
3945cabdff1aSopenharmony_ci 3946cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 3947cabdff1aSopenharmony_ci tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 3948cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 3949cabdff1aSopenharmony_ci ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride); 3950cabdff1aSopenharmony_ci} 3951cabdff1aSopenharmony_ci 3952cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, 3953cabdff1aSopenharmony_ci int32_t src_stride, 3954cabdff1aSopenharmony_ci int16_t *src1_ptr, 3955cabdff1aSopenharmony_ci int32_t src2_stride, 3956cabdff1aSopenharmony_ci uint8_t *dst, 3957cabdff1aSopenharmony_ci int32_t dst_stride, 3958cabdff1aSopenharmony_ci const int8_t *filter, 3959cabdff1aSopenharmony_ci int32_t height, 3960cabdff1aSopenharmony_ci int32_t weight0, 3961cabdff1aSopenharmony_ci int32_t weight1, 3962cabdff1aSopenharmony_ci int32_t offset0, 3963cabdff1aSopenharmony_ci int32_t offset1, 3964cabdff1aSopenharmony_ci int32_t rnd_val) 3965cabdff1aSopenharmony_ci{ 3966cabdff1aSopenharmony_ci uint32_t loop_cnt; 3967cabdff1aSopenharmony_ci int32_t offset, weight, constant; 3968cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3969cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 3970cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3971cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 3972cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3973cabdff1aSopenharmony_ci v8i16 filter_vec; 3974cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 3975cabdff1aSopenharmony_ci 3976cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3977cabdff1aSopenharmony_ci 3978cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 3979cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 3980cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 3981cabdff1aSopenharmony_ci constant = 128 * weight1; 3982cabdff1aSopenharmony_ci constant <<= 6; 
3983cabdff1aSopenharmony_ci offset += constant; 3984cabdff1aSopenharmony_ci 3985cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 3986cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3987cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 3988cabdff1aSopenharmony_ci 3989cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3990cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3991cabdff1aSopenharmony_ci 3992cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3993cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3994cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3995cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3996cabdff1aSopenharmony_ci 3997cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3998cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3999cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4000cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 4001cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 4002cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 4003cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4004cabdff1aSopenharmony_ci 4005cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4006cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4007cabdff1aSopenharmony_ci 4008cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src1, src2); 4009cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4010cabdff1aSopenharmony_ci XORI_B2_128_SB(src1, src2); 4011cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 4012cabdff1aSopenharmony_ci 4013cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 4014cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4015cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, 
tmp3, 4016cabdff1aSopenharmony_ci in0, in1, in2, in3, 4017cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4018cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4019cabdff1aSopenharmony_ci 4020cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 4021cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 4022cabdff1aSopenharmony_ci dst += (4 * dst_stride); 4023cabdff1aSopenharmony_ci } 4024cabdff1aSopenharmony_ci} 4025cabdff1aSopenharmony_ci 4026cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr, 4027cabdff1aSopenharmony_ci int32_t src_stride, 4028cabdff1aSopenharmony_ci int16_t *src1_ptr, 4029cabdff1aSopenharmony_ci int32_t src2_stride, 4030cabdff1aSopenharmony_ci uint8_t *dst, 4031cabdff1aSopenharmony_ci int32_t dst_stride, 4032cabdff1aSopenharmony_ci const int8_t *filter, 4033cabdff1aSopenharmony_ci int32_t height, 4034cabdff1aSopenharmony_ci int32_t weight0, 4035cabdff1aSopenharmony_ci int32_t weight1, 4036cabdff1aSopenharmony_ci int32_t offset0, 4037cabdff1aSopenharmony_ci int32_t offset1, 4038cabdff1aSopenharmony_ci int32_t rnd_val) 4039cabdff1aSopenharmony_ci{ 4040cabdff1aSopenharmony_ci if (2 == height) { 4041cabdff1aSopenharmony_ci hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4042cabdff1aSopenharmony_ci dst, dst_stride, filter, 4043cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 4044cabdff1aSopenharmony_ci } else if (6 == height) { 4045cabdff1aSopenharmony_ci hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4046cabdff1aSopenharmony_ci dst, dst_stride, filter, 4047cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 4048cabdff1aSopenharmony_ci } else { 4049cabdff1aSopenharmony_ci hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride, 4050cabdff1aSopenharmony_ci src1_ptr, src2_stride, 4051cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 4052cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, 
4053cabdff1aSopenharmony_ci rnd_val); 4054cabdff1aSopenharmony_ci } 4055cabdff1aSopenharmony_ci} 4056cabdff1aSopenharmony_ci 4057cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr, 4058cabdff1aSopenharmony_ci int32_t src_stride, 4059cabdff1aSopenharmony_ci int16_t *src1_ptr, 4060cabdff1aSopenharmony_ci int32_t src2_stride, 4061cabdff1aSopenharmony_ci uint8_t *dst, 4062cabdff1aSopenharmony_ci int32_t dst_stride, 4063cabdff1aSopenharmony_ci const int8_t *filter, 4064cabdff1aSopenharmony_ci int32_t height, 4065cabdff1aSopenharmony_ci int32_t weight0, 4066cabdff1aSopenharmony_ci int32_t weight1, 4067cabdff1aSopenharmony_ci int32_t offset0, 4068cabdff1aSopenharmony_ci int32_t offset1, 4069cabdff1aSopenharmony_ci int32_t rnd_val) 4070cabdff1aSopenharmony_ci{ 4071cabdff1aSopenharmony_ci uint32_t loop_cnt; 4072cabdff1aSopenharmony_ci int32_t offset, weight, constant; 4073cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 4074cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 4075cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 4076cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4077cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 4078cabdff1aSopenharmony_ci v16i8 src2110, src4332; 4079cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4080cabdff1aSopenharmony_ci v8i16 filter_vec; 4081cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 4082cabdff1aSopenharmony_ci 4083cabdff1aSopenharmony_ci src0_ptr -= (1 * src_stride); 4084cabdff1aSopenharmony_ci 4085cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 4086cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 4087cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 4088cabdff1aSopenharmony_ci constant = 128 * weight1; 4089cabdff1aSopenharmony_ci constant <<= 6; 4090cabdff1aSopenharmony_ci offset += constant; 4091cabdff1aSopenharmony_ci 4092cabdff1aSopenharmony_ci 
offset_vec = __msa_fill_w(offset); 4093cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4094cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 4095cabdff1aSopenharmony_ci 4096cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 4097cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4098cabdff1aSopenharmony_ci 4099cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4100cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 4101cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4102cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4103cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4104cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); 4105cabdff1aSopenharmony_ci 4106cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 4107cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 4108cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4109cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 4110cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 4111cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 4112cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 4113cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 4114cabdff1aSopenharmony_ci 4115cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4116cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4117cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); 4118cabdff1aSopenharmony_ci 4119cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4120cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4121cabdff1aSopenharmony_ci tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 4122cabdff1aSopenharmony_ci 
4123cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src5, src2); 4124cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4125cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 4126cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 4127cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l); 4128cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); 4129cabdff1aSopenharmony_ci 4130cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 4131cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4132cabdff1aSopenharmony_ci tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1); 4133cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 4134cabdff1aSopenharmony_ci in0, in1, in2, in3, 4135cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4136cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4137cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5, 4138cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4139cabdff1aSopenharmony_ci tmp4, tmp5); 4140cabdff1aSopenharmony_ci 4141cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 4142cabdff1aSopenharmony_ci tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 4143cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 4144cabdff1aSopenharmony_ci ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride); 4145cabdff1aSopenharmony_ci dst += (4 * dst_stride); 4146cabdff1aSopenharmony_ci } 4147cabdff1aSopenharmony_ci} 4148cabdff1aSopenharmony_ci 4149cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr, 4150cabdff1aSopenharmony_ci int32_t src_stride, 4151cabdff1aSopenharmony_ci int16_t *src1_ptr, 4152cabdff1aSopenharmony_ci int32_t src2_stride, 4153cabdff1aSopenharmony_ci uint8_t *dst, 4154cabdff1aSopenharmony_ci int32_t dst_stride, 4155cabdff1aSopenharmony_ci const int8_t *filter, 
4156cabdff1aSopenharmony_ci int32_t height, 4157cabdff1aSopenharmony_ci int32_t weight0, 4158cabdff1aSopenharmony_ci int32_t weight1, 4159cabdff1aSopenharmony_ci int32_t offset0, 4160cabdff1aSopenharmony_ci int32_t offset1, 4161cabdff1aSopenharmony_ci int32_t rnd_val) 4162cabdff1aSopenharmony_ci{ 4163cabdff1aSopenharmony_ci uint32_t loop_cnt; 4164cabdff1aSopenharmony_ci int32_t offset, weight, constant; 4165cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 4166cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 4167cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 4168cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src21_l, src43_l; 4169cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 4170cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4171cabdff1aSopenharmony_ci v8i16 filter_vec; 4172cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 4173cabdff1aSopenharmony_ci 4174cabdff1aSopenharmony_ci src0_ptr -= src_stride; 4175cabdff1aSopenharmony_ci 4176cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 4177cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 4178cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 4179cabdff1aSopenharmony_ci constant = 128 * weight1; 4180cabdff1aSopenharmony_ci constant <<= 6; 4181cabdff1aSopenharmony_ci offset += constant; 4182cabdff1aSopenharmony_ci 4183cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 4184cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4185cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 4186cabdff1aSopenharmony_ci 4187cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 4188cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4189cabdff1aSopenharmony_ci 4190cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4191cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 4192cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4193cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, 
src1, src10_r, src21_r); 4194cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4195cabdff1aSopenharmony_ci 4196cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 4197cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 4198cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4199cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 4200cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4201cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 4202cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 4203cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4204cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4205cabdff1aSopenharmony_ci 4206cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4207cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4208cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4209cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4210cabdff1aSopenharmony_ci 4211cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 4212cabdff1aSopenharmony_ci in0, in1, in2, in3, 4213cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4214cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4215cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 4216cabdff1aSopenharmony_ci ST_SH2(tmp0, tmp1, dst, dst_stride); 4217cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4218cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src5, src2); 4219cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4220cabdff1aSopenharmony_ci 4221cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 4222cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4223cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 4224cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 
4225cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 4226cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 4227cabdff1aSopenharmony_ci 4228cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 4229cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4230cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1); 4231cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1); 4232cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, 4233cabdff1aSopenharmony_ci in0, in1, in2, in3, 4234cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4235cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4236cabdff1aSopenharmony_ci 4237cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 4238cabdff1aSopenharmony_ci ST_SH2(tmp0, tmp1, dst, dst_stride); 4239cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4240cabdff1aSopenharmony_ci } 4241cabdff1aSopenharmony_ci} 4242cabdff1aSopenharmony_ci 4243cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, 4244cabdff1aSopenharmony_ci int32_t src_stride, 4245cabdff1aSopenharmony_ci int16_t *src1_ptr, 4246cabdff1aSopenharmony_ci int32_t src2_stride, 4247cabdff1aSopenharmony_ci uint8_t *dst, 4248cabdff1aSopenharmony_ci int32_t dst_stride, 4249cabdff1aSopenharmony_ci const int8_t *filter, 4250cabdff1aSopenharmony_ci int32_t height, 4251cabdff1aSopenharmony_ci int32_t weight0, 4252cabdff1aSopenharmony_ci int32_t weight1, 4253cabdff1aSopenharmony_ci int32_t offset0, 4254cabdff1aSopenharmony_ci int32_t offset1, 4255cabdff1aSopenharmony_ci int32_t rnd_val) 4256cabdff1aSopenharmony_ci{ 4257cabdff1aSopenharmony_ci uint32_t loop_cnt; 4258cabdff1aSopenharmony_ci int32_t offset, weight, constant; 4259cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 4260cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10, src11; 
4261cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 4262cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r; 4263cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src21_l, src43_l; 4264cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src87_r, src109_r; 4265cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4266cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4267cabdff1aSopenharmony_ci v8i16 filter_vec; 4268cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 4269cabdff1aSopenharmony_ci 4270cabdff1aSopenharmony_ci src0_ptr -= src_stride; 4271cabdff1aSopenharmony_ci 4272cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 4273cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 4274cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 4275cabdff1aSopenharmony_ci constant = 128 * weight1; 4276cabdff1aSopenharmony_ci constant <<= 6; 4277cabdff1aSopenharmony_ci offset += constant; 4278cabdff1aSopenharmony_ci 4279cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 4280cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4281cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 4282cabdff1aSopenharmony_ci 4283cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 4284cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4285cabdff1aSopenharmony_ci 4286cabdff1aSopenharmony_ci /* 16width */ 4287cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4288cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4289cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4290cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4291cabdff1aSopenharmony_ci /* 8width */ 4292cabdff1aSopenharmony_ci LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); 4293cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 4294cabdff1aSopenharmony_ci XORI_B3_128_SB(src6, src7, src8); 4295cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, 
src8, src7, src76_r, src87_r); 4296cabdff1aSopenharmony_ci 4297cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 4298cabdff1aSopenharmony_ci /* 16width */ 4299cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 4300cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 4301cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4302cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 4303cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4304cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4305cabdff1aSopenharmony_ci 4306cabdff1aSopenharmony_ci /* 8width */ 4307cabdff1aSopenharmony_ci LD_SB2(src0_ptr + 16, src_stride, src9, src10); 4308cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4309cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 4310cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 4311cabdff1aSopenharmony_ci XORI_B2_128_SB(src9, src10); 4312cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 4313cabdff1aSopenharmony_ci /* 16width */ 4314cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4315cabdff1aSopenharmony_ci tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4316cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4317cabdff1aSopenharmony_ci tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4318cabdff1aSopenharmony_ci /* 8width */ 4319cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 4320cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 4321cabdff1aSopenharmony_ci /* 16width */ 4322cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, 4323cabdff1aSopenharmony_ci in0, in1, in2, in3, 4324cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4325cabdff1aSopenharmony_ci tmp0, tmp1, tmp4, tmp5); 4326cabdff1aSopenharmony_ci /* 8width */ 
4327cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5, 4328cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4329cabdff1aSopenharmony_ci tmp2, tmp3); 4330cabdff1aSopenharmony_ci /* 16width */ 4331cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1); 4332cabdff1aSopenharmony_ci /* 8width */ 4333cabdff1aSopenharmony_ci tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2); 4334cabdff1aSopenharmony_ci ST_SH2(tmp0, tmp1, dst, dst_stride); 4335cabdff1aSopenharmony_ci ST_D2(tmp2, 0, 1, dst + 16, dst_stride); 4336cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4337cabdff1aSopenharmony_ci 4338cabdff1aSopenharmony_ci /* 16width */ 4339cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src5, src2); 4340cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 4341cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4342cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 4343cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 4344cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 4345cabdff1aSopenharmony_ci /* 8width */ 4346cabdff1aSopenharmony_ci LD_SB2(src0_ptr + 16, src_stride, src11, src8); 4347cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4348cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 4349cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 4350cabdff1aSopenharmony_ci XORI_B2_128_SB(src11, src8); 4351cabdff1aSopenharmony_ci ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 4352cabdff1aSopenharmony_ci /* 16width */ 4353cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1); 4354cabdff1aSopenharmony_ci tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1); 4355cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1); 4356cabdff1aSopenharmony_ci tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1); 4357cabdff1aSopenharmony_ci /* 8width */ 
4358cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1); 4359cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1); 4360cabdff1aSopenharmony_ci /* 16width */ 4361cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, 4362cabdff1aSopenharmony_ci in0, in1, in2, in3, 4363cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4364cabdff1aSopenharmony_ci tmp0, tmp1, tmp4, tmp5); 4365cabdff1aSopenharmony_ci /* 8width */ 4366cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5, 4367cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4368cabdff1aSopenharmony_ci tmp2, tmp3); 4369cabdff1aSopenharmony_ci /* 16width */ 4370cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1); 4371cabdff1aSopenharmony_ci 4372cabdff1aSopenharmony_ci /* 8width */ 4373cabdff1aSopenharmony_ci tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2); 4374cabdff1aSopenharmony_ci ST_SH2(tmp0, tmp1, dst, dst_stride); 4375cabdff1aSopenharmony_ci ST_D2(tmp2, 0, 1, dst + 16, dst_stride); 4376cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4377cabdff1aSopenharmony_ci } 4378cabdff1aSopenharmony_ci} 4379cabdff1aSopenharmony_ci 4380cabdff1aSopenharmony_cistatic void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr, 4381cabdff1aSopenharmony_ci int32_t src_stride, 4382cabdff1aSopenharmony_ci int16_t *src1_ptr, 4383cabdff1aSopenharmony_ci int32_t src2_stride, 4384cabdff1aSopenharmony_ci uint8_t *dst, 4385cabdff1aSopenharmony_ci int32_t dst_stride, 4386cabdff1aSopenharmony_ci const int8_t *filter, 4387cabdff1aSopenharmony_ci int32_t height, 4388cabdff1aSopenharmony_ci int32_t weight0, 4389cabdff1aSopenharmony_ci int32_t weight1, 4390cabdff1aSopenharmony_ci int32_t offset0, 4391cabdff1aSopenharmony_ci int32_t offset1, 4392cabdff1aSopenharmony_ci int32_t rnd_val) 4393cabdff1aSopenharmony_ci{ 4394cabdff1aSopenharmony_ci uint32_t loop_cnt; 4395cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst + 16; 
4396cabdff1aSopenharmony_ci int32_t offset, weight, constant; 4397cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10; 4398cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 4399cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r; 4400cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src87_r, src109_r; 4401cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4402cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src76_l, src98_l; 4403cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src87_l, src109_l; 4404cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4405cabdff1aSopenharmony_ci v8i16 filter_vec; 4406cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec; 4407cabdff1aSopenharmony_ci 4408cabdff1aSopenharmony_ci src0_ptr -= src_stride; 4409cabdff1aSopenharmony_ci 4410cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 4411cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 4412cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 4413cabdff1aSopenharmony_ci constant = 128 * weight1; 4414cabdff1aSopenharmony_ci constant <<= 6; 4415cabdff1aSopenharmony_ci offset += constant; 4416cabdff1aSopenharmony_ci 4417cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 4418cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4419cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 4420cabdff1aSopenharmony_ci 4421cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 4422cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4423cabdff1aSopenharmony_ci 4424cabdff1aSopenharmony_ci /* 16width */ 4425cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4426cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4427cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4428cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4429cabdff1aSopenharmony_ci /* next 16width */ 
4430cabdff1aSopenharmony_ci LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8); 4431cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 4432cabdff1aSopenharmony_ci XORI_B3_128_SB(src6, src7, src8); 4433cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 4434cabdff1aSopenharmony_ci ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 4435cabdff1aSopenharmony_ci 4436cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 4437cabdff1aSopenharmony_ci /* 16width */ 4438cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 4439cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 4440cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 8, src2_stride, in2, in3); 4441cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 4442cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4443cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4444cabdff1aSopenharmony_ci 4445cabdff1aSopenharmony_ci /* 16width */ 4446cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4447cabdff1aSopenharmony_ci tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4448cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4449cabdff1aSopenharmony_ci tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4450cabdff1aSopenharmony_ci /* 16width */ 4451cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5, 4452cabdff1aSopenharmony_ci in0, in1, in2, in3, 4453cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4454cabdff1aSopenharmony_ci tmp0, tmp1, tmp4, tmp5); 4455cabdff1aSopenharmony_ci /* 16width */ 4456cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1); 4457cabdff1aSopenharmony_ci ST_SH2(tmp0, tmp1, dst, dst_stride); 4458cabdff1aSopenharmony_ci dst += (2 * dst_stride); 4459cabdff1aSopenharmony_ci 4460cabdff1aSopenharmony_ci src10_r = src32_r; 4461cabdff1aSopenharmony_ci src21_r = src43_r; 
4462cabdff1aSopenharmony_ci src10_l = src32_l; 4463cabdff1aSopenharmony_ci src21_l = src43_l; 4464cabdff1aSopenharmony_ci src2 = src4; 4465cabdff1aSopenharmony_ci 4466cabdff1aSopenharmony_ci /* next 16width */ 4467cabdff1aSopenharmony_ci LD_SB2(src0_ptr + 16, src_stride, src9, src10); 4468cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 4469cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 16, src2_stride, in4, in5); 4470cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 24, src2_stride, in6, in7); 4471cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 4472cabdff1aSopenharmony_ci XORI_B2_128_SB(src9, src10); 4473cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 4474cabdff1aSopenharmony_ci ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); 4475cabdff1aSopenharmony_ci /* next 16width */ 4476cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 4477cabdff1aSopenharmony_ci tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1); 4478cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 4479cabdff1aSopenharmony_ci tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1); 4480cabdff1aSopenharmony_ci /* next 16width */ 4481cabdff1aSopenharmony_ci HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7, 4482cabdff1aSopenharmony_ci in4, in5, in6, in7, 4483cabdff1aSopenharmony_ci weight_vec, rnd_vec, offset_vec, 4484cabdff1aSopenharmony_ci tmp2, tmp3, tmp6, tmp7); 4485cabdff1aSopenharmony_ci 4486cabdff1aSopenharmony_ci /* next 16width */ 4487cabdff1aSopenharmony_ci PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3); 4488cabdff1aSopenharmony_ci ST_SH2(tmp2, tmp3, dst_tmp, dst_stride); 4489cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 4490cabdff1aSopenharmony_ci 4491cabdff1aSopenharmony_ci src76_r = src98_r; 4492cabdff1aSopenharmony_ci src87_r = src109_r; 4493cabdff1aSopenharmony_ci src76_l = src98_l; 4494cabdff1aSopenharmony_ci src87_l = src109_l; 4495cabdff1aSopenharmony_ci src8 = src10; 
4496cabdff1aSopenharmony_ci } 4497cabdff1aSopenharmony_ci} 4498cabdff1aSopenharmony_ci 4499cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, 4500cabdff1aSopenharmony_ci int32_t src_stride, 4501cabdff1aSopenharmony_ci int16_t *src1_ptr, 4502cabdff1aSopenharmony_ci int32_t src2_stride, 4503cabdff1aSopenharmony_ci uint8_t *dst, 4504cabdff1aSopenharmony_ci int32_t dst_stride, 4505cabdff1aSopenharmony_ci const int8_t *filter_x, 4506cabdff1aSopenharmony_ci const int8_t *filter_y, 4507cabdff1aSopenharmony_ci int32_t weight0, 4508cabdff1aSopenharmony_ci int32_t weight1, 4509cabdff1aSopenharmony_ci int32_t offset0, 4510cabdff1aSopenharmony_ci int32_t offset1, 4511cabdff1aSopenharmony_ci int32_t rnd_val) 4512cabdff1aSopenharmony_ci{ 4513cabdff1aSopenharmony_ci uint64_t tp0, tp1; 4514cabdff1aSopenharmony_ci int32_t offset, weight; 4515cabdff1aSopenharmony_ci v8i16 in0 = { 0 }; 4516cabdff1aSopenharmony_ci v16u8 out; 4517cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 4518cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4519cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 4520cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4521cabdff1aSopenharmony_ci v16i8 mask1; 4522cabdff1aSopenharmony_ci v8i16 filter_vec, tmp, weight_vec; 4523cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 4524cabdff1aSopenharmony_ci v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1; 4525cabdff1aSopenharmony_ci v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec; 4526cabdff1aSopenharmony_ci 4527cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4528cabdff1aSopenharmony_ci 4529cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4530cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4531cabdff1aSopenharmony_ci 4532cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4533cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4534cabdff1aSopenharmony_ci 4535cabdff1aSopenharmony_ci 
SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4536cabdff1aSopenharmony_ci 4537cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4538cabdff1aSopenharmony_ci 4539cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 4540cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 4541cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 4542cabdff1aSopenharmony_ci 4543cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 4544cabdff1aSopenharmony_ci const_vec <<= 6; 4545cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 4546cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 4547cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 4548cabdff1aSopenharmony_ci offset_vec += const_vec; 4549cabdff1aSopenharmony_ci 4550cabdff1aSopenharmony_ci LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4); 4551cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4552cabdff1aSopenharmony_ci 4553cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1); 4554cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3); 4555cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5); 4556cabdff1aSopenharmony_ci 4557cabdff1aSopenharmony_ci dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4558cabdff1aSopenharmony_ci dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4559cabdff1aSopenharmony_ci dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4560cabdff1aSopenharmony_ci 4561cabdff1aSopenharmony_ci ILVRL_H2_SH(dst31, dst20, dst10, dst32); 4562cabdff1aSopenharmony_ci ILVRL_H2_SH(dst42, dst31, dst21, dst43); 4563cabdff1aSopenharmony_ci 4564cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 4565cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 4566cabdff1aSopenharmony_ci dst0 >>= 6; 4567cabdff1aSopenharmony_ci dst1 >>= 6; 4568cabdff1aSopenharmony_ci dst0 = (v4i32) 
__msa_pckev_h((v8i16) dst1, (v8i16) dst0); 4569cabdff1aSopenharmony_ci 4570cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4571cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 4572cabdff1aSopenharmony_ci 4573cabdff1aSopenharmony_ci ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 4574cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 4575cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 4576cabdff1aSopenharmony_ci SRAR_W2_SW(dst0, dst1, rnd_vec); 4577cabdff1aSopenharmony_ci tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 4578cabdff1aSopenharmony_ci CLIP_SH_0_255(tmp); 4579cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); 4580cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 4581cabdff1aSopenharmony_ci} 4582cabdff1aSopenharmony_ci 4583cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, 4584cabdff1aSopenharmony_ci int32_t src_stride, 4585cabdff1aSopenharmony_ci int16_t *src1_ptr, 4586cabdff1aSopenharmony_ci int32_t src2_stride, 4587cabdff1aSopenharmony_ci uint8_t *dst, 4588cabdff1aSopenharmony_ci int32_t dst_stride, 4589cabdff1aSopenharmony_ci const int8_t *filter_x, 4590cabdff1aSopenharmony_ci const int8_t *filter_y, 4591cabdff1aSopenharmony_ci int32_t weight0, 4592cabdff1aSopenharmony_ci int32_t weight1, 4593cabdff1aSopenharmony_ci int32_t offset0, 4594cabdff1aSopenharmony_ci int32_t offset1, 4595cabdff1aSopenharmony_ci int32_t rnd_val) 4596cabdff1aSopenharmony_ci{ 4597cabdff1aSopenharmony_ci uint64_t tp0, tp1; 4598cabdff1aSopenharmony_ci int32_t offset, weight; 4599cabdff1aSopenharmony_ci v16u8 out; 4600cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }; 4601cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 4602cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4603cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 4604cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4605cabdff1aSopenharmony_ci v16i8 
mask1; 4606cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec; 4607cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4608cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 4609cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63; 4610cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst21, dst43, dst65; 4611cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec; 4612cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3; 4613cabdff1aSopenharmony_ci 4614cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4615cabdff1aSopenharmony_ci 4616cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4617cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4618cabdff1aSopenharmony_ci 4619cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4620cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4621cabdff1aSopenharmony_ci 4622cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4623cabdff1aSopenharmony_ci 4624cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4625cabdff1aSopenharmony_ci 4626cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 4627cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 4628cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 4629cabdff1aSopenharmony_ci 4630cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 4631cabdff1aSopenharmony_ci const_vec <<= 6; 4632cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 4633cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 4634cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 4635cabdff1aSopenharmony_ci offset_vec += const_vec; 4636cabdff1aSopenharmony_ci 4637cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 4638cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 4639cabdff1aSopenharmony_ci 4640cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1); 
4641cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3); 4642cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5); 4643cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7); 4644cabdff1aSopenharmony_ci 4645cabdff1aSopenharmony_ci dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4646cabdff1aSopenharmony_ci dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4647cabdff1aSopenharmony_ci dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4648cabdff1aSopenharmony_ci dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4649cabdff1aSopenharmony_ci 4650cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 4651cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 4652cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 4653cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 4654cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 4655cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1); 4656cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1); 4657cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 4658cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3); 4659cabdff1aSopenharmony_ci 4660cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4661cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 4662cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 4663cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4664cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 4665cabdff1aSopenharmony_ci 4666cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp1, in0, tmp0, tmp1); 4667cabdff1aSopenharmony_ci ILVRL_H2_SH(tmp3, in1, tmp2, tmp3); 4668cabdff1aSopenharmony_ci 4669cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 4670cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, 
weight_vec); 4671cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 4672cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 4673cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 4674cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 4675cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp0, tmp1); 4676cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 4677cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 4678cabdff1aSopenharmony_ci} 4679cabdff1aSopenharmony_ci 4680cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, 4681cabdff1aSopenharmony_ci int32_t src_stride, 4682cabdff1aSopenharmony_ci int16_t *src1_ptr, 4683cabdff1aSopenharmony_ci int32_t src2_stride, 4684cabdff1aSopenharmony_ci uint8_t *dst, 4685cabdff1aSopenharmony_ci int32_t dst_stride, 4686cabdff1aSopenharmony_ci const int8_t *filter_x, 4687cabdff1aSopenharmony_ci const int8_t *filter_y, 4688cabdff1aSopenharmony_ci int32_t height, 4689cabdff1aSopenharmony_ci int32_t weight0, 4690cabdff1aSopenharmony_ci int32_t weight1, 4691cabdff1aSopenharmony_ci int32_t offset0, 4692cabdff1aSopenharmony_ci int32_t offset1, 4693cabdff1aSopenharmony_ci int32_t rnd_val) 4694cabdff1aSopenharmony_ci{ 4695cabdff1aSopenharmony_ci uint32_t loop_cnt; 4696cabdff1aSopenharmony_ci uint64_t tp0, tp1; 4697cabdff1aSopenharmony_ci int32_t offset, weight; 4698cabdff1aSopenharmony_ci v16u8 out0, out1; 4699cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 4700cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4701cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4702cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 4703cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4704cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4705cabdff1aSopenharmony_ci v16i8 mask1; 
4706cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec; 4707cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4708cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 4709cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 4710cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 4711cabdff1aSopenharmony_ci v8i16 dst98_r, dst109_r; 4712cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4713cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec; 4714cabdff1aSopenharmony_ci 4715cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4716cabdff1aSopenharmony_ci 4717cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4718cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4719cabdff1aSopenharmony_ci 4720cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4721cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4722cabdff1aSopenharmony_ci 4723cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4724cabdff1aSopenharmony_ci 4725cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4726cabdff1aSopenharmony_ci 4727cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 4728cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 4729cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 4730cabdff1aSopenharmony_ci 4731cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 4732cabdff1aSopenharmony_ci const_vec <<= 6; 4733cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 4734cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 4735cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 4736cabdff1aSopenharmony_ci offset_vec += const_vec; 4737cabdff1aSopenharmony_ci 4738cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4739cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 4740cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 
4741cabdff1aSopenharmony_ci 4742cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 4743cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3); 4744cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4745cabdff1aSopenharmony_ci dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4746cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 4747cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 4748cabdff1aSopenharmony_ci 4749cabdff1aSopenharmony_ci for (loop_cnt = height >> 3; loop_cnt--;) { 4750cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, 4751cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 4752cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 4753cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4754cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1); 4755cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3); 4756cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5); 4757cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7); 4758cabdff1aSopenharmony_ci 4759cabdff1aSopenharmony_ci dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4760cabdff1aSopenharmony_ci dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4761cabdff1aSopenharmony_ci dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4762cabdff1aSopenharmony_ci dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4763cabdff1aSopenharmony_ci 4764cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 4765cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 4766cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4767cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4768cabdff1aSopenharmony_ci dst22 = (v8i16) 
__msa_splati_d((v2i64) dst73, 1); 4769cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 4770cabdff1aSopenharmony_ci 4771cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4772cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4773cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 4774cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4775cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4776cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 4777cabdff1aSopenharmony_ci 4778cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4779cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4780cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in2); 4781cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4782cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4783cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in3); 4784cabdff1aSopenharmony_ci 4785cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4786cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4787cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4788cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4789cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4790cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4791cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4792cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4793cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 4794cabdff1aSopenharmony_ci SRA_4V(dst4, dst5, dst6, dst7, 6); 4795cabdff1aSopenharmony_ci PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, 4796cabdff1aSopenharmony_ci dst2, dst3); 4797cabdff1aSopenharmony_ci ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 4798cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, in1, tmp2, tmp3); 
4799cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, in2, tmp4, tmp5); 4800cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, in3, tmp6, tmp7); 4801cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 4802cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 4803cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 4804cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 4805cabdff1aSopenharmony_ci dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec); 4806cabdff1aSopenharmony_ci dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec); 4807cabdff1aSopenharmony_ci dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec); 4808cabdff1aSopenharmony_ci dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec); 4809cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 4810cabdff1aSopenharmony_ci SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 4811cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, 4812cabdff1aSopenharmony_ci tmp2, tmp3); 4813cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4814cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4815cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4816cabdff1aSopenharmony_ci dst += (8 * dst_stride); 4817cabdff1aSopenharmony_ci 4818cabdff1aSopenharmony_ci dst10_r = dst98_r; 4819cabdff1aSopenharmony_ci dst21_r = dst109_r; 4820cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4821cabdff1aSopenharmony_ci } 4822cabdff1aSopenharmony_ci} 4823cabdff1aSopenharmony_ci

/*
 * 4-pixel-wide bi-weighted 4-tap horizontal+vertical prediction: height
 * dispatcher.
 *
 * Forwards all arguments to the 4x2 helper when height == 2, to the 4x4
 * helper when height == 4, and to the 4 x (multiple of 8) helper when height
 * is a multiple of 8 (only that helper receives height).  Any other height
 * falls through all branches and nothing is written to dst.
 */
static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter_x, filter_y,
                                         height, weight0, weight1,
                                         offset0, offset1, rnd_val);
    }
}

/*
 * 6-pixel-wide bi-weighted 4-tap horizontal+vertical prediction.
 *
 * src0_ptr/src_stride : 8-bit source; the pointer is moved back by one row
 *                       and one column before loading, then 3 + 8 rows are
 *                       loaded.
 * src1_ptr/src2_stride: 16-bit second prediction; stride is in int16_t
 *                       elements (it is applied to an int16_t pointer).
 * dst/dst_stride      : 8-bit destination.
 * filter_x, filter_y  : 4-tap filter coefficients.
 * height              : not read by this function; it always processes the
 *                       fixed set of rows loaded above (the ST_W8 / ST_H8
 *                       stores presumably cover 8 output rows - confirm
 *                       against the store macros).
 * weight0/1, offset0/1, rnd_val: weighting parameters.  The two offsets are
 *                       summed and scaled by 2^rnd_val, the two weights are
 *                       packed into one 32-bit lane (weight0 in the low 16
 *                       bits, weight1 in the high 16 bits) and the final
 *                       rounding shift is rnd_val + 1.
 *
 * The left 4 columns are stored with ST_W8 at dst and the remaining 2
 * columns with ST_H8 at dst + 4.
 */
static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t tpw0, tpw1, tpw2, tpw3;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 in4 = { 0 }, in5 = { 0 };
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* Scale by multiplication rather than '<<': the operands are signed and
     * may be negative, and left-shifting a negative value is undefined
     * behaviour in C.  The result is identical for all in-range inputs. */
    offset = (offset0 + offset1) * (1 << rnd_val);
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 * (1 << 16));

    /* NOTE(review): const_vec (128 * weight1, shifted left by 6) presumably
     * compensates for the 128 bias that XORI_B*_128_SB applies to the source
     * bytes - confirm against the macro definitions. */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    /* Horizontal pass on the first three rows. */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
           src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    /* Horizontal pass on the remaining eight rows. */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
    /* Pair up the left halves of consecutive rows so two rows are filtered
     * per vector in the vertical pass below. */
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    /* Vertical pass. */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);

    /* Second prediction for the left 4 columns, two rows per vector. */
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);

    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in2);
    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in3);

    /* Weighted sum of both predictions, round, clip to 0..255 and store. */
    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                tmp2, tmp3);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);

    /* Remaining 2 columns (elements 4 and 5 of each row). */
    PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);

    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    src1_ptr += (4 * src2_stride);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);

    ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
    ILVRL_H2_SH(dst5, in5, tmp2, tmp3);

    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);

    CLIP_SH2_0_255(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}

/*
 * 8x2 bi-weighted 4-tap horizontal+vertical prediction.
 *
 * Loads five source rows starting one row above and one column left of
 * src0_ptr and two rows of the 16-bit second prediction from src1_ptr, runs
 * the horizontal then the vertical 4-tap filter, combines the result with
 * the second prediction using the packed weights and the scaled offset,
 * rounds by rnd_val + 1, clips to 0..255 and stores two rows with ST_D2.
 */
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t weight, offset;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v8i16 in0, in1;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* Scale by multiplication rather than '<<': the operands are signed and
     * may be negative, and left-shifting a negative value is undefined
     * behaviour in C.  The result is identical for all in-range inputs. */
    offset = (offset0 + offset1) * (1 << rnd_val);
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 * (1 << 16));

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    LD_SH2(src1_ptr, src2_stride, in0, in1);

    /* Horizontal pass. */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);

    /* Vertical pass. */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);

    /* Weighted sum with the second prediction, round, clip and store. */
    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);

    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}

/*
 * (8 * width8mult) x 4 bi-weighted 4-tap horizontal+vertical prediction.
 *
 * Processes the block in 8-column strips: each of the width8mult iterations
 * loads seven source rows and four rows of the 16-bit second prediction,
 * filters horizontally then vertically, applies the weights/offset, rounds
 * by rnd_val + 1, clips to 0..255, stores four rows with ST_D4 and then
 * advances src0_ptr, src1_ptr and dst by 8 elements.
 */
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
                                         int32_t src_stride,
                                         int16_t *src1_ptr,
                                         int32_t src2_stride,
                                         uint8_t *dst,
                                         int32_t dst_stride,
                                         const int8_t *filter_x,
                                         const int8_t *filter_y,
                                         int32_t weight0,
                                         int32_t weight1,
                                         int32_t offset0,
                                         int32_t offset1,
                                         int32_t rnd_val,
                                         int32_t width8mult)
{
    int32_t weight, offset;
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    /* Scale by multiplication rather than '<<': the operands are signed and
     * may be negative, and left-shifting a negative value is undefined
     * behaviour in C.  The result is identical for all in-range inputs. */
    offset = (offset0 + offset1) * (1 << rnd_val);
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 * (1 << 16));

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src0_ptr += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += 8;

        /* Horizontal pass on the first three rows. */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        /* Horizontal pass on the remaining four rows. */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        /* Vertical pass. */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);

        /* Weighted sum with the second prediction, round, clip and store. */
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
5257cabdff1aSopenharmony_ci 5258cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, 5259cabdff1aSopenharmony_ci int32_t src_stride, 5260cabdff1aSopenharmony_ci int16_t *src1_ptr, 5261cabdff1aSopenharmony_ci int32_t src2_stride, 5262cabdff1aSopenharmony_ci uint8_t *dst, 5263cabdff1aSopenharmony_ci int32_t dst_stride, 5264cabdff1aSopenharmony_ci const int8_t *filter_x, 5265cabdff1aSopenharmony_ci const int8_t *filter_y, 5266cabdff1aSopenharmony_ci int32_t weight0, 5267cabdff1aSopenharmony_ci int32_t weight1, 5268cabdff1aSopenharmony_ci int32_t offset0,
5269cabdff1aSopenharmony_ci int32_t offset1, 5270cabdff1aSopenharmony_ci int32_t rnd_val) 5271cabdff1aSopenharmony_ci{ 5272cabdff1aSopenharmony_ci uint32_t offset, weight; 5273cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 5274cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 5275cabdff1aSopenharmony_ci v8i16 filt0, filt1; 5276cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 5277cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 5278cabdff1aSopenharmony_ci v16i8 mask1; 5279cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec; 5280cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 5281cabdff1aSopenharmony_ci v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 5282cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8; 5283cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 5284cabdff1aSopenharmony_ci v4i32 dst4_r, dst4_l, dst5_r, dst5_l; 5285cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst10_l, dst32_l; 5286cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst21_l, dst43_l; 5287cabdff1aSopenharmony_ci v8i16 dst54_r, dst54_l, dst65_r, dst65_l; 5288cabdff1aSopenharmony_ci v8i16 dst76_r, dst76_l, dst87_r, dst87_l; 5289cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 5290cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 5291cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 5292cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec; 5293cabdff1aSopenharmony_ci 5294cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 5295cabdff1aSopenharmony_ci 5296cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 5297cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 5298cabdff1aSopenharmony_ci 5299cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 5300cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 
5301cabdff1aSopenharmony_ci 5302cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 5303cabdff1aSopenharmony_ci 5304cabdff1aSopenharmony_ci mask1 = mask0 + 2; 5305cabdff1aSopenharmony_ci 5306cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 5307cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 5308cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 5309cabdff1aSopenharmony_ci 5310cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 5311cabdff1aSopenharmony_ci const_vec <<= 6; 5312cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 5313cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 5314cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 5315cabdff1aSopenharmony_ci offset_vec += const_vec; 5316cabdff1aSopenharmony_ci 5317cabdff1aSopenharmony_ci LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4); 5318cabdff1aSopenharmony_ci src0_ptr += (5 * src_stride); 5319cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8); 5320cabdff1aSopenharmony_ci 5321cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5322cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 5323cabdff1aSopenharmony_ci 5324cabdff1aSopenharmony_ci LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 5325cabdff1aSopenharmony_ci 5326cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 5327cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 5328cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 5329cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 5330cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 5331cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11); 5332cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13); 
5333cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15); 5334cabdff1aSopenharmony_ci VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17); 5335cabdff1aSopenharmony_ci 5336cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5337cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5338cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5339cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 5340cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 5341cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 5342cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1); 5343cabdff1aSopenharmony_ci dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1); 5344cabdff1aSopenharmony_ci dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1); 5345cabdff1aSopenharmony_ci 5346cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 5347cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 5348cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 5349cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 5350cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 5351cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 5352cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 5353cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 5354cabdff1aSopenharmony_ci 5355cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 5356cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 5357cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 5358cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 5359cabdff1aSopenharmony_ci dst2_r = 
HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 5360cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 5361cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 5362cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 5363cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 5364cabdff1aSopenharmony_ci dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); 5365cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 5366cabdff1aSopenharmony_ci dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); 5367cabdff1aSopenharmony_ci 5368cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 5369cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 5370cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6); 5371cabdff1aSopenharmony_ci PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r, 5372cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 5373cabdff1aSopenharmony_ci 5374cabdff1aSopenharmony_ci ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 5375cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, in1, tmp2, tmp3); 5376cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, in2, tmp4, tmp5); 5377cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, in3, tmp6, tmp7); 5378cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 5379cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 5380cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 5381cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 5382cabdff1aSopenharmony_ci dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec); 5383cabdff1aSopenharmony_ci dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec); 5384cabdff1aSopenharmony_ci dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec); 5385cabdff1aSopenharmony_ci dst7 = __msa_dpadd_s_w(offset_vec, tmp7, 
weight_vec); 5386cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 5387cabdff1aSopenharmony_ci SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 5388cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 5389cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 5390cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 5391cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 5392cabdff1aSopenharmony_ci 5393cabdff1aSopenharmony_ci PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1); 5394cabdff1aSopenharmony_ci ILVRL_H2_SH(dst0, in4, tmp0, tmp1); 5395cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, in5, tmp2, tmp3); 5396cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 5397cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 5398cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 5399cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 5400cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 5401cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5); 5402cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp4, tmp5); 5403cabdff1aSopenharmony_ci out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 5404cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 5405cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 5406cabdff1aSopenharmony_ci} 5407cabdff1aSopenharmony_ci 5408cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, 5409cabdff1aSopenharmony_ci int32_t src_stride, 5410cabdff1aSopenharmony_ci int16_t *src1_ptr, 5411cabdff1aSopenharmony_ci int32_t src2_stride, 5412cabdff1aSopenharmony_ci uint8_t *dst, 5413cabdff1aSopenharmony_ci int32_t dst_stride, 5414cabdff1aSopenharmony_ci const int8_t *filter_x, 5415cabdff1aSopenharmony_ci const int8_t *filter_y, 5416cabdff1aSopenharmony_ci int32_t height, 
5417cabdff1aSopenharmony_ci int32_t weight0, 5418cabdff1aSopenharmony_ci int32_t weight1, 5419cabdff1aSopenharmony_ci int32_t offset0, 5420cabdff1aSopenharmony_ci int32_t offset1, 5421cabdff1aSopenharmony_ci int32_t rnd_val, 5422cabdff1aSopenharmony_ci int32_t width) 5423cabdff1aSopenharmony_ci{ 5424cabdff1aSopenharmony_ci uint32_t loop_cnt; 5425cabdff1aSopenharmony_ci uint32_t cnt; 5426cabdff1aSopenharmony_ci int32_t offset, weight; 5427cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 5428cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 5429cabdff1aSopenharmony_ci uint8_t *dst_tmp; 5430cabdff1aSopenharmony_ci v16u8 out0, out1; 5431cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 5432cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 5433cabdff1aSopenharmony_ci v8i16 filt0, filt1; 5434cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 5435cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 5436cabdff1aSopenharmony_ci v16i8 mask1; 5437cabdff1aSopenharmony_ci v8i16 filter_vec; 5438cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 5439cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6; 5440cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 5441cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 5442cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 5443cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 5444cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec; 5445cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec; 5446cabdff1aSopenharmony_ci 5447cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 5448cabdff1aSopenharmony_ci 5449cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 5450cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 5451cabdff1aSopenharmony_ci 5452cabdff1aSopenharmony_ci 
filter_vec = LD_SH(filter_y); 5453cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 5454cabdff1aSopenharmony_ci 5455cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 5456cabdff1aSopenharmony_ci 5457cabdff1aSopenharmony_ci mask1 = mask0 + 2; 5458cabdff1aSopenharmony_ci 5459cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 5460cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 5461cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 5462cabdff1aSopenharmony_ci 5463cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 5464cabdff1aSopenharmony_ci const_vec <<= 6; 5465cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 5466cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 5467cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 5468cabdff1aSopenharmony_ci offset_vec += const_vec; 5469cabdff1aSopenharmony_ci 5470cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 5471cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 5472cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 5473cabdff1aSopenharmony_ci dst_tmp = dst; 5474cabdff1aSopenharmony_ci 5475cabdff1aSopenharmony_ci LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); 5476cabdff1aSopenharmony_ci src0_ptr_tmp += (3 * src_stride); 5477cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 5478cabdff1aSopenharmony_ci 5479cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 5480cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 5481cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 5482cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5483cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5484cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5485cabdff1aSopenharmony_ci 5486cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, 
dst10_l); 5487cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 5488cabdff1aSopenharmony_ci 5489cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 5490cabdff1aSopenharmony_ci LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); 5491cabdff1aSopenharmony_ci src0_ptr_tmp += (4 * src_stride); 5492cabdff1aSopenharmony_ci LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); 5493cabdff1aSopenharmony_ci src1_ptr_tmp += (4 * src2_stride); 5494cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 5495cabdff1aSopenharmony_ci 5496cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 5497cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 5498cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 5499cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 5500cabdff1aSopenharmony_ci 5501cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5502cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5503cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5504cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 5505cabdff1aSopenharmony_ci 5506cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 5507cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 5508cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 5509cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 5510cabdff1aSopenharmony_ci 5511cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 5512cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 5513cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 5514cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 
5515cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 5516cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 5517cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 5518cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 5519cabdff1aSopenharmony_ci 5520cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 5521cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 5522cabdff1aSopenharmony_ci PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 5523cabdff1aSopenharmony_ci dst3_r, dst0, dst1, dst2, dst3); 5524cabdff1aSopenharmony_ci ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 5525cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, in1, tmp2, tmp3); 5526cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, in2, tmp4, tmp5); 5527cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, in3, tmp6, tmp7); 5528cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 5529cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 5530cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 5531cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 5532cabdff1aSopenharmony_ci dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec); 5533cabdff1aSopenharmony_ci dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec); 5534cabdff1aSopenharmony_ci dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec); 5535cabdff1aSopenharmony_ci dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec); 5536cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 5537cabdff1aSopenharmony_ci SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 5538cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 5539cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 5540cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 5541cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, 
tmp0, tmp3, tmp2, out0, out1); 5542cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 5543cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 5544cabdff1aSopenharmony_ci 5545cabdff1aSopenharmony_ci dst10_r = dst54_r; 5546cabdff1aSopenharmony_ci dst10_l = dst54_l; 5547cabdff1aSopenharmony_ci dst21_r = dst65_r; 5548cabdff1aSopenharmony_ci dst21_l = dst65_l; 5549cabdff1aSopenharmony_ci dsth2 = dsth6; 5550cabdff1aSopenharmony_ci } 5551cabdff1aSopenharmony_ci 5552cabdff1aSopenharmony_ci src0_ptr += 8; 5553cabdff1aSopenharmony_ci dst += 8; 5554cabdff1aSopenharmony_ci src1_ptr += 8; 5555cabdff1aSopenharmony_ci } 5556cabdff1aSopenharmony_ci} 5557cabdff1aSopenharmony_ci 5558cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr, 5559cabdff1aSopenharmony_ci int32_t src_stride, 5560cabdff1aSopenharmony_ci int16_t *src1_ptr, 5561cabdff1aSopenharmony_ci int32_t src2_stride, 5562cabdff1aSopenharmony_ci uint8_t *dst, 5563cabdff1aSopenharmony_ci int32_t dst_stride, 5564cabdff1aSopenharmony_ci const int8_t *filter_x, 5565cabdff1aSopenharmony_ci const int8_t *filter_y, 5566cabdff1aSopenharmony_ci int32_t height, 5567cabdff1aSopenharmony_ci int32_t weight0, 5568cabdff1aSopenharmony_ci int32_t weight1, 5569cabdff1aSopenharmony_ci int32_t offset0, 5570cabdff1aSopenharmony_ci int32_t offset1, 5571cabdff1aSopenharmony_ci int32_t rnd_val) 5572cabdff1aSopenharmony_ci{ 5573cabdff1aSopenharmony_ci if (2 == height) { 5574cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 5575cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 5576cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 5577cabdff1aSopenharmony_ci } else if (4 == height) { 5578cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, 5579cabdff1aSopenharmony_ci src2_stride, dst, dst_stride, filter_x, 5580cabdff1aSopenharmony_ci filter_y, weight0, weight1, offset0, 
5581cabdff1aSopenharmony_ci offset1, rnd_val, 1); 5582cabdff1aSopenharmony_ci } else if (6 == height) { 5583cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 5584cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 5585cabdff1aSopenharmony_ci weight0, weight1, offset0, offset1, rnd_val); 5586cabdff1aSopenharmony_ci } else if (0 == (height % 4)) { 5587cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, 5588cabdff1aSopenharmony_ci src1_ptr, src2_stride, 5589cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 5590cabdff1aSopenharmony_ci height, weight0, 5591cabdff1aSopenharmony_ci weight1, offset0, offset1, rnd_val, 8); 5592cabdff1aSopenharmony_ci } 5593cabdff1aSopenharmony_ci} 5594cabdff1aSopenharmony_ci 5595cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, 5596cabdff1aSopenharmony_ci int32_t src_stride, 5597cabdff1aSopenharmony_ci int16_t *src1_ptr, 5598cabdff1aSopenharmony_ci int32_t src2_stride, 5599cabdff1aSopenharmony_ci uint8_t *dst, 5600cabdff1aSopenharmony_ci int32_t dst_stride, 5601cabdff1aSopenharmony_ci const int8_t *filter_x, 5602cabdff1aSopenharmony_ci const int8_t *filter_y, 5603cabdff1aSopenharmony_ci int32_t height, 5604cabdff1aSopenharmony_ci int32_t weight0, 5605cabdff1aSopenharmony_ci int32_t weight1, 5606cabdff1aSopenharmony_ci int32_t offset0, 5607cabdff1aSopenharmony_ci int32_t offset1, 5608cabdff1aSopenharmony_ci int32_t rnd_val) 5609cabdff1aSopenharmony_ci{ 5610cabdff1aSopenharmony_ci uint32_t loop_cnt; 5611cabdff1aSopenharmony_ci uint64_t tp0, tp1; 5612cabdff1aSopenharmony_ci int32_t offset, weight; 5613cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp, *dst_tmp; 5614cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 5615cabdff1aSopenharmony_ci v16u8 out0, out1; 5616cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 5617cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, 
vec7; 5618cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 5619cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec; 5620cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 5621cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec; 5622cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 5623cabdff1aSopenharmony_ci v8i16 dst76_r, dst98_r, dst87_r, dst109_r; 5624cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 5625cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 5626cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 5627cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 5628cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 5629cabdff1aSopenharmony_ci v4i32 offset_vec, rnd_vec, const_vec; 5630cabdff1aSopenharmony_ci 5631cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 5632cabdff1aSopenharmony_ci 5633cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 5634cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 5635cabdff1aSopenharmony_ci 5636cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 5637cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 5638cabdff1aSopenharmony_ci 5639cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 5640cabdff1aSopenharmony_ci 5641cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 5642cabdff1aSopenharmony_ci mask1 = mask0 + 2; 5643cabdff1aSopenharmony_ci 5644cabdff1aSopenharmony_ci offset = (offset0 + offset1) << rnd_val; 5645cabdff1aSopenharmony_ci weight0 = weight0 & 0x0000FFFF; 5646cabdff1aSopenharmony_ci weight = weight0 | (weight1 << 16); 5647cabdff1aSopenharmony_ci 5648cabdff1aSopenharmony_ci const_vec = __msa_fill_w((128 * weight1)); 5649cabdff1aSopenharmony_ci const_vec <<= 6; 
5650cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 5651cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val + 1); 5652cabdff1aSopenharmony_ci offset_vec += const_vec; 5653cabdff1aSopenharmony_ci weight_vec = (v8i16) __msa_fill_w(weight); 5654cabdff1aSopenharmony_ci 5655cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 5656cabdff1aSopenharmony_ci dst_tmp = dst; 5657cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 5658cabdff1aSopenharmony_ci 5659cabdff1aSopenharmony_ci LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); 5660cabdff1aSopenharmony_ci src0_ptr_tmp += (3 * src_stride); 5661cabdff1aSopenharmony_ci 5662cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 5663cabdff1aSopenharmony_ci 5664cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 5665cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 5666cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 5667cabdff1aSopenharmony_ci 5668cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5669cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5670cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5671cabdff1aSopenharmony_ci 5672cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 5673cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 5674cabdff1aSopenharmony_ci 5675cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 5676cabdff1aSopenharmony_ci LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); 5677cabdff1aSopenharmony_ci src0_ptr_tmp += (4 * src_stride); 5678cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 5679cabdff1aSopenharmony_ci 5680cabdff1aSopenharmony_ci LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); 5681cabdff1aSopenharmony_ci src1_ptr_tmp += (4 * src2_stride); 5682cabdff1aSopenharmony_ci 5683cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, 
src3, mask0, mask1, vec0, vec1); 5684cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 5685cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 5686cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 5687cabdff1aSopenharmony_ci 5688cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5689cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5690cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5691cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 5692cabdff1aSopenharmony_ci 5693cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 5694cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 5695cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 5696cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 5697cabdff1aSopenharmony_ci 5698cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 5699cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 5700cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 5701cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 5702cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 5703cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 5704cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 5705cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 5706cabdff1aSopenharmony_ci 5707cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 5708cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 5709cabdff1aSopenharmony_ci PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 
5710cabdff1aSopenharmony_ci dst3_r, dst0, dst1, dst2, dst3); 5711cabdff1aSopenharmony_ci ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 5712cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, in1, tmp2, tmp3); 5713cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, in2, tmp4, tmp5); 5714cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, in3, tmp6, tmp7); 5715cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 5716cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 5717cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 5718cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 5719cabdff1aSopenharmony_ci dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec); 5720cabdff1aSopenharmony_ci dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec); 5721cabdff1aSopenharmony_ci dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec); 5722cabdff1aSopenharmony_ci dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec); 5723cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 5724cabdff1aSopenharmony_ci SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 5725cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 5726cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 5727cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 5728cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 5729cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 5730cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 5731cabdff1aSopenharmony_ci 5732cabdff1aSopenharmony_ci dst10_r = dst54_r; 5733cabdff1aSopenharmony_ci dst10_l = dst54_l; 5734cabdff1aSopenharmony_ci dst21_r = dst65_r; 5735cabdff1aSopenharmony_ci dst21_l = dst65_l; 5736cabdff1aSopenharmony_ci dsth2 = dsth6; 5737cabdff1aSopenharmony_ci } 5738cabdff1aSopenharmony_ci 5739cabdff1aSopenharmony_ci src0_ptr += 8; 5740cabdff1aSopenharmony_ci dst += 8; 5741cabdff1aSopenharmony_ci src1_ptr += 8; 
5742cabdff1aSopenharmony_ci 5743cabdff1aSopenharmony_ci mask2 = LD_SB(ff_hevc_mask_arr + 16); 5744cabdff1aSopenharmony_ci mask3 = mask2 + 2; 5745cabdff1aSopenharmony_ci 5746cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 5747cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 5748cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 5749cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 5750cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3); 5751cabdff1aSopenharmony_ci 5752cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5753cabdff1aSopenharmony_ci dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5754cabdff1aSopenharmony_ci 5755cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 5756cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 5757cabdff1aSopenharmony_ci 5758cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 5759cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9, 5760cabdff1aSopenharmony_ci src10); 5761cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 5762cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 5763cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1); 5764cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3); 5765cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5); 5766cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7); 5767cabdff1aSopenharmony_ci 5768cabdff1aSopenharmony_ci dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5769cabdff1aSopenharmony_ci dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5770cabdff1aSopenharmony_ci dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5771cabdff1aSopenharmony_ci dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, 
filt1); 5772cabdff1aSopenharmony_ci 5773cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 5774cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 5775cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 5776cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 5777cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 5778cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 5779cabdff1aSopenharmony_ci 5780cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 5781cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 5782cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 5783cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 5784cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 5785cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 5786cabdff1aSopenharmony_ci 5787cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 5788cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 5789cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in2); 5790cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 5791cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 5792cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in3); 5793cabdff1aSopenharmony_ci 5794cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 5795cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 5796cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 5797cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 5798cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 5799cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 5800cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 5801cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 5802cabdff1aSopenharmony_ci 
5803cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 5804cabdff1aSopenharmony_ci SRA_4V(dst4, dst5, dst6, dst7, 6); 5805cabdff1aSopenharmony_ci PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 5806cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 5807cabdff1aSopenharmony_ci ILVRL_H2_SH(dst0, in0, tmp0, tmp1); 5808cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, in1, tmp2, tmp3); 5809cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, in2, tmp4, tmp5); 5810cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, in3, tmp6, tmp7); 5811cabdff1aSopenharmony_ci dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec); 5812cabdff1aSopenharmony_ci dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); 5813cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); 5814cabdff1aSopenharmony_ci dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); 5815cabdff1aSopenharmony_ci dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec); 5816cabdff1aSopenharmony_ci dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec); 5817cabdff1aSopenharmony_ci dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec); 5818cabdff1aSopenharmony_ci dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec); 5819cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 5820cabdff1aSopenharmony_ci SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 5821cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 5822cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 5823cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 5824cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 5825cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 5826cabdff1aSopenharmony_ci dst += (8 * dst_stride); 5827cabdff1aSopenharmony_ci 5828cabdff1aSopenharmony_ci dst10_r = dst98_r; 5829cabdff1aSopenharmony_ci dst21_r = dst109_r; 5830cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 5831cabdff1aSopenharmony_ci } 
5832cabdff1aSopenharmony_ci} 5833cabdff1aSopenharmony_ci 5834cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr, 5835cabdff1aSopenharmony_ci int32_t src_stride, 5836cabdff1aSopenharmony_ci int16_t *src1_ptr, 5837cabdff1aSopenharmony_ci int32_t src2_stride, 5838cabdff1aSopenharmony_ci uint8_t *dst, 5839cabdff1aSopenharmony_ci int32_t dst_stride, 5840cabdff1aSopenharmony_ci const int8_t *filter_x, 5841cabdff1aSopenharmony_ci const int8_t *filter_y, 5842cabdff1aSopenharmony_ci int32_t height, 5843cabdff1aSopenharmony_ci int32_t weight0, 5844cabdff1aSopenharmony_ci int32_t weight1, 5845cabdff1aSopenharmony_ci int32_t offset0, 5846cabdff1aSopenharmony_ci int32_t offset1, 5847cabdff1aSopenharmony_ci int32_t rnd_val) 5848cabdff1aSopenharmony_ci{ 5849cabdff1aSopenharmony_ci if (4 == height) { 5850cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, 5851cabdff1aSopenharmony_ci src2_stride, dst, dst_stride, filter_x, 5852cabdff1aSopenharmony_ci filter_y, weight0, weight1, offset0, 5853cabdff1aSopenharmony_ci offset1, rnd_val, 2); 5854cabdff1aSopenharmony_ci } else { 5855cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, 5856cabdff1aSopenharmony_ci src2_stride, dst, dst_stride, 5857cabdff1aSopenharmony_ci filter_x, filter_y, height, weight0, 5858cabdff1aSopenharmony_ci weight1, offset0, offset1, rnd_val, 16); 5859cabdff1aSopenharmony_ci } 5860cabdff1aSopenharmony_ci} 5861cabdff1aSopenharmony_ci 5862cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr, 5863cabdff1aSopenharmony_ci int32_t src_stride, 5864cabdff1aSopenharmony_ci int16_t *src1_ptr, 5865cabdff1aSopenharmony_ci int32_t src2_stride, 5866cabdff1aSopenharmony_ci uint8_t *dst, 5867cabdff1aSopenharmony_ci int32_t dst_stride, 5868cabdff1aSopenharmony_ci const int8_t *filter_x, 5869cabdff1aSopenharmony_ci const int8_t *filter_y, 5870cabdff1aSopenharmony_ci int32_t height, 5871cabdff1aSopenharmony_ci 
int32_t weight0, 5872cabdff1aSopenharmony_ci int32_t weight1, 5873cabdff1aSopenharmony_ci int32_t offset0, 5874cabdff1aSopenharmony_ci int32_t offset1, 5875cabdff1aSopenharmony_ci int32_t rnd_val) 5876cabdff1aSopenharmony_ci{ 5877cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, 5878cabdff1aSopenharmony_ci src1_ptr, src2_stride, 5879cabdff1aSopenharmony_ci dst, dst_stride, 5880cabdff1aSopenharmony_ci filter_x, filter_y, height, weight0, 5881cabdff1aSopenharmony_ci weight1, offset0, offset1, rnd_val, 24); 5882cabdff1aSopenharmony_ci} 5883cabdff1aSopenharmony_ci 5884cabdff1aSopenharmony_cistatic void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr, 5885cabdff1aSopenharmony_ci int32_t src_stride, 5886cabdff1aSopenharmony_ci int16_t *src1_ptr, 5887cabdff1aSopenharmony_ci int32_t src2_stride, 5888cabdff1aSopenharmony_ci uint8_t *dst, 5889cabdff1aSopenharmony_ci int32_t dst_stride, 5890cabdff1aSopenharmony_ci const int8_t *filter_x, 5891cabdff1aSopenharmony_ci const int8_t *filter_y, 5892cabdff1aSopenharmony_ci int32_t height, 5893cabdff1aSopenharmony_ci int32_t weight0, 5894cabdff1aSopenharmony_ci int32_t weight1, 5895cabdff1aSopenharmony_ci int32_t offset0, 5896cabdff1aSopenharmony_ci int32_t offset1, 5897cabdff1aSopenharmony_ci int32_t rnd_val) 5898cabdff1aSopenharmony_ci{ 5899cabdff1aSopenharmony_ci hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, 5900cabdff1aSopenharmony_ci src1_ptr, src2_stride, 5901cabdff1aSopenharmony_ci dst, dst_stride, 5902cabdff1aSopenharmony_ci filter_x, filter_y, height, weight0, 5903cabdff1aSopenharmony_ci weight1, offset0, offset1, rnd_val, 32); 5904cabdff1aSopenharmony_ci} 5905cabdff1aSopenharmony_ci 5906cabdff1aSopenharmony_ci#define BI_W_MC_COPY(WIDTH) \ 5907cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ 5908cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 5909cabdff1aSopenharmony_ci uint8_t *src, \ 5910cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 
5911cabdff1aSopenharmony_ci int16_t *src_16bit, \ 5912cabdff1aSopenharmony_ci int height, \ 5913cabdff1aSopenharmony_ci int denom, \ 5914cabdff1aSopenharmony_ci int weight0, \ 5915cabdff1aSopenharmony_ci int weight1, \ 5916cabdff1aSopenharmony_ci int offset0, \ 5917cabdff1aSopenharmony_ci int offset1, \ 5918cabdff1aSopenharmony_ci intptr_t mx, \ 5919cabdff1aSopenharmony_ci intptr_t my, \ 5920cabdff1aSopenharmony_ci int width) \ 5921cabdff1aSopenharmony_ci{ \ 5922cabdff1aSopenharmony_ci int shift = 14 + 1 - 8; \ 5923cabdff1aSopenharmony_ci int log2Wd = denom + shift - 1; \ 5924cabdff1aSopenharmony_ci \ 5925cabdff1aSopenharmony_ci hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \ 5926cabdff1aSopenharmony_ci dst, dst_stride, height, \ 5927cabdff1aSopenharmony_ci weight0, weight1, offset0, \ 5928cabdff1aSopenharmony_ci offset1, log2Wd); \ 5929cabdff1aSopenharmony_ci} 5930cabdff1aSopenharmony_ci 5931cabdff1aSopenharmony_ciBI_W_MC_COPY(4); 5932cabdff1aSopenharmony_ciBI_W_MC_COPY(6); 5933cabdff1aSopenharmony_ciBI_W_MC_COPY(8); 5934cabdff1aSopenharmony_ciBI_W_MC_COPY(12); 5935cabdff1aSopenharmony_ciBI_W_MC_COPY(16); 5936cabdff1aSopenharmony_ciBI_W_MC_COPY(24); 5937cabdff1aSopenharmony_ciBI_W_MC_COPY(32); 5938cabdff1aSopenharmony_ciBI_W_MC_COPY(48); 5939cabdff1aSopenharmony_ciBI_W_MC_COPY(64); 5940cabdff1aSopenharmony_ci 5941cabdff1aSopenharmony_ci#undef BI_W_MC_COPY 5942cabdff1aSopenharmony_ci 5943cabdff1aSopenharmony_ci#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 5944cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ 5945cabdff1aSopenharmony_ci ptrdiff_t \ 5946cabdff1aSopenharmony_ci dst_stride, \ 5947cabdff1aSopenharmony_ci uint8_t *src, \ 5948cabdff1aSopenharmony_ci ptrdiff_t \ 5949cabdff1aSopenharmony_ci src_stride, \ 5950cabdff1aSopenharmony_ci int16_t *src_16bit, \ 5951cabdff1aSopenharmony_ci int height, \ 5952cabdff1aSopenharmony_ci int denom, \ 5953cabdff1aSopenharmony_ci int weight0, \ 
5954cabdff1aSopenharmony_ci int weight1, \ 5955cabdff1aSopenharmony_ci int offset0, \ 5956cabdff1aSopenharmony_ci int offset1, \ 5957cabdff1aSopenharmony_ci intptr_t mx, \ 5958cabdff1aSopenharmony_ci intptr_t my, \ 5959cabdff1aSopenharmony_ci int width) \ 5960cabdff1aSopenharmony_ci{ \ 5961cabdff1aSopenharmony_ci const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 5962cabdff1aSopenharmony_ci int log2Wd = denom + 14 - 8; \ 5963cabdff1aSopenharmony_ci \ 5964cabdff1aSopenharmony_ci hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 5965cabdff1aSopenharmony_ci MAX_PB_SIZE, dst, dst_stride, \ 5966cabdff1aSopenharmony_ci filter, height, weight0, \ 5967cabdff1aSopenharmony_ci weight1, offset0, offset1, \ 5968cabdff1aSopenharmony_ci log2Wd); \ 5969cabdff1aSopenharmony_ci} 5970cabdff1aSopenharmony_ci 5971cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 4, 8, hz, mx); 5972cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 8, 8, hz, mx); 5973cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 12, 8, hz, mx); 5974cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 16, 8, hz, mx); 5975cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 24, 8, hz, mx); 5976cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 32, 8, hz, mx); 5977cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 48, 8, hz, mx); 5978cabdff1aSopenharmony_ciBI_W_MC(qpel, h, 64, 8, hz, mx); 5979cabdff1aSopenharmony_ci 5980cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 4, 8, vt, my); 5981cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 8, 8, vt, my); 5982cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 12, 8, vt, my); 5983cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 16, 8, vt, my); 5984cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 24, 8, vt, my); 5985cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 32, 8, vt, my); 5986cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 48, 8, vt, my); 5987cabdff1aSopenharmony_ciBI_W_MC(qpel, v, 64, 8, vt, my); 5988cabdff1aSopenharmony_ci 5989cabdff1aSopenharmony_ciBI_W_MC(epel, h, 4, 4, hz, mx); 5990cabdff1aSopenharmony_ciBI_W_MC(epel, h, 8, 4, hz, mx); 
5991cabdff1aSopenharmony_ciBI_W_MC(epel, h, 6, 4, hz, mx); 5992cabdff1aSopenharmony_ciBI_W_MC(epel, h, 12, 4, hz, mx); 5993cabdff1aSopenharmony_ciBI_W_MC(epel, h, 16, 4, hz, mx); 5994cabdff1aSopenharmony_ciBI_W_MC(epel, h, 24, 4, hz, mx); 5995cabdff1aSopenharmony_ciBI_W_MC(epel, h, 32, 4, hz, mx); 5996cabdff1aSopenharmony_ci 5997cabdff1aSopenharmony_ciBI_W_MC(epel, v, 4, 4, vt, my); 5998cabdff1aSopenharmony_ciBI_W_MC(epel, v, 8, 4, vt, my); 5999cabdff1aSopenharmony_ciBI_W_MC(epel, v, 6, 4, vt, my); 6000cabdff1aSopenharmony_ciBI_W_MC(epel, v, 12, 4, vt, my); 6001cabdff1aSopenharmony_ciBI_W_MC(epel, v, 16, 4, vt, my); 6002cabdff1aSopenharmony_ciBI_W_MC(epel, v, 24, 4, vt, my); 6003cabdff1aSopenharmony_ciBI_W_MC(epel, v, 32, 4, vt, my); 6004cabdff1aSopenharmony_ci 6005cabdff1aSopenharmony_ci#undef BI_W_MC 6006cabdff1aSopenharmony_ci 6007cabdff1aSopenharmony_ci#define BI_W_MC_HV(PEL, WIDTH, TAP) \ 6008cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \ 6009cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 6010cabdff1aSopenharmony_ci uint8_t *src, \ 6011cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 6012cabdff1aSopenharmony_ci int16_t *src_16bit, \ 6013cabdff1aSopenharmony_ci int height, \ 6014cabdff1aSopenharmony_ci int denom, \ 6015cabdff1aSopenharmony_ci int weight0, \ 6016cabdff1aSopenharmony_ci int weight1, \ 6017cabdff1aSopenharmony_ci int offset0, \ 6018cabdff1aSopenharmony_ci int offset1, \ 6019cabdff1aSopenharmony_ci intptr_t mx, \ 6020cabdff1aSopenharmony_ci intptr_t my, \ 6021cabdff1aSopenharmony_ci int width) \ 6022cabdff1aSopenharmony_ci{ \ 6023cabdff1aSopenharmony_ci const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 6024cabdff1aSopenharmony_ci const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 6025cabdff1aSopenharmony_ci int log2Wd = denom + 14 - 8; \ 6026cabdff1aSopenharmony_ci \ 6027cabdff1aSopenharmony_ci hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 
6028cabdff1aSopenharmony_ci MAX_PB_SIZE, dst, dst_stride, \ 6029cabdff1aSopenharmony_ci filter_x, filter_y, height, \ 6030cabdff1aSopenharmony_ci weight0, weight1, offset0, \ 6031cabdff1aSopenharmony_ci offset1, log2Wd); \ 6032cabdff1aSopenharmony_ci} 6033cabdff1aSopenharmony_ci 6034cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 4, 8); 6035cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 8, 8); 6036cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 12, 8); 6037cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 16, 8); 6038cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 24, 8); 6039cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 32, 8); 6040cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 48, 8); 6041cabdff1aSopenharmony_ciBI_W_MC_HV(qpel, 64, 8); 6042cabdff1aSopenharmony_ci 6043cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 4, 4); 6044cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 8, 4); 6045cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 6, 4); 6046cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 12, 4); 6047cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 16, 4); 6048cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 24, 4); 6049cabdff1aSopenharmony_ciBI_W_MC_HV(epel, 32, 4); 6050cabdff1aSopenharmony_ci 6051cabdff1aSopenharmony_ci#undef BI_W_MC_HV 6052