1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h" 23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { 26cabdff1aSopenharmony_ci /* 8 width cases */ 27cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28cabdff1aSopenharmony_ci /* 4 width cases */ 29cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 30cabdff1aSopenharmony_ci}; 31cabdff1aSopenharmony_ci 
32cabdff1aSopenharmony_ci#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \ 33cabdff1aSopenharmony_ci out0_h, out1_h) \ 34cabdff1aSopenharmony_ci{ \ 35cabdff1aSopenharmony_ci v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m; \ 36cabdff1aSopenharmony_ci \ 37cabdff1aSopenharmony_ci ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m); \ 38cabdff1aSopenharmony_ci ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m); \ 39cabdff1aSopenharmony_ci DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w, \ 40cabdff1aSopenharmony_ci wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m); \ 41cabdff1aSopenharmony_ci SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w); \ 42cabdff1aSopenharmony_ci PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \ 43cabdff1aSopenharmony_ci ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h); \ 44cabdff1aSopenharmony_ci CLIP_SH2_0_255(out0_h, out1_h); \ 45cabdff1aSopenharmony_ci} 46cabdff1aSopenharmony_ci 47cabdff1aSopenharmony_ci#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \ 48cabdff1aSopenharmony_ci offset_h, rnd_w, out0_h, out1_h, \ 49cabdff1aSopenharmony_ci out2_h, out3_h) \ 50cabdff1aSopenharmony_ci{ \ 51cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \ 52cabdff1aSopenharmony_ci out0_h, out1_h); \ 53cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w, \ 54cabdff1aSopenharmony_ci out2_h, out3_h); \ 55cabdff1aSopenharmony_ci} 56cabdff1aSopenharmony_ci 57cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_4w_msa(uint8_t *src, 58cabdff1aSopenharmony_ci int32_t src_stride, 59cabdff1aSopenharmony_ci uint8_t *dst, 60cabdff1aSopenharmony_ci int32_t dst_stride, 61cabdff1aSopenharmony_ci int32_t height, 62cabdff1aSopenharmony_ci int32_t weight, 63cabdff1aSopenharmony_ci int32_t offset, 64cabdff1aSopenharmony_ci int32_t rnd_val) 65cabdff1aSopenharmony_ci{ 66cabdff1aSopenharmony_ci uint32_t loop_cnt, tp0, 
tp1, tp2, tp3; 67cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 68cabdff1aSopenharmony_ci v16u8 out0, out1; 69cabdff1aSopenharmony_ci v16i8 src0 = { 0 }, src1 = { 0 }; 70cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, offset_vec; 71cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 72cabdff1aSopenharmony_ci 73cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 74cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 75cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 76cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_ci if (2 == height) { 79cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l; 80cabdff1aSopenharmony_ci 81cabdff1aSopenharmony_ci LW2(src, src_stride, tp0, tp1); 82cabdff1aSopenharmony_ci INSERT_W2_SB(tp0, tp1, src0); 83cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_ilvr_b(zero, src0); 84cabdff1aSopenharmony_ci dst0 <<= 6; 85cabdff1aSopenharmony_ci 86cabdff1aSopenharmony_ci ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l); 87cabdff1aSopenharmony_ci DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l); 88cabdff1aSopenharmony_ci SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 89cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 90cabdff1aSopenharmony_ci dst0 += offset_vec; 91cabdff1aSopenharmony_ci CLIP_SH_0_255(dst0); 92cabdff1aSopenharmony_ci out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); 93cabdff1aSopenharmony_ci ST_W2(out0, 0, 1, dst, dst_stride); 94cabdff1aSopenharmony_ci } else if (4 == height) { 95cabdff1aSopenharmony_ci LW4(src, src_stride, tp0, tp1, tp2, tp3); 96cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); 97cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 98cabdff1aSopenharmony_ci SLLI_2V(dst0, dst1, 6); 99cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, 100cabdff1aSopenharmony_ci rnd_vec, dst0, dst1); 101cabdff1aSopenharmony_ci out0 = (v16u8) __msa_pckev_b((v16i8) dst1, 
(v16i8) dst0); 102cabdff1aSopenharmony_ci ST_W4(out0, 0, 1, 2, 3, dst, dst_stride); 103cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 104cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 105cabdff1aSopenharmony_ci LW4(src, src_stride, tp0, tp1, tp2, tp3); 106cabdff1aSopenharmony_ci src += 4 * src_stride; 107cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); 108cabdff1aSopenharmony_ci LW4(src, src_stride, tp0, tp1, tp2, tp3); 109cabdff1aSopenharmony_ci src += 4 * src_stride; 110cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src1); 111cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 112cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 113cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 114cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 115cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, 116cabdff1aSopenharmony_ci dst2, dst3); 117cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 118cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 119cabdff1aSopenharmony_ci dst += 8 * dst_stride; 120cabdff1aSopenharmony_ci } 121cabdff1aSopenharmony_ci } 122cabdff1aSopenharmony_ci} 123cabdff1aSopenharmony_ci 124cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_6w_msa(uint8_t *src, 125cabdff1aSopenharmony_ci int32_t src_stride, 126cabdff1aSopenharmony_ci uint8_t *dst, 127cabdff1aSopenharmony_ci int32_t dst_stride, 128cabdff1aSopenharmony_ci int32_t height, 129cabdff1aSopenharmony_ci int32_t weight, 130cabdff1aSopenharmony_ci int32_t offset, 131cabdff1aSopenharmony_ci int32_t rnd_val) 132cabdff1aSopenharmony_ci{ 133cabdff1aSopenharmony_ci uint32_t loop_cnt; 134cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 135cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 136cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 137cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 
138cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec; 139cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 140cabdff1aSopenharmony_ci 141cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 142cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 143cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 144cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 147cabdff1aSopenharmony_ci LD4(src, src_stride, tp0, tp1, tp2, tp3); 148cabdff1aSopenharmony_ci src += (4 * src_stride); 149cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 150cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 151cabdff1aSopenharmony_ci LD4(src, src_stride, tp0, tp1, tp2, tp3); 152cabdff1aSopenharmony_ci src += (4 * src_stride); 153cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src2); 154cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src3); 155cabdff1aSopenharmony_ci 156cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 157cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 158cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 159cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 160cabdff1aSopenharmony_ci 161cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 162cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 163cabdff1aSopenharmony_ci 164cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 165cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 166cabdff1aSopenharmony_ci dst3); 167cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 168cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 169cabdff1aSopenharmony_ci dst7); 170cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 171cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 
172cabdff1aSopenharmony_ci 173cabdff1aSopenharmony_ci ST_W2(out0, 0, 2, dst, dst_stride); 174cabdff1aSopenharmony_ci ST_H2(out0, 2, 6, dst + 4, dst_stride); 175cabdff1aSopenharmony_ci ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 176cabdff1aSopenharmony_ci ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 177cabdff1aSopenharmony_ci dst += (4 * dst_stride); 178cabdff1aSopenharmony_ci ST_W2(out2, 0, 2, dst, dst_stride); 179cabdff1aSopenharmony_ci ST_H2(out2, 2, 6, dst + 4, dst_stride); 180cabdff1aSopenharmony_ci ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride); 181cabdff1aSopenharmony_ci ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 182cabdff1aSopenharmony_ci dst += (4 * dst_stride); 183cabdff1aSopenharmony_ci } 184cabdff1aSopenharmony_ci} 185cabdff1aSopenharmony_ci 186cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_8w_msa(uint8_t *src, 187cabdff1aSopenharmony_ci int32_t src_stride, 188cabdff1aSopenharmony_ci uint8_t *dst, 189cabdff1aSopenharmony_ci int32_t dst_stride, 190cabdff1aSopenharmony_ci int32_t height, 191cabdff1aSopenharmony_ci int32_t weight, 192cabdff1aSopenharmony_ci int32_t offset, 193cabdff1aSopenharmony_ci int32_t rnd_val) 194cabdff1aSopenharmony_ci{ 195cabdff1aSopenharmony_ci uint32_t loop_cnt; 196cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 197cabdff1aSopenharmony_ci v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 198cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 199cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 200cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec; 201cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 202cabdff1aSopenharmony_ci 203cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 204cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 205cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 206cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 207cabdff1aSopenharmony_ci 208cabdff1aSopenharmony_ci if (2 == height) { 
209cabdff1aSopenharmony_ci LD2(src, src_stride, tp0, tp1); 210cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 211cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 212cabdff1aSopenharmony_ci SLLI_2V(dst0, dst1, 6); 213cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, 214cabdff1aSopenharmony_ci rnd_vec, dst0, dst1); 215cabdff1aSopenharmony_ci out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 216cabdff1aSopenharmony_ci ST_D2(out0, 0, 1, dst, dst_stride); 217cabdff1aSopenharmony_ci } else if (4 == height) { 218cabdff1aSopenharmony_ci LD4(src, src_stride, tp0, tp1, tp2, tp3); 219cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 220cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 221cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 222cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 223cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 224cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 225cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 226cabdff1aSopenharmony_ci dst3); 227cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 228cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 229cabdff1aSopenharmony_ci } else if (6 == height) { 230cabdff1aSopenharmony_ci LD4(src, src_stride, tp0, tp1, tp2, tp3); 231cabdff1aSopenharmony_ci src += 4 * src_stride; 232cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 233cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 234cabdff1aSopenharmony_ci LD2(src, src_stride, tp0, tp1); 235cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src2); 236cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 237cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 238cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 239cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 240cabdff1aSopenharmony_ci SLLI_2V(dst4, dst5, 
6); 241cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 242cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 243cabdff1aSopenharmony_ci dst3); 244cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 245cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 246cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 247cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 248cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 249cabdff1aSopenharmony_ci } else if (0 == height % 8) { 250cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 251cabdff1aSopenharmony_ci LD4(src, src_stride, tp0, tp1, tp2, tp3); 252cabdff1aSopenharmony_ci src += 4 * src_stride; 253cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 254cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 255cabdff1aSopenharmony_ci LD4(src, src_stride, tp0, tp1, tp2, tp3); 256cabdff1aSopenharmony_ci src += 4 * src_stride; 257cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src2); 258cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src3); 259cabdff1aSopenharmony_ci 260cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 261cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 262cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 263cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 264cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 265cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 266cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 267cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, 268cabdff1aSopenharmony_ci dst2, dst3); 269cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 270cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, 271cabdff1aSopenharmony_ci dst6, dst7); 
272cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 273cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 274cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, 275cabdff1aSopenharmony_ci dst, dst_stride); 276cabdff1aSopenharmony_ci dst += (8 * dst_stride); 277cabdff1aSopenharmony_ci } 278cabdff1aSopenharmony_ci } 279cabdff1aSopenharmony_ci} 280cabdff1aSopenharmony_ci 281cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_12w_msa(uint8_t *src, 282cabdff1aSopenharmony_ci int32_t src_stride, 283cabdff1aSopenharmony_ci uint8_t *dst, 284cabdff1aSopenharmony_ci int32_t dst_stride, 285cabdff1aSopenharmony_ci int32_t height, 286cabdff1aSopenharmony_ci int32_t weight, 287cabdff1aSopenharmony_ci int32_t offset, 288cabdff1aSopenharmony_ci int32_t rnd_val) 289cabdff1aSopenharmony_ci{ 290cabdff1aSopenharmony_ci uint32_t loop_cnt; 291cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 292cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 293cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 294cabdff1aSopenharmony_ci v8i16 offset_vec; 295cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 296cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 297cabdff1aSopenharmony_ci 298cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 299cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 300cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 301cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 302cabdff1aSopenharmony_ci 303cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 304cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 305cabdff1aSopenharmony_ci src += (4 * src_stride); 306cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 307cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 308cabdff1aSopenharmony_ci 309cabdff1aSopenharmony_ci ILVL_W2_SB(src1, src0, src3, src2, src0, src1); 310cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src0, 
zero, src1, dst4, dst5); 311cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 312cabdff1aSopenharmony_ci SLLI_2V(dst4, dst5, 6); 313cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 314cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 315cabdff1aSopenharmony_ci dst3); 316cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 317cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 318cabdff1aSopenharmony_ci 319cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 320cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 321cabdff1aSopenharmony_ci ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 322cabdff1aSopenharmony_ci dst += (4 * dst_stride); 323cabdff1aSopenharmony_ci } 324cabdff1aSopenharmony_ci} 325cabdff1aSopenharmony_ci 326cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_16w_msa(uint8_t *src, 327cabdff1aSopenharmony_ci int32_t src_stride, 328cabdff1aSopenharmony_ci uint8_t *dst, 329cabdff1aSopenharmony_ci int32_t dst_stride, 330cabdff1aSopenharmony_ci int32_t height, 331cabdff1aSopenharmony_ci int32_t weight, 332cabdff1aSopenharmony_ci int32_t offset, 333cabdff1aSopenharmony_ci int32_t rnd_val) 334cabdff1aSopenharmony_ci{ 335cabdff1aSopenharmony_ci uint32_t loop_cnt; 336cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 337cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 338cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 339cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec; 340cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 341cabdff1aSopenharmony_ci 342cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 343cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 344cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 345cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 346cabdff1aSopenharmony_ci 347cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; 
loop_cnt--;) { 348cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 349cabdff1aSopenharmony_ci src += (4 * src_stride); 350cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 351cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 352cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 353cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 354cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 355cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 356cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 357cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 358cabdff1aSopenharmony_ci dst3); 359cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 360cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 361cabdff1aSopenharmony_ci dst7); 362cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 363cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 364cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, dst_stride); 365cabdff1aSopenharmony_ci dst += (4 * dst_stride); 366cabdff1aSopenharmony_ci } 367cabdff1aSopenharmony_ci} 368cabdff1aSopenharmony_ci 369cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_24w_msa(uint8_t *src, 370cabdff1aSopenharmony_ci int32_t src_stride, 371cabdff1aSopenharmony_ci uint8_t *dst, 372cabdff1aSopenharmony_ci int32_t dst_stride, 373cabdff1aSopenharmony_ci int32_t height, 374cabdff1aSopenharmony_ci int32_t weight, 375cabdff1aSopenharmony_ci int32_t offset, 376cabdff1aSopenharmony_ci int32_t rnd_val) 377cabdff1aSopenharmony_ci{ 378cabdff1aSopenharmony_ci uint32_t loop_cnt; 379cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5; 380cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 381cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 382cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, 
dst5, dst6, dst7, offset_vec; 383cabdff1aSopenharmony_ci v8i16 dst8, dst9, dst10, dst11; 384cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 385cabdff1aSopenharmony_ci 386cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 387cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 388cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 389cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 390cabdff1aSopenharmony_ci 391cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 392cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src4, src5); 393cabdff1aSopenharmony_ci LD_SB4(src + 16, src_stride, src2, src3, src6, src7); 394cabdff1aSopenharmony_ci src += (4 * src_stride); 395cabdff1aSopenharmony_ci 396cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 397cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 398cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5); 399cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src4, dst6, dst7); 400cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src5, dst8, dst9); 401cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11); 402cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 403cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 404cabdff1aSopenharmony_ci SLLI_4V(dst8, dst9, dst10, dst11, 6); 405cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 406cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 407cabdff1aSopenharmony_ci dst3); 408cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 409cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 410cabdff1aSopenharmony_ci dst7); 411cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec, 412cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst8, dst9, dst10, 413cabdff1aSopenharmony_ci dst11); 414cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, 
dst5, dst4, out0, out1, out2); 415cabdff1aSopenharmony_ci PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); 416cabdff1aSopenharmony_ci ST_UB4(out0, out1, out3, out4, dst, dst_stride); 417cabdff1aSopenharmony_ci ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride); 418cabdff1aSopenharmony_ci dst += (4 * dst_stride); 419cabdff1aSopenharmony_ci } 420cabdff1aSopenharmony_ci} 421cabdff1aSopenharmony_ci 422cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_32w_msa(uint8_t *src, 423cabdff1aSopenharmony_ci int32_t src_stride, 424cabdff1aSopenharmony_ci uint8_t *dst, 425cabdff1aSopenharmony_ci int32_t dst_stride, 426cabdff1aSopenharmony_ci int32_t height, 427cabdff1aSopenharmony_ci int32_t weight, 428cabdff1aSopenharmony_ci int32_t offset, 429cabdff1aSopenharmony_ci int32_t rnd_val) 430cabdff1aSopenharmony_ci{ 431cabdff1aSopenharmony_ci uint32_t loop_cnt; 432cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 433cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 434cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 435cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec; 436cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 437cabdff1aSopenharmony_ci 438cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 439cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 440cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 441cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 442cabdff1aSopenharmony_ci 443cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 444cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 445cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src2, src3); 446cabdff1aSopenharmony_ci src += (2 * src_stride); 447cabdff1aSopenharmony_ci 448cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 449cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 450cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 451cabdff1aSopenharmony_ci 
ILVRL_B2_SH(zero, src3, dst6, dst7); 452cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 453cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 454cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 455cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 456cabdff1aSopenharmony_ci dst3); 457cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 458cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 459cabdff1aSopenharmony_ci dst7); 460cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 461cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 462cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, dst_stride); 463cabdff1aSopenharmony_ci ST_UB2(out2, out3, dst + 16, dst_stride); 464cabdff1aSopenharmony_ci dst += (2 * dst_stride); 465cabdff1aSopenharmony_ci } 466cabdff1aSopenharmony_ci} 467cabdff1aSopenharmony_ci 468cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_48w_msa(uint8_t *src, 469cabdff1aSopenharmony_ci int32_t src_stride, 470cabdff1aSopenharmony_ci uint8_t *dst, 471cabdff1aSopenharmony_ci int32_t dst_stride, 472cabdff1aSopenharmony_ci int32_t height, 473cabdff1aSopenharmony_ci int32_t weight, 474cabdff1aSopenharmony_ci int32_t offset, 475cabdff1aSopenharmony_ci int32_t rnd_val) 476cabdff1aSopenharmony_ci{ 477cabdff1aSopenharmony_ci uint32_t loop_cnt; 478cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5; 479cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 480cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 481cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec; 482cabdff1aSopenharmony_ci v8i16 dst6, dst7, dst8, dst9, dst10, dst11; 483cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 484cabdff1aSopenharmony_ci 485cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 486cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 487cabdff1aSopenharmony_ci 
offset_vec = __msa_fill_h(offset); 488cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 489cabdff1aSopenharmony_ci 490cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 491cabdff1aSopenharmony_ci LD_SB3(src, 16, src0, src1, src2); 492cabdff1aSopenharmony_ci src += src_stride; 493cabdff1aSopenharmony_ci LD_SB3(src, 16, src3, src4, src5); 494cabdff1aSopenharmony_ci src += src_stride; 495cabdff1aSopenharmony_ci 496cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 497cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 498cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 499cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 500cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src4, dst8, dst9); 501cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src5, dst10, dst11); 502cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 503cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 504cabdff1aSopenharmony_ci SLLI_4V(dst8, dst9, dst10, dst11, 6); 505cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 506cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 507cabdff1aSopenharmony_ci dst3); 508cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 509cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 510cabdff1aSopenharmony_ci dst7); 511cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec, 512cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst8, dst9, dst10, 513cabdff1aSopenharmony_ci dst11); 514cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 515cabdff1aSopenharmony_ci PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); 516cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 517cabdff1aSopenharmony_ci ST_UB(out2, dst + 32); 518cabdff1aSopenharmony_ci dst += dst_stride; 519cabdff1aSopenharmony_ci ST_UB2(out3, out4, dst, 16); 
520cabdff1aSopenharmony_ci ST_UB(out5, dst + 32); 521cabdff1aSopenharmony_ci dst += dst_stride; 522cabdff1aSopenharmony_ci } 523cabdff1aSopenharmony_ci} 524cabdff1aSopenharmony_ci 525cabdff1aSopenharmony_cistatic void hevc_uniwgt_copy_64w_msa(uint8_t *src, 526cabdff1aSopenharmony_ci int32_t src_stride, 527cabdff1aSopenharmony_ci uint8_t *dst, 528cabdff1aSopenharmony_ci int32_t dst_stride, 529cabdff1aSopenharmony_ci int32_t height, 530cabdff1aSopenharmony_ci int32_t weight, 531cabdff1aSopenharmony_ci int32_t offset, 532cabdff1aSopenharmony_ci int32_t rnd_val) 533cabdff1aSopenharmony_ci{ 534cabdff1aSopenharmony_ci uint32_t loop_cnt; 535cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5, out6, out7; 536cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 537cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 538cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec; 539cabdff1aSopenharmony_ci v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; 540cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 541cabdff1aSopenharmony_ci 542cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 543cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 544cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 545cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 546cabdff1aSopenharmony_ci 547cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 548cabdff1aSopenharmony_ci LD_SB4(src, 16, src0, src1, src2, src3); 549cabdff1aSopenharmony_ci src += src_stride; 550cabdff1aSopenharmony_ci LD_SB4(src, 16, src4, src5, src6, src7); 551cabdff1aSopenharmony_ci src += src_stride; 552cabdff1aSopenharmony_ci 553cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 554cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 555cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 556cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 557cabdff1aSopenharmony_ci 
ILVRL_B2_SH(zero, src4, dst8, dst9); 558cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src5, dst10, dst11); 559cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src6, dst12, dst13); 560cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src7, dst14, dst15); 561cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 562cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 563cabdff1aSopenharmony_ci SLLI_4V(dst8, dst9, dst10, dst11, 6); 564cabdff1aSopenharmony_ci SLLI_4V(dst12, dst13, dst14, dst15, 6); 565cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 566cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 567cabdff1aSopenharmony_ci dst3); 568cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 569cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 570cabdff1aSopenharmony_ci dst7); 571cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec, 572cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst8, dst9, dst10, 573cabdff1aSopenharmony_ci dst11); 574cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec, 575cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst12, dst13, dst14, 576cabdff1aSopenharmony_ci dst15); 577cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 578cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 579cabdff1aSopenharmony_ci PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5); 580cabdff1aSopenharmony_ci PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7); 581cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, 16); 582cabdff1aSopenharmony_ci dst += dst_stride; 583cabdff1aSopenharmony_ci ST_UB4(out4, out5, out6, out7, dst, 16); 584cabdff1aSopenharmony_ci dst += dst_stride; 585cabdff1aSopenharmony_ci } 586cabdff1aSopenharmony_ci} 587cabdff1aSopenharmony_ci 588cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src, 
589cabdff1aSopenharmony_ci int32_t src_stride, 590cabdff1aSopenharmony_ci uint8_t *dst, 591cabdff1aSopenharmony_ci int32_t dst_stride, 592cabdff1aSopenharmony_ci const int8_t *filter, 593cabdff1aSopenharmony_ci int32_t height, 594cabdff1aSopenharmony_ci int32_t weight, 595cabdff1aSopenharmony_ci int32_t offset, 596cabdff1aSopenharmony_ci int32_t rnd_val) 597cabdff1aSopenharmony_ci{ 598cabdff1aSopenharmony_ci uint32_t loop_cnt; 599cabdff1aSopenharmony_ci v16u8 out0, out1; 600cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 601cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 602cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 603cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15; 604cabdff1aSopenharmony_ci v8i16 filter_vec, dst01, dst23, dst45, dst67; 605cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec; 606cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 607cabdff1aSopenharmony_ci 608cabdff1aSopenharmony_ci src -= 3; 609cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 610cabdff1aSopenharmony_ci 611cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 612cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 613cabdff1aSopenharmony_ci 614cabdff1aSopenharmony_ci weight *= 128; 615cabdff1aSopenharmony_ci rnd_val -= 6; 616cabdff1aSopenharmony_ci 617cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 618cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 619cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 622cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 623cabdff1aSopenharmony_ci 624cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 625cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 
626cabdff1aSopenharmony_ci 627cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[16]); 628cabdff1aSopenharmony_ci mask1 = mask0 + 2; 629cabdff1aSopenharmony_ci mask2 = mask0 + 4; 630cabdff1aSopenharmony_ci mask3 = mask0 + 6; 631cabdff1aSopenharmony_ci 632cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 633cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 634cabdff1aSopenharmony_ci src += (8 * src_stride); 635cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 636cabdff1aSopenharmony_ci 637cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, 638cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 639cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, 640cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 641cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3, 642cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 643cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3, 644cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 645cabdff1aSopenharmony_ci dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 646cabdff1aSopenharmony_ci filt3); 647cabdff1aSopenharmony_ci dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 648cabdff1aSopenharmony_ci filt3); 649cabdff1aSopenharmony_ci dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 650cabdff1aSopenharmony_ci filt3); 651cabdff1aSopenharmony_ci dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 652cabdff1aSopenharmony_ci filt2, filt3); 653cabdff1aSopenharmony_ci 654cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec, 655cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 656cabdff1aSopenharmony_ci dst3); 657cabdff1aSopenharmony_ci 658cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 
659cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 660cabdff1aSopenharmony_ci dst += (8 * dst_stride); 661cabdff1aSopenharmony_ci } 662cabdff1aSopenharmony_ci} 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src, 665cabdff1aSopenharmony_ci int32_t src_stride, 666cabdff1aSopenharmony_ci uint8_t *dst, 667cabdff1aSopenharmony_ci int32_t dst_stride, 668cabdff1aSopenharmony_ci const int8_t *filter, 669cabdff1aSopenharmony_ci int32_t height, 670cabdff1aSopenharmony_ci int32_t weight, 671cabdff1aSopenharmony_ci int32_t offset, 672cabdff1aSopenharmony_ci int32_t rnd_val) 673cabdff1aSopenharmony_ci{ 674cabdff1aSopenharmony_ci uint32_t loop_cnt; 675cabdff1aSopenharmony_ci v16u8 out0, out1; 676cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 677cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 678cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 679cabdff1aSopenharmony_ci v8i16 filter_vec; 680cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 681cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 682cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 683cabdff1aSopenharmony_ci v8i16 weight_vec_h, offset_vec, denom_vec; 684cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 685cabdff1aSopenharmony_ci 686cabdff1aSopenharmony_ci src -= 3; 687cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 688cabdff1aSopenharmony_ci 689cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 690cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 691cabdff1aSopenharmony_ci 692cabdff1aSopenharmony_ci weight *= 128; 693cabdff1aSopenharmony_ci rnd_val -= 6; 694cabdff1aSopenharmony_ci 695cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 696cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 697cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 698cabdff1aSopenharmony_ci 699cabdff1aSopenharmony_ci 
weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 700cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 701cabdff1aSopenharmony_ci 702cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 703cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 704cabdff1aSopenharmony_ci 705cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 706cabdff1aSopenharmony_ci mask1 = mask0 + 2; 707cabdff1aSopenharmony_ci mask2 = mask0 + 4; 708cabdff1aSopenharmony_ci mask3 = mask0 + 6; 709cabdff1aSopenharmony_ci 710cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 711cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 712cabdff1aSopenharmony_ci src += (4 * src_stride); 713cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 716cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 717cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 718cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 719cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 720cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 721cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 722cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 723cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 724cabdff1aSopenharmony_ci filt3); 725cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 726cabdff1aSopenharmony_ci filt3); 727cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 728cabdff1aSopenharmony_ci filt3); 729cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 730cabdff1aSopenharmony_ci filt2, filt3); 731cabdff1aSopenharmony_ci 732cabdff1aSopenharmony_ci 
HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 733cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 734cabdff1aSopenharmony_ci dst3); 735cabdff1aSopenharmony_ci 736cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 737cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 738cabdff1aSopenharmony_ci dst += (4 * dst_stride); 739cabdff1aSopenharmony_ci } 740cabdff1aSopenharmony_ci} 741cabdff1aSopenharmony_ci 742cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src, 743cabdff1aSopenharmony_ci int32_t src_stride, 744cabdff1aSopenharmony_ci uint8_t *dst, 745cabdff1aSopenharmony_ci int32_t dst_stride, 746cabdff1aSopenharmony_ci const int8_t *filter, 747cabdff1aSopenharmony_ci int32_t height, 748cabdff1aSopenharmony_ci int32_t weight, 749cabdff1aSopenharmony_ci int32_t offset, 750cabdff1aSopenharmony_ci int32_t rnd_val) 751cabdff1aSopenharmony_ci{ 752cabdff1aSopenharmony_ci uint32_t loop_cnt; 753cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 754cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 755cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 756cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 757cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 758cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 759cabdff1aSopenharmony_ci v8i16 filter_vec; 760cabdff1aSopenharmony_ci v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5; 761cabdff1aSopenharmony_ci v8i16 weight_vec_h, offset_vec, denom_vec; 762cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 763cabdff1aSopenharmony_ci 764cabdff1aSopenharmony_ci src -= 3; 765cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 766cabdff1aSopenharmony_ci 767cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 768cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 769cabdff1aSopenharmony_ci 
770cabdff1aSopenharmony_ci weight *= 128; 771cabdff1aSopenharmony_ci rnd_val -= 6; 772cabdff1aSopenharmony_ci 773cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 774cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 775cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 776cabdff1aSopenharmony_ci 777cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 778cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 779cabdff1aSopenharmony_ci 780cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 781cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 782cabdff1aSopenharmony_ci 783cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 784cabdff1aSopenharmony_ci mask1 = mask0 + 2; 785cabdff1aSopenharmony_ci mask2 = mask0 + 4; 786cabdff1aSopenharmony_ci mask3 = mask0 + 6; 787cabdff1aSopenharmony_ci mask4 = LD_SB(&ff_hevc_mask_arr[16]); 788cabdff1aSopenharmony_ci mask5 = mask4 + 2; 789cabdff1aSopenharmony_ci mask6 = mask4 + 4; 790cabdff1aSopenharmony_ci mask7 = mask4 + 6; 791cabdff1aSopenharmony_ci 792cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 793cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 794cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src4, src5, src6, src7); 795cabdff1aSopenharmony_ci src += (4 * src_stride); 796cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 797cabdff1aSopenharmony_ci 798cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 799cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 800cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 801cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 802cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 803cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 804cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 
805cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 806cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 807cabdff1aSopenharmony_ci filt3); 808cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 809cabdff1aSopenharmony_ci filt3); 810cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 811cabdff1aSopenharmony_ci filt3); 812cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 813cabdff1aSopenharmony_ci filt2, filt3); 814cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7, 815cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 816cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7, 817cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 818cabdff1aSopenharmony_ci dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 819cabdff1aSopenharmony_ci filt3); 820cabdff1aSopenharmony_ci dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 821cabdff1aSopenharmony_ci filt3); 822cabdff1aSopenharmony_ci 823cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 824cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 825cabdff1aSopenharmony_ci dst3); 826cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec, 827cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 828cabdff1aSopenharmony_ci 829cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 830cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 831cabdff1aSopenharmony_ci ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 832cabdff1aSopenharmony_ci dst += (4 * dst_stride); 833cabdff1aSopenharmony_ci } 834cabdff1aSopenharmony_ci} 835cabdff1aSopenharmony_ci 836cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src, 837cabdff1aSopenharmony_ci int32_t 
src_stride, 838cabdff1aSopenharmony_ci uint8_t *dst, 839cabdff1aSopenharmony_ci int32_t dst_stride, 840cabdff1aSopenharmony_ci const int8_t *filter, 841cabdff1aSopenharmony_ci int32_t height, 842cabdff1aSopenharmony_ci int32_t weight, 843cabdff1aSopenharmony_ci int32_t offset, 844cabdff1aSopenharmony_ci int32_t rnd_val) 845cabdff1aSopenharmony_ci{ 846cabdff1aSopenharmony_ci uint32_t loop_cnt; 847cabdff1aSopenharmony_ci v16u8 out0, out1; 848cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 849cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 850cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 851cabdff1aSopenharmony_ci v8i16 filter_vec; 852cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 853cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 854cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 855cabdff1aSopenharmony_ci v8i16 weight_vec_h, offset_vec, denom_vec; 856cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 857cabdff1aSopenharmony_ci 858cabdff1aSopenharmony_ci src -= 3; 859cabdff1aSopenharmony_ci 860cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 861cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 862cabdff1aSopenharmony_ci 863cabdff1aSopenharmony_ci weight *= 128; 864cabdff1aSopenharmony_ci rnd_val -= 6; 865cabdff1aSopenharmony_ci 866cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 867cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 868cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 869cabdff1aSopenharmony_ci 870cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 871cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 872cabdff1aSopenharmony_ci 873cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 874cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 875cabdff1aSopenharmony_ci 876cabdff1aSopenharmony_ci mask0 = 
LD_SB(&ff_hevc_mask_arr[0]); 877cabdff1aSopenharmony_ci mask1 = mask0 + 2; 878cabdff1aSopenharmony_ci mask2 = mask0 + 4; 879cabdff1aSopenharmony_ci mask3 = mask0 + 6; 880cabdff1aSopenharmony_ci 881cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 882cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src2); 883cabdff1aSopenharmony_ci LD_SB2(src + 8, src_stride, src1, src3); 884cabdff1aSopenharmony_ci src += (2 * src_stride); 885cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 886cabdff1aSopenharmony_ci 887cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 888cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 889cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 890cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 891cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 892cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 893cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 894cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 895cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 896cabdff1aSopenharmony_ci filt3); 897cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 898cabdff1aSopenharmony_ci filt3); 899cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 900cabdff1aSopenharmony_ci filt3); 901cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 902cabdff1aSopenharmony_ci filt2, filt3); 903cabdff1aSopenharmony_ci 904cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 905cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 906cabdff1aSopenharmony_ci dst3); 907cabdff1aSopenharmony_ci 908cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 909cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, dst_stride); 
910cabdff1aSopenharmony_ci dst += (2 * dst_stride); 911cabdff1aSopenharmony_ci } 912cabdff1aSopenharmony_ci} 913cabdff1aSopenharmony_ci 914cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src, 915cabdff1aSopenharmony_ci int32_t src_stride, 916cabdff1aSopenharmony_ci uint8_t *dst, 917cabdff1aSopenharmony_ci int32_t dst_stride, 918cabdff1aSopenharmony_ci const int8_t *filter, 919cabdff1aSopenharmony_ci int32_t height, 920cabdff1aSopenharmony_ci int32_t weight, 921cabdff1aSopenharmony_ci int32_t offset, 922cabdff1aSopenharmony_ci int32_t rnd_val) 923cabdff1aSopenharmony_ci{ 924cabdff1aSopenharmony_ci uint32_t loop_cnt; 925cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 926cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 927cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 928cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 929cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 930cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 931cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 932cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 933cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 934cabdff1aSopenharmony_ci 935cabdff1aSopenharmony_ci src -= 3; 936cabdff1aSopenharmony_ci 937cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 938cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 939cabdff1aSopenharmony_ci 940cabdff1aSopenharmony_ci weight *= 128; 941cabdff1aSopenharmony_ci rnd_val -= 6; 942cabdff1aSopenharmony_ci 943cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 944cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 945cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 946cabdff1aSopenharmony_ci 947cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 948cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, 
weight_vec_h); 949cabdff1aSopenharmony_ci 950cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 951cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 952cabdff1aSopenharmony_ci 953cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 954cabdff1aSopenharmony_ci mask1 = mask0 + 2; 955cabdff1aSopenharmony_ci mask2 = mask0 + 4; 956cabdff1aSopenharmony_ci mask3 = mask0 + 6; 957cabdff1aSopenharmony_ci mask4 = mask0 + 8; 958cabdff1aSopenharmony_ci mask5 = mask0 + 10; 959cabdff1aSopenharmony_ci mask6 = mask0 + 12; 960cabdff1aSopenharmony_ci mask7 = mask0 + 14; 961cabdff1aSopenharmony_ci 962cabdff1aSopenharmony_ci for (loop_cnt = 16; loop_cnt--;) { 963cabdff1aSopenharmony_ci LD_SB2(src, 16, src0, src1); 964cabdff1aSopenharmony_ci src += src_stride; 965cabdff1aSopenharmony_ci LD_SB2(src, 16, src2, src3); 966cabdff1aSopenharmony_ci src += src_stride; 967cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 968cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 969cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 970cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 971cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 972cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 973cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 974cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 975cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 976cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 977cabdff1aSopenharmony_ci filt3); 978cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 979cabdff1aSopenharmony_ci filt3); 980cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 981cabdff1aSopenharmony_ci filt3); 982cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 
983cabdff1aSopenharmony_ci filt2, filt3); 984cabdff1aSopenharmony_ci 985cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, 986cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 987cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 988cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 989cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 990cabdff1aSopenharmony_ci filt3); 991cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 992cabdff1aSopenharmony_ci filt3); 993cabdff1aSopenharmony_ci 994cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 995cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 996cabdff1aSopenharmony_ci dst3); 997cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 998cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 999cabdff1aSopenharmony_ci 1000cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2); 1001cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, dst_stride); 1002cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 16, dst_stride); 1003cabdff1aSopenharmony_ci dst += (2 * dst_stride); 1004cabdff1aSopenharmony_ci } 1005cabdff1aSopenharmony_ci} 1006cabdff1aSopenharmony_ci 1007cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src, 1008cabdff1aSopenharmony_ci int32_t src_stride, 1009cabdff1aSopenharmony_ci uint8_t *dst, 1010cabdff1aSopenharmony_ci int32_t dst_stride, 1011cabdff1aSopenharmony_ci const int8_t *filter, 1012cabdff1aSopenharmony_ci int32_t height, 1013cabdff1aSopenharmony_ci int32_t weight, 1014cabdff1aSopenharmony_ci int32_t offset, 1015cabdff1aSopenharmony_ci int32_t rnd_val) 1016cabdff1aSopenharmony_ci{ 1017cabdff1aSopenharmony_ci uint32_t loop_cnt; 1018cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 1019cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, 
src5, src6, src7; 1020cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1021cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 1022cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1023cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1024cabdff1aSopenharmony_ci v8i16 filter_vec; 1025cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1026cabdff1aSopenharmony_ci v8i16 weight_vec_h, offset_vec, denom_vec; 1027cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 1028cabdff1aSopenharmony_ci 1029cabdff1aSopenharmony_ci src -= 3; 1030cabdff1aSopenharmony_ci 1031cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1032cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1033cabdff1aSopenharmony_ci 1034cabdff1aSopenharmony_ci weight *= 128; 1035cabdff1aSopenharmony_ci rnd_val -= 6; 1036cabdff1aSopenharmony_ci 1037cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 1038cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 1039cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 1040cabdff1aSopenharmony_ci 1041cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 1042cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 1043cabdff1aSopenharmony_ci 1044cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1045cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1046cabdff1aSopenharmony_ci 1047cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1048cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1049cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1050cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1051cabdff1aSopenharmony_ci 1052cabdff1aSopenharmony_ci for (loop_cnt = height >> 1; loop_cnt--;) { 1053cabdff1aSopenharmony_ci LD_SB4(src, 8, src0, src1, src2, src3); 1054cabdff1aSopenharmony_ci src += src_stride; 1055cabdff1aSopenharmony_ci LD_SB4(src, 8, src4, src5, src6, 
src7); 1056cabdff1aSopenharmony_ci src += src_stride; 1057cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 1058cabdff1aSopenharmony_ci 1059cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1060cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1061cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1062cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1063cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1064cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1065cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1066cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1067cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1068cabdff1aSopenharmony_ci filt3); 1069cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1070cabdff1aSopenharmony_ci filt3); 1071cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1072cabdff1aSopenharmony_ci filt3); 1073cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1074cabdff1aSopenharmony_ci filt2, filt3); 1075cabdff1aSopenharmony_ci 1076cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1077cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1078cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 1079cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1080cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 1081cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1082cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 1083cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1084cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1085cabdff1aSopenharmony_ci filt3); 1086cabdff1aSopenharmony_ci dst5 = 
HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1087cabdff1aSopenharmony_ci filt3); 1088cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1089cabdff1aSopenharmony_ci filt3); 1090cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1091cabdff1aSopenharmony_ci filt2, filt3); 1092cabdff1aSopenharmony_ci 1093cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 1094cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 1095cabdff1aSopenharmony_ci dst3); 1096cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 1097cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 1098cabdff1aSopenharmony_ci dst7); 1099cabdff1aSopenharmony_ci 1100cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 1101cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 1102cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 1103cabdff1aSopenharmony_ci dst += dst_stride; 1104cabdff1aSopenharmony_ci ST_UB2(out2, out3, dst, 16); 1105cabdff1aSopenharmony_ci dst += dst_stride; 1106cabdff1aSopenharmony_ci } 1107cabdff1aSopenharmony_ci} 1108cabdff1aSopenharmony_ci 1109cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src, 1110cabdff1aSopenharmony_ci int32_t src_stride, 1111cabdff1aSopenharmony_ci uint8_t *dst, 1112cabdff1aSopenharmony_ci int32_t dst_stride, 1113cabdff1aSopenharmony_ci const int8_t *filter, 1114cabdff1aSopenharmony_ci int32_t height, 1115cabdff1aSopenharmony_ci int32_t weight, 1116cabdff1aSopenharmony_ci int32_t offset, 1117cabdff1aSopenharmony_ci int32_t rnd_val) 1118cabdff1aSopenharmony_ci{ 1119cabdff1aSopenharmony_ci uint32_t loop_cnt; 1120cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 1121cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 1122cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1123cabdff1aSopenharmony_ci v16i8 
mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1124cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1125cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1126cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 1127cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 1128cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 1129cabdff1aSopenharmony_ci 1130cabdff1aSopenharmony_ci src -= 3; 1131cabdff1aSopenharmony_ci 1132cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 1133cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1134cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1135cabdff1aSopenharmony_ci 1136cabdff1aSopenharmony_ci weight *= 128; 1137cabdff1aSopenharmony_ci rnd_val -= 6; 1138cabdff1aSopenharmony_ci 1139cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 1140cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 1141cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 1142cabdff1aSopenharmony_ci 1143cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 1144cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 1145cabdff1aSopenharmony_ci 1146cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1147cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1148cabdff1aSopenharmony_ci 1149cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1150cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1151cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1152cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1153cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1154cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1155cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1156cabdff1aSopenharmony_ci mask7 = mask0 + 14; 1157cabdff1aSopenharmony_ci 1158cabdff1aSopenharmony_ci for (loop_cnt = 64; loop_cnt--;) { 1159cabdff1aSopenharmony_ci LD_SB3(src, 16, src0, src1, src2); 
1160cabdff1aSopenharmony_ci src3 = LD_SB(src + 40); 1161cabdff1aSopenharmony_ci src += src_stride; 1162cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1163cabdff1aSopenharmony_ci 1164cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1165cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1166cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 1167cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1168cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1169cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1170cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, 1171cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1172cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1173cabdff1aSopenharmony_ci filt3); 1174cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1175cabdff1aSopenharmony_ci filt3); 1176cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1177cabdff1aSopenharmony_ci filt3); 1178cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1179cabdff1aSopenharmony_ci filt2, filt3); 1180cabdff1aSopenharmony_ci 1181cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1182cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1183cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1184cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1185cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1186cabdff1aSopenharmony_ci filt3); 1187cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1188cabdff1aSopenharmony_ci filt3); 1189cabdff1aSopenharmony_ci 1190cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 1191cabdff1aSopenharmony_ci offset_vec, 
rnd_vec, dst0, dst1, dst2, 1192cabdff1aSopenharmony_ci dst3); 1193cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 1194cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 1195cabdff1aSopenharmony_ci 1196cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 1197cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 1198cabdff1aSopenharmony_ci ST_UB(out2, dst + 32); 1199cabdff1aSopenharmony_ci dst += dst_stride; 1200cabdff1aSopenharmony_ci } 1201cabdff1aSopenharmony_ci} 1202cabdff1aSopenharmony_ci 1203cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src, 1204cabdff1aSopenharmony_ci int32_t src_stride, 1205cabdff1aSopenharmony_ci uint8_t *dst, 1206cabdff1aSopenharmony_ci int32_t dst_stride, 1207cabdff1aSopenharmony_ci const int8_t *filter, 1208cabdff1aSopenharmony_ci int32_t height, 1209cabdff1aSopenharmony_ci int32_t weight, 1210cabdff1aSopenharmony_ci int32_t offset, 1211cabdff1aSopenharmony_ci int32_t rnd_val) 1212cabdff1aSopenharmony_ci{ 1213cabdff1aSopenharmony_ci uint8_t *src_tmp; 1214cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1215cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1216cabdff1aSopenharmony_ci v16u8 out0, out1; 1217cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 1218cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1219cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1220cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1221cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1222cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 1223cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 1224cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 1225cabdff1aSopenharmony_ci 1226cabdff1aSopenharmony_ci src -= 3; 1227cabdff1aSopenharmony_ci 1228cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1229cabdff1aSopenharmony_ci rnd_vec = 
__msa_fill_w(rnd_val); 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci weight *= 128; 1232cabdff1aSopenharmony_ci rnd_val -= 6; 1233cabdff1aSopenharmony_ci 1234cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 1235cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 1236cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 1237cabdff1aSopenharmony_ci 1238cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 1239cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 1240cabdff1aSopenharmony_ci 1241cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1242cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1243cabdff1aSopenharmony_ci 1244cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1245cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1246cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1247cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1248cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1249cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1250cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1251cabdff1aSopenharmony_ci mask7 = mask0 + 14; 1252cabdff1aSopenharmony_ci 1253cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1254cabdff1aSopenharmony_ci src_tmp = src; 1255cabdff1aSopenharmony_ci dst_tmp = dst; 1256cabdff1aSopenharmony_ci 1257cabdff1aSopenharmony_ci for (cnt = 2; cnt--;) { 1258cabdff1aSopenharmony_ci LD_SB2(src_tmp, 16, src0, src1); 1259cabdff1aSopenharmony_ci src2 = LD_SB(src_tmp + 24); 1260cabdff1aSopenharmony_ci src_tmp += 32; 1261cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1262cabdff1aSopenharmony_ci 1263cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1264cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1265cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 1266cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1267cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, 
mask3, 1268cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1269cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1270cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1271cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1272cabdff1aSopenharmony_ci filt2, filt3); 1273cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, 1274cabdff1aSopenharmony_ci filt2, filt3); 1275cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, 1276cabdff1aSopenharmony_ci filt2, filt3); 1277cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1278cabdff1aSopenharmony_ci filt2, filt3); 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 1281cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, 1282cabdff1aSopenharmony_ci dst2, dst3); 1283cabdff1aSopenharmony_ci 1284cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 1285cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst_tmp, 16); 1286cabdff1aSopenharmony_ci dst_tmp += 32; 1287cabdff1aSopenharmony_ci } 1288cabdff1aSopenharmony_ci 1289cabdff1aSopenharmony_ci src += src_stride; 1290cabdff1aSopenharmony_ci dst += dst_stride; 1291cabdff1aSopenharmony_ci } 1292cabdff1aSopenharmony_ci} 1293cabdff1aSopenharmony_ci 1294cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src, 1295cabdff1aSopenharmony_ci int32_t src_stride, 1296cabdff1aSopenharmony_ci uint8_t *dst, 1297cabdff1aSopenharmony_ci int32_t dst_stride, 1298cabdff1aSopenharmony_ci const int8_t *filter, 1299cabdff1aSopenharmony_ci int32_t height, 1300cabdff1aSopenharmony_ci int32_t weight, 1301cabdff1aSopenharmony_ci int32_t offset, 1302cabdff1aSopenharmony_ci int32_t rnd_val) 1303cabdff1aSopenharmony_ci{ 1304cabdff1aSopenharmony_ci int32_t loop_cnt; 1305cabdff1aSopenharmony_ci v16u8 out0, out1; 
1306cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1307cabdff1aSopenharmony_ci v16i8 src9, src10, src11, src12, src13, src14; 1308cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1309cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1310cabdff1aSopenharmony_ci v16i8 src1110_r, src1211_r, src1312_r, src1413_r; 1311cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776, src10998; 1312cabdff1aSopenharmony_ci v16i8 src12111110, src14131312; 1313cabdff1aSopenharmony_ci v8i16 filter_vec, dst01, dst23, dst45, dst67; 1314cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1315cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec; 1316cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 1317cabdff1aSopenharmony_ci 1318cabdff1aSopenharmony_ci src -= (3 * src_stride); 1319cabdff1aSopenharmony_ci 1320cabdff1aSopenharmony_ci 1321cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1322cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1323cabdff1aSopenharmony_ci 1324cabdff1aSopenharmony_ci weight *= 128; 1325cabdff1aSopenharmony_ci rnd_val -= 6; 1326cabdff1aSopenharmony_ci 1327cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 1328cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 1329cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 1330cabdff1aSopenharmony_ci 1331cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 1332cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 1333cabdff1aSopenharmony_ci 1334cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1335cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1336cabdff1aSopenharmony_ci 1337cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1338cabdff1aSopenharmony_ci src += (7 * src_stride); 1339cabdff1aSopenharmony_ci 
1340cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1341cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1342cabdff1aSopenharmony_ci 1343cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1344cabdff1aSopenharmony_ci 1345cabdff1aSopenharmony_ci ILVR_D3_SB(src21_r, src10_r, src43_r, 1346cabdff1aSopenharmony_ci src32_r, src65_r, src54_r, src2110, src4332, src6554); 1347cabdff1aSopenharmony_ci 1348cabdff1aSopenharmony_ci XORI_B3_128_SB(src2110, src4332, src6554); 1349cabdff1aSopenharmony_ci 1350cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 1351cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 1352cabdff1aSopenharmony_ci src7, src8, src9, src10, src11, src12, src13, src14); 1353cabdff1aSopenharmony_ci src += (8 * src_stride); 1354cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1355cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1356cabdff1aSopenharmony_ci ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, 1357cabdff1aSopenharmony_ci src1110_r, src1211_r, src1312_r, src1413_r); 1358cabdff1aSopenharmony_ci ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r, 1359cabdff1aSopenharmony_ci src1413_r, src1312_r, 1360cabdff1aSopenharmony_ci src8776, src10998, src12111110, src14131312); 1361cabdff1aSopenharmony_ci XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); 1362cabdff1aSopenharmony_ci dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0, 1363cabdff1aSopenharmony_ci filt1, filt2, filt3); 1364cabdff1aSopenharmony_ci dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0, 1365cabdff1aSopenharmony_ci filt1, filt2, filt3); 1366cabdff1aSopenharmony_ci dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110, 1367cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1368cabdff1aSopenharmony_ci dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, 
src14131312, 1369cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3); 1370cabdff1aSopenharmony_ci 1371cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec, 1372cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 1373cabdff1aSopenharmony_ci dst3); 1374cabdff1aSopenharmony_ci 1375cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 1376cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1377cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1378cabdff1aSopenharmony_ci 1379cabdff1aSopenharmony_ci src2110 = src10998; 1380cabdff1aSopenharmony_ci src4332 = src12111110; 1381cabdff1aSopenharmony_ci src6554 = src14131312; 1382cabdff1aSopenharmony_ci src6 = src14; 1383cabdff1aSopenharmony_ci } 1384cabdff1aSopenharmony_ci} 1385cabdff1aSopenharmony_ci 1386cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src, 1387cabdff1aSopenharmony_ci int32_t src_stride, 1388cabdff1aSopenharmony_ci uint8_t *dst, 1389cabdff1aSopenharmony_ci int32_t dst_stride, 1390cabdff1aSopenharmony_ci const int8_t *filter, 1391cabdff1aSopenharmony_ci int32_t height, 1392cabdff1aSopenharmony_ci int32_t weight, 1393cabdff1aSopenharmony_ci int32_t offset, 1394cabdff1aSopenharmony_ci int32_t rnd_val) 1395cabdff1aSopenharmony_ci{ 1396cabdff1aSopenharmony_ci int32_t loop_cnt; 1397cabdff1aSopenharmony_ci v16u8 out0, out1; 1398cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1399cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1400cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1401cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1402cabdff1aSopenharmony_ci v8i16 filter_vec; 1403cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec; 1404cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 1405cabdff1aSopenharmony_ci 1406cabdff1aSopenharmony_ci src -= (3 
* src_stride); 1407cabdff1aSopenharmony_ci 1408cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1409cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1410cabdff1aSopenharmony_ci 1411cabdff1aSopenharmony_ci weight *= 128; 1412cabdff1aSopenharmony_ci rnd_val -= 6; 1413cabdff1aSopenharmony_ci 1414cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 1415cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 1416cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 1417cabdff1aSopenharmony_ci 1418cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 1419cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 1420cabdff1aSopenharmony_ci 1421cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1422cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1423cabdff1aSopenharmony_ci 1424cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1425cabdff1aSopenharmony_ci src += (7 * src_stride); 1426cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1427cabdff1aSopenharmony_ci 1428cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1429cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1430cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1431cabdff1aSopenharmony_ci 1432cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1433cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1434cabdff1aSopenharmony_ci src += (4 * src_stride); 1435cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1436cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1437cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1438cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0, 1439cabdff1aSopenharmony_ci filt1, filt2, filt3); 
1440cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0, 1441cabdff1aSopenharmony_ci filt1, filt2, filt3); 1442cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0, 1443cabdff1aSopenharmony_ci filt1, filt2, filt3); 1444cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0, 1445cabdff1aSopenharmony_ci filt1, filt2, filt3); 1446cabdff1aSopenharmony_ci 1447cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 1448cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 1449cabdff1aSopenharmony_ci dst3); 1450cabdff1aSopenharmony_ci 1451cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 1452cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1453cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1454cabdff1aSopenharmony_ci 1455cabdff1aSopenharmony_ci src10_r = src54_r; 1456cabdff1aSopenharmony_ci src32_r = src76_r; 1457cabdff1aSopenharmony_ci src54_r = src98_r; 1458cabdff1aSopenharmony_ci src21_r = src65_r; 1459cabdff1aSopenharmony_ci src43_r = src87_r; 1460cabdff1aSopenharmony_ci src65_r = src109_r; 1461cabdff1aSopenharmony_ci src6 = src10; 1462cabdff1aSopenharmony_ci } 1463cabdff1aSopenharmony_ci} 1464cabdff1aSopenharmony_ci 1465cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src, 1466cabdff1aSopenharmony_ci int32_t src_stride, 1467cabdff1aSopenharmony_ci uint8_t *dst, 1468cabdff1aSopenharmony_ci int32_t dst_stride, 1469cabdff1aSopenharmony_ci const int8_t *filter, 1470cabdff1aSopenharmony_ci int32_t height, 1471cabdff1aSopenharmony_ci int32_t weight, 1472cabdff1aSopenharmony_ci int32_t offset, 1473cabdff1aSopenharmony_ci int32_t rnd_val) 1474cabdff1aSopenharmony_ci{ 1475cabdff1aSopenharmony_ci int32_t loop_cnt; 1476cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 1477cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, 
src9, src10; 1478cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1479cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1480cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; 1481cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; 1482cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776, src10998; 1483cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1484cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 1485cabdff1aSopenharmony_ci v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec; 1486cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 1487cabdff1aSopenharmony_ci 1488cabdff1aSopenharmony_ci src -= (3 * src_stride); 1489cabdff1aSopenharmony_ci 1490cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 1491cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1492cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1493cabdff1aSopenharmony_ci 1494cabdff1aSopenharmony_ci weight *= 128; 1495cabdff1aSopenharmony_ci rnd_val -= 6; 1496cabdff1aSopenharmony_ci 1497cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 1498cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 1499cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 1500cabdff1aSopenharmony_ci 1501cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 1502cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 1503cabdff1aSopenharmony_ci 1504cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1505cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1506cabdff1aSopenharmony_ci 1507cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1508cabdff1aSopenharmony_ci src += (7 * src_stride); 1509cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1510cabdff1aSopenharmony_ci 1511cabdff1aSopenharmony_ci 
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1512cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1513cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1514cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1515cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1516cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1517cabdff1aSopenharmony_ci ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, 1518cabdff1aSopenharmony_ci src2110, src4332, src6554); 1519cabdff1aSopenharmony_ci 1520cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1521cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1522cabdff1aSopenharmony_ci src += (4 * src_stride); 1523cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1524cabdff1aSopenharmony_ci 1525cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1526cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1527cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1528cabdff1aSopenharmony_ci src76_l, src87_l, src98_l, src109_l); 1529cabdff1aSopenharmony_ci ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998); 1530cabdff1aSopenharmony_ci 1531cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0, 1532cabdff1aSopenharmony_ci filt1, filt2, filt3); 1533cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0, 1534cabdff1aSopenharmony_ci filt1, filt2, filt3); 1535cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0, 1536cabdff1aSopenharmony_ci filt1, filt2, filt3); 1537cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0, 1538cabdff1aSopenharmony_ci filt1, filt2, filt3); 1539cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, 
src6554, src8776, filt0, 1540cabdff1aSopenharmony_ci filt1, filt2, filt3); 1541cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0, 1542cabdff1aSopenharmony_ci filt1, filt2, filt3); 1543cabdff1aSopenharmony_ci 1544cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 1545cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 1546cabdff1aSopenharmony_ci dst3); 1547cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 1548cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 1549cabdff1aSopenharmony_ci 1550cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 1551cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1552cabdff1aSopenharmony_ci ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 1553cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1554cabdff1aSopenharmony_ci 1555cabdff1aSopenharmony_ci src10_r = src54_r; 1556cabdff1aSopenharmony_ci src32_r = src76_r; 1557cabdff1aSopenharmony_ci src54_r = src98_r; 1558cabdff1aSopenharmony_ci src21_r = src65_r; 1559cabdff1aSopenharmony_ci src43_r = src87_r; 1560cabdff1aSopenharmony_ci src65_r = src109_r; 1561cabdff1aSopenharmony_ci src2110 = src6554; 1562cabdff1aSopenharmony_ci src4332 = src8776; 1563cabdff1aSopenharmony_ci src6554 = src10998; 1564cabdff1aSopenharmony_ci src6 = src10; 1565cabdff1aSopenharmony_ci } 1566cabdff1aSopenharmony_ci} 1567cabdff1aSopenharmony_ci 1568cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src, 1569cabdff1aSopenharmony_ci int32_t src_stride, 1570cabdff1aSopenharmony_ci uint8_t *dst, 1571cabdff1aSopenharmony_ci int32_t dst_stride, 1572cabdff1aSopenharmony_ci const int8_t *filter, 1573cabdff1aSopenharmony_ci int32_t height, 1574cabdff1aSopenharmony_ci int32_t weight, 1575cabdff1aSopenharmony_ci int32_t offset, 1576cabdff1aSopenharmony_ci int32_t rnd_val, 1577cabdff1aSopenharmony_ci int32_t 
weightmul16) 1578cabdff1aSopenharmony_ci{ 1579cabdff1aSopenharmony_ci uint8_t *src_tmp; 1580cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1581cabdff1aSopenharmony_ci int32_t loop_cnt, cnt; 1582cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 1583cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1584cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r; 1585cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r; 1586cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l; 1587cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l; 1588cabdff1aSopenharmony_ci v16i8 src98_r, src109_r, src98_l, src109_l; 1589cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1590cabdff1aSopenharmony_ci v8i16 filter_vec; 1591cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1592cabdff1aSopenharmony_ci v8i16 weight_vec_h, offset_vec, denom_vec; 1593cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 1594cabdff1aSopenharmony_ci 1595cabdff1aSopenharmony_ci src -= (3 * src_stride); 1596cabdff1aSopenharmony_ci 1597cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1598cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1599cabdff1aSopenharmony_ci 1600cabdff1aSopenharmony_ci weight *= 128; 1601cabdff1aSopenharmony_ci rnd_val -= 6; 1602cabdff1aSopenharmony_ci 1603cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 1604cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 1605cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 1606cabdff1aSopenharmony_ci 1607cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 1608cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 1609cabdff1aSopenharmony_ci 1610cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1611cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1612cabdff1aSopenharmony_ci 
1613cabdff1aSopenharmony_ci for (cnt = weightmul16; cnt--;) { 1614cabdff1aSopenharmony_ci src_tmp = src; 1615cabdff1aSopenharmony_ci dst_tmp = dst; 1616cabdff1aSopenharmony_ci 1617cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1618cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1619cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1620cabdff1aSopenharmony_ci 1621cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1622cabdff1aSopenharmony_ci LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); 1623cabdff1aSopenharmony_ci src_tmp += (4 * src_stride); 1624cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1625cabdff1aSopenharmony_ci 1626cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1627cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1628cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1629cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1630cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1631cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1632cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1633cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1634cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1635cabdff1aSopenharmony_ci src76_l, src87_l, src98_l, src109_l); 1636cabdff1aSopenharmony_ci 1637cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0, 1638cabdff1aSopenharmony_ci filt1, filt2, filt3); 1639cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0, 1640cabdff1aSopenharmony_ci filt1, filt2, filt3); 1641cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0, 1642cabdff1aSopenharmony_ci filt1, filt2, filt3); 
1643cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0, 1644cabdff1aSopenharmony_ci filt1, filt2, filt3); 1645cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0, 1646cabdff1aSopenharmony_ci filt1, filt2, filt3); 1647cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0, 1648cabdff1aSopenharmony_ci filt1, filt2, filt3); 1649cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0, 1650cabdff1aSopenharmony_ci filt1, filt2, filt3); 1651cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0, 1652cabdff1aSopenharmony_ci filt1, filt2, filt3); 1653cabdff1aSopenharmony_ci 1654cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 1655cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, 1656cabdff1aSopenharmony_ci dst2, dst3); 1657cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 1658cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, 1659cabdff1aSopenharmony_ci dst6, dst7); 1660cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 1661cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 1662cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride); 1663cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 1664cabdff1aSopenharmony_ci 1665cabdff1aSopenharmony_ci src0 = src4; 1666cabdff1aSopenharmony_ci src1 = src5; 1667cabdff1aSopenharmony_ci src2 = src6; 1668cabdff1aSopenharmony_ci src3 = src7; 1669cabdff1aSopenharmony_ci src4 = src8; 1670cabdff1aSopenharmony_ci src5 = src9; 1671cabdff1aSopenharmony_ci src6 = src10; 1672cabdff1aSopenharmony_ci } 1673cabdff1aSopenharmony_ci 1674cabdff1aSopenharmony_ci src += 16; 1675cabdff1aSopenharmony_ci dst += 16; 1676cabdff1aSopenharmony_ci } 1677cabdff1aSopenharmony_ci} 1678cabdff1aSopenharmony_ci 
1679cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src, 1680cabdff1aSopenharmony_ci int32_t src_stride, 1681cabdff1aSopenharmony_ci uint8_t *dst, 1682cabdff1aSopenharmony_ci int32_t dst_stride, 1683cabdff1aSopenharmony_ci const int8_t *filter, 1684cabdff1aSopenharmony_ci int32_t height, 1685cabdff1aSopenharmony_ci int32_t weight, 1686cabdff1aSopenharmony_ci int32_t offset, 1687cabdff1aSopenharmony_ci int32_t rnd_val) 1688cabdff1aSopenharmony_ci{ 1689cabdff1aSopenharmony_ci hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1690cabdff1aSopenharmony_ci filter, height, weight, 1691cabdff1aSopenharmony_ci offset, rnd_val, 1); 1692cabdff1aSopenharmony_ci} 1693cabdff1aSopenharmony_ci 1694cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src, 1695cabdff1aSopenharmony_ci int32_t src_stride, 1696cabdff1aSopenharmony_ci uint8_t *dst, 1697cabdff1aSopenharmony_ci int32_t dst_stride, 1698cabdff1aSopenharmony_ci const int8_t *filter, 1699cabdff1aSopenharmony_ci int32_t height, 1700cabdff1aSopenharmony_ci int32_t weight, 1701cabdff1aSopenharmony_ci int32_t offset, 1702cabdff1aSopenharmony_ci int32_t rnd_val) 1703cabdff1aSopenharmony_ci{ 1704cabdff1aSopenharmony_ci hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1705cabdff1aSopenharmony_ci filter, 32, weight, 1706cabdff1aSopenharmony_ci offset, rnd_val, 1); 1707cabdff1aSopenharmony_ci 1708cabdff1aSopenharmony_ci hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, 1709cabdff1aSopenharmony_ci filter, 32, weight, offset, rnd_val); 1710cabdff1aSopenharmony_ci} 1711cabdff1aSopenharmony_ci 1712cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src, 1713cabdff1aSopenharmony_ci int32_t src_stride, 1714cabdff1aSopenharmony_ci uint8_t *dst, 1715cabdff1aSopenharmony_ci int32_t dst_stride, 1716cabdff1aSopenharmony_ci const int8_t *filter, 1717cabdff1aSopenharmony_ci int32_t height, 1718cabdff1aSopenharmony_ci int32_t weight, 
1719cabdff1aSopenharmony_ci int32_t offset, 1720cabdff1aSopenharmony_ci int32_t rnd_val) 1721cabdff1aSopenharmony_ci{ 1722cabdff1aSopenharmony_ci hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1723cabdff1aSopenharmony_ci filter, height, weight, 1724cabdff1aSopenharmony_ci offset, rnd_val, 2); 1725cabdff1aSopenharmony_ci} 1726cabdff1aSopenharmony_ci 1727cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src, 1728cabdff1aSopenharmony_ci int32_t src_stride, 1729cabdff1aSopenharmony_ci uint8_t *dst, 1730cabdff1aSopenharmony_ci int32_t dst_stride, 1731cabdff1aSopenharmony_ci const int8_t *filter, 1732cabdff1aSopenharmony_ci int32_t height, 1733cabdff1aSopenharmony_ci int32_t weight, 1734cabdff1aSopenharmony_ci int32_t offset, 1735cabdff1aSopenharmony_ci int32_t rnd_val) 1736cabdff1aSopenharmony_ci{ 1737cabdff1aSopenharmony_ci hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1738cabdff1aSopenharmony_ci filter, 64, weight, 1739cabdff1aSopenharmony_ci offset, rnd_val, 3); 1740cabdff1aSopenharmony_ci} 1741cabdff1aSopenharmony_ci 1742cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src, 1743cabdff1aSopenharmony_ci int32_t src_stride, 1744cabdff1aSopenharmony_ci uint8_t *dst, 1745cabdff1aSopenharmony_ci int32_t dst_stride, 1746cabdff1aSopenharmony_ci const int8_t *filter, 1747cabdff1aSopenharmony_ci int32_t height, 1748cabdff1aSopenharmony_ci int32_t weight, 1749cabdff1aSopenharmony_ci int32_t offset, 1750cabdff1aSopenharmony_ci int32_t rnd_val) 1751cabdff1aSopenharmony_ci{ 1752cabdff1aSopenharmony_ci hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1753cabdff1aSopenharmony_ci filter, height, weight, 1754cabdff1aSopenharmony_ci offset, rnd_val, 4); 1755cabdff1aSopenharmony_ci} 1756cabdff1aSopenharmony_ci 1757cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src, 1758cabdff1aSopenharmony_ci int32_t src_stride, 1759cabdff1aSopenharmony_ci uint8_t *dst, 
1760cabdff1aSopenharmony_ci int32_t dst_stride, 1761cabdff1aSopenharmony_ci const int8_t *filter_x, 1762cabdff1aSopenharmony_ci const int8_t *filter_y, 1763cabdff1aSopenharmony_ci int32_t height, 1764cabdff1aSopenharmony_ci int32_t weight, 1765cabdff1aSopenharmony_ci int32_t offset, 1766cabdff1aSopenharmony_ci int32_t rnd_val) 1767cabdff1aSopenharmony_ci{ 1768cabdff1aSopenharmony_ci uint32_t loop_cnt; 1769cabdff1aSopenharmony_ci v16u8 out; 1770cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1771cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1772cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1773cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1774cabdff1aSopenharmony_ci v8i16 filter_vec; 1775cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1776cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1777cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1778cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r; 1779cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 1780cabdff1aSopenharmony_ci v4i32 dst0_r, dst1_r, dst2_r, dst3_r; 1781cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec; 1782cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1783cabdff1aSopenharmony_ci 1784cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); 1785cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1786cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1787cabdff1aSopenharmony_ci 1788cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1789cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1790cabdff1aSopenharmony_ci 1791cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1792cabdff1aSopenharmony_ci 1793cabdff1aSopenharmony_ci mask1 = mask0 + 
2; 1794cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1795cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1796cabdff1aSopenharmony_ci 1797cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1798cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1799cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1800cabdff1aSopenharmony_ci denom_vec = rnd_vec - 6; 1801cabdff1aSopenharmony_ci 1802cabdff1aSopenharmony_ci const_128 = __msa_ldi_w(128); 1803cabdff1aSopenharmony_ci const_128 *= weight_vec; 1804cabdff1aSopenharmony_ci offset_vec += __msa_srar_w(const_128, denom_vec); 1805cabdff1aSopenharmony_ci 1806cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1807cabdff1aSopenharmony_ci src += (7 * src_stride); 1808cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1809cabdff1aSopenharmony_ci 1810cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1811cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1812cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1813cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1814cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1815cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1816cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1817cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1818cabdff1aSopenharmony_ci filt3); 1819cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1820cabdff1aSopenharmony_ci filt3); 1821cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1822cabdff1aSopenharmony_ci filt3); 1823cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 1824cabdff1aSopenharmony_ci filt3); 1825cabdff1aSopenharmony_ci 
1826cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1827cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1828cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1829cabdff1aSopenharmony_ci 1830cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1831cabdff1aSopenharmony_ci 1832cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 1833cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 1834cabdff1aSopenharmony_ci src += (4 * src_stride); 1835cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1836cabdff1aSopenharmony_ci 1837cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3, 1838cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1839cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3, 1840cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1841cabdff1aSopenharmony_ci dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1842cabdff1aSopenharmony_ci filt3); 1843cabdff1aSopenharmony_ci dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1844cabdff1aSopenharmony_ci filt3); 1845cabdff1aSopenharmony_ci 1846cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst97, dst66); 1847cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 1848cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 1849cabdff1aSopenharmony_ci dst98_r = __msa_ilvr_h(dst66, dst108); 1850cabdff1aSopenharmony_ci 1851cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1852cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1853cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 1854cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1855cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 1856cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 
1857cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 1858cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1859cabdff1aSopenharmony_ci 1860cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1861cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 1862cabdff1aSopenharmony_ci MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 1863cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); 1864cabdff1aSopenharmony_ci ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); 1865cabdff1aSopenharmony_ci ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); 1866cabdff1aSopenharmony_ci CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); 1867cabdff1aSopenharmony_ci PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 1868cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); 1869cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1870cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1871cabdff1aSopenharmony_ci 1872cabdff1aSopenharmony_ci dst10_r = dst54_r; 1873cabdff1aSopenharmony_ci dst32_r = dst76_r; 1874cabdff1aSopenharmony_ci dst54_r = dst98_r; 1875cabdff1aSopenharmony_ci dst21_r = dst65_r; 1876cabdff1aSopenharmony_ci dst43_r = dst87_r; 1877cabdff1aSopenharmony_ci dst65_r = dst109_r; 1878cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 1879cabdff1aSopenharmony_ci } 1880cabdff1aSopenharmony_ci} 1881cabdff1aSopenharmony_ci 1882cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src, 1883cabdff1aSopenharmony_ci int32_t src_stride, 1884cabdff1aSopenharmony_ci uint8_t *dst, 1885cabdff1aSopenharmony_ci int32_t dst_stride, 1886cabdff1aSopenharmony_ci const int8_t *filter_x, 1887cabdff1aSopenharmony_ci const int8_t *filter_y, 1888cabdff1aSopenharmony_ci int32_t height, 1889cabdff1aSopenharmony_ci int32_t weight, 1890cabdff1aSopenharmony_ci int32_t 
offset, 1891cabdff1aSopenharmony_ci int32_t rnd_val, 1892cabdff1aSopenharmony_ci int32_t width) 1893cabdff1aSopenharmony_ci{ 1894cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 1895cabdff1aSopenharmony_ci uint8_t *src_tmp; 1896cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1897cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1898cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1899cabdff1aSopenharmony_ci v4i32 filt_h0, filt_h1, filt_h2, filt_h3; 1900cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1901cabdff1aSopenharmony_ci v8i16 filter_vec; 1902cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1903cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1904cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 1905cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 1906cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 1907cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 1908cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 1909cabdff1aSopenharmony_ci v8i16 dst21_l, dst43_l, dst65_l, dst87_l; 1910cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec; 1911cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 1912cabdff1aSopenharmony_ci 1913cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); 1914cabdff1aSopenharmony_ci 1915cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 1916cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 1917cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 1918cabdff1aSopenharmony_ci denom_vec = rnd_vec - 6; 1919cabdff1aSopenharmony_ci 1920cabdff1aSopenharmony_ci const_128 = __msa_ldi_w(128); 1921cabdff1aSopenharmony_ci const_128 *= weight_vec; 1922cabdff1aSopenharmony_ci offset_vec += __msa_srar_w(const_128, denom_vec); 1923cabdff1aSopenharmony_ci 1924cabdff1aSopenharmony_ci filter_vec = 
LD_SH(filter_x); 1925cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1926cabdff1aSopenharmony_ci 1927cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1928cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1929cabdff1aSopenharmony_ci SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1930cabdff1aSopenharmony_ci 1931cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1932cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1933cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1934cabdff1aSopenharmony_ci 1935cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 1936cabdff1aSopenharmony_ci src_tmp = src; 1937cabdff1aSopenharmony_ci dst_tmp = dst; 1938cabdff1aSopenharmony_ci 1939cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1940cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 1941cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1942cabdff1aSopenharmony_ci 1943cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1944cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1945cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1946cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1947cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1948cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1949cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1950cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1951cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1952cabdff1aSopenharmony_ci filt3); 1953cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1954cabdff1aSopenharmony_ci filt3); 1955cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1956cabdff1aSopenharmony_ci filt3); 1957cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, 
vec13, vec14, vec15, filt0, filt1, 1958cabdff1aSopenharmony_ci filt2, filt3); 1959cabdff1aSopenharmony_ci 1960cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1961cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1962cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 1963cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1964cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 1965cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1966cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1967cabdff1aSopenharmony_ci filt3); 1968cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1969cabdff1aSopenharmony_ci filt3); 1970cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1971cabdff1aSopenharmony_ci filt3); 1972cabdff1aSopenharmony_ci 1973cabdff1aSopenharmony_ci ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, 1974cabdff1aSopenharmony_ci dst10_r, dst32_r, dst54_r, dst21_r); 1975cabdff1aSopenharmony_ci ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r); 1976cabdff1aSopenharmony_ci ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, 1977cabdff1aSopenharmony_ci dst10_l, dst32_l, dst54_l, dst21_l); 1978cabdff1aSopenharmony_ci ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l); 1979cabdff1aSopenharmony_ci 1980cabdff1aSopenharmony_ci for (loop_cnt = height >> 1; loop_cnt--;) { 1981cabdff1aSopenharmony_ci LD_SB2(src_tmp, src_stride, src7, src8); 1982cabdff1aSopenharmony_ci src_tmp += 2 * src_stride; 1983cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 1984cabdff1aSopenharmony_ci 1985cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 1986cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1987cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1988cabdff1aSopenharmony_ci filt2, filt3); 
1989cabdff1aSopenharmony_ci 1990cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1991cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1992cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1993cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 1994cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1995cabdff1aSopenharmony_ci dst0_r >>= 6; 1996cabdff1aSopenharmony_ci dst0_l >>= 6; 1997cabdff1aSopenharmony_ci 1998cabdff1aSopenharmony_ci /* row 8 */ 1999cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, 2000cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 2001cabdff1aSopenharmony_ci dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 2002cabdff1aSopenharmony_ci filt2, filt3); 2003cabdff1aSopenharmony_ci 2004cabdff1aSopenharmony_ci ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 2005cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 2006cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2007cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, 2008cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2009cabdff1aSopenharmony_ci dst1_r >>= 6; 2010cabdff1aSopenharmony_ci dst1_l >>= 6; 2011cabdff1aSopenharmony_ci 2012cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l); 2013cabdff1aSopenharmony_ci MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l); 2014cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec); 2015cabdff1aSopenharmony_ci ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); 2016cabdff1aSopenharmony_ci ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l); 2017cabdff1aSopenharmony_ci CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l); 2018cabdff1aSopenharmony_ci 2019cabdff1aSopenharmony_ci PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 
2020cabdff1aSopenharmony_ci dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); 2021cabdff1aSopenharmony_ci ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride); 2022cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 2023cabdff1aSopenharmony_ci 2024cabdff1aSopenharmony_ci dst10_r = dst32_r; 2025cabdff1aSopenharmony_ci dst32_r = dst54_r; 2026cabdff1aSopenharmony_ci dst54_r = dst76_r; 2027cabdff1aSopenharmony_ci dst10_l = dst32_l; 2028cabdff1aSopenharmony_ci dst32_l = dst54_l; 2029cabdff1aSopenharmony_ci dst54_l = dst76_l; 2030cabdff1aSopenharmony_ci dst21_r = dst43_r; 2031cabdff1aSopenharmony_ci dst43_r = dst65_r; 2032cabdff1aSopenharmony_ci dst65_r = dst87_r; 2033cabdff1aSopenharmony_ci dst21_l = dst43_l; 2034cabdff1aSopenharmony_ci dst43_l = dst65_l; 2035cabdff1aSopenharmony_ci dst65_l = dst87_l; 2036cabdff1aSopenharmony_ci dst6 = dst8; 2037cabdff1aSopenharmony_ci } 2038cabdff1aSopenharmony_ci 2039cabdff1aSopenharmony_ci src += 8; 2040cabdff1aSopenharmony_ci dst += 8; 2041cabdff1aSopenharmony_ci } 2042cabdff1aSopenharmony_ci} 2043cabdff1aSopenharmony_ci 2044cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src, 2045cabdff1aSopenharmony_ci int32_t src_stride, 2046cabdff1aSopenharmony_ci uint8_t *dst, 2047cabdff1aSopenharmony_ci int32_t dst_stride, 2048cabdff1aSopenharmony_ci const int8_t *filter_x, 2049cabdff1aSopenharmony_ci const int8_t *filter_y, 2050cabdff1aSopenharmony_ci int32_t height, 2051cabdff1aSopenharmony_ci int32_t weight, 2052cabdff1aSopenharmony_ci int32_t offset, 2053cabdff1aSopenharmony_ci int32_t rnd_val) 2054cabdff1aSopenharmony_ci{ 2055cabdff1aSopenharmony_ci hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 2056cabdff1aSopenharmony_ci filter_x, filter_y, height, weight, 2057cabdff1aSopenharmony_ci offset, rnd_val, 8); 2058cabdff1aSopenharmony_ci} 2059cabdff1aSopenharmony_ci 2060cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, 2061cabdff1aSopenharmony_ci int32_t src_stride, 
2062cabdff1aSopenharmony_ci uint8_t *dst, 2063cabdff1aSopenharmony_ci int32_t dst_stride, 2064cabdff1aSopenharmony_ci const int8_t *filter_x, 2065cabdff1aSopenharmony_ci const int8_t *filter_y, 2066cabdff1aSopenharmony_ci int32_t height, 2067cabdff1aSopenharmony_ci int32_t weight, 2068cabdff1aSopenharmony_ci int32_t offset, 2069cabdff1aSopenharmony_ci int32_t rnd_val) 2070cabdff1aSopenharmony_ci{ 2071cabdff1aSopenharmony_ci uint32_t loop_cnt; 2072cabdff1aSopenharmony_ci uint8_t *src_tmp, *dst_tmp; 2073cabdff1aSopenharmony_ci v16u8 out; 2074cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2075cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 2076cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2077cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 2078cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2079cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 2080cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 2081cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l; 2082cabdff1aSopenharmony_ci v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 2083cabdff1aSopenharmony_ci v8i16 dst76_l, filter_vec; 2084cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r; 2085cabdff1aSopenharmony_ci v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec; 2086cabdff1aSopenharmony_ci 2087cabdff1aSopenharmony_ci src -= ((3 * src_stride) + 3); 2088cabdff1aSopenharmony_ci 2089cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 2090cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 2091cabdff1aSopenharmony_ci 2092cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 2093cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 
2094cabdff1aSopenharmony_ci 2095cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 2096cabdff1aSopenharmony_ci 2097cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2098cabdff1aSopenharmony_ci offset_vec = __msa_fill_w(offset); 2099cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2100cabdff1aSopenharmony_ci denom_vec = rnd_vec - 6; 2101cabdff1aSopenharmony_ci 2102cabdff1aSopenharmony_ci const_128 = __msa_ldi_w(128); 2103cabdff1aSopenharmony_ci const_128 *= weight_vec; 2104cabdff1aSopenharmony_ci offset_vec += __msa_srar_w(const_128, denom_vec); 2105cabdff1aSopenharmony_ci 2106cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 2107cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2108cabdff1aSopenharmony_ci mask2 = mask0 + 4; 2109cabdff1aSopenharmony_ci mask3 = mask0 + 6; 2110cabdff1aSopenharmony_ci 2111cabdff1aSopenharmony_ci src_tmp = src; 2112cabdff1aSopenharmony_ci dst_tmp = dst; 2113cabdff1aSopenharmony_ci 2114cabdff1aSopenharmony_ci LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 2115cabdff1aSopenharmony_ci src_tmp += (7 * src_stride); 2116cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2117cabdff1aSopenharmony_ci 2118cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 2119cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2120cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 2121cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2122cabdff1aSopenharmony_ci vec11); 2123cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 2124cabdff1aSopenharmony_ci vec15); 2125cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2126cabdff1aSopenharmony_ci filt3); 2127cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, 
filt1, filt2, 2128cabdff1aSopenharmony_ci filt3); 2129cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2130cabdff1aSopenharmony_ci filt3); 2131cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 2132cabdff1aSopenharmony_ci filt2, filt3); 2133cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 2134cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 2135cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 2136cabdff1aSopenharmony_ci vec11); 2137cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2138cabdff1aSopenharmony_ci filt3); 2139cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2140cabdff1aSopenharmony_ci filt3); 2141cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2142cabdff1aSopenharmony_ci filt3); 2143cabdff1aSopenharmony_ci 2144cabdff1aSopenharmony_ci for (loop_cnt = 16; loop_cnt--;) { 2145cabdff1aSopenharmony_ci src7 = LD_SB(src_tmp); 2146cabdff1aSopenharmony_ci src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 2147cabdff1aSopenharmony_ci src_tmp += src_stride; 2148cabdff1aSopenharmony_ci 2149cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 2150cabdff1aSopenharmony_ci vec3); 2151cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2152cabdff1aSopenharmony_ci filt3); 2153cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 2154cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 2155cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 2156cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 2157cabdff1aSopenharmony_ci 2158cabdff1aSopenharmony_ci dst0_r = 
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 2159cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2160cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 2161cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 2162cabdff1aSopenharmony_ci dst0_r >>= 6; 2163cabdff1aSopenharmony_ci dst0_l >>= 6; 2164cabdff1aSopenharmony_ci 2165cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l); 2166cabdff1aSopenharmony_ci SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 2167cabdff1aSopenharmony_ci ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); 2168cabdff1aSopenharmony_ci CLIP_SW2_0_255(dst0_r, dst0_l); 2169cabdff1aSopenharmony_ci dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 2170cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); 2171cabdff1aSopenharmony_ci ST_D1(out, 0, dst_tmp); 2172cabdff1aSopenharmony_ci dst_tmp += dst_stride; 2173cabdff1aSopenharmony_ci 2174cabdff1aSopenharmony_ci dst0 = dst1; 2175cabdff1aSopenharmony_ci dst1 = dst2; 2176cabdff1aSopenharmony_ci dst2 = dst3; 2177cabdff1aSopenharmony_ci dst3 = dst4; 2178cabdff1aSopenharmony_ci dst4 = dst5; 2179cabdff1aSopenharmony_ci dst5 = dst6; 2180cabdff1aSopenharmony_ci dst6 = dst7; 2181cabdff1aSopenharmony_ci } 2182cabdff1aSopenharmony_ci 2183cabdff1aSopenharmony_ci src += 8; 2184cabdff1aSopenharmony_ci dst += 8; 2185cabdff1aSopenharmony_ci 2186cabdff1aSopenharmony_ci mask4 = LD_SB(ff_hevc_mask_arr + 16); 2187cabdff1aSopenharmony_ci mask5 = mask4 + 2; 2188cabdff1aSopenharmony_ci mask6 = mask4 + 4; 2189cabdff1aSopenharmony_ci mask7 = mask4 + 6; 2190cabdff1aSopenharmony_ci 2191cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 2192cabdff1aSopenharmony_ci src += (7 * src_stride); 2193cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2194cabdff1aSopenharmony_ci 2195cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, 
mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 2196cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 2197cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 2198cabdff1aSopenharmony_ci vec11); 2199cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14, 2200cabdff1aSopenharmony_ci vec15); 2201cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2202cabdff1aSopenharmony_ci filt3); 2203cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2204cabdff1aSopenharmony_ci filt3); 2205cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2206cabdff1aSopenharmony_ci filt3); 2207cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 2208cabdff1aSopenharmony_ci filt3); 2209cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 2210cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 2211cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 2212cabdff1aSopenharmony_ci 2213cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 2214cabdff1aSopenharmony_ci 2215cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2216cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 2217cabdff1aSopenharmony_ci src += (4 * src_stride); 2218cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 2219cabdff1aSopenharmony_ci 2220cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 2221cabdff1aSopenharmony_ci vec3); 2222cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 2223cabdff1aSopenharmony_ci vec7); 2224cabdff1aSopenharmony_ci dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 
2225cabdff1aSopenharmony_ci filt3); 2226cabdff1aSopenharmony_ci dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2227cabdff1aSopenharmony_ci filt3); 2228cabdff1aSopenharmony_ci 2229cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst97, dst66); 2230cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 2231cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 2232cabdff1aSopenharmony_ci dst98_r = __msa_ilvr_h(dst66, dst108); 2233cabdff1aSopenharmony_ci 2234cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 2235cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2236cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 2237cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2238cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 2239cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2240cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 2241cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 2242cabdff1aSopenharmony_ci 2243cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 2244cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 2245cabdff1aSopenharmony_ci MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 2246cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); 2247cabdff1aSopenharmony_ci ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); 2248cabdff1aSopenharmony_ci ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); 2249cabdff1aSopenharmony_ci CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); 2250cabdff1aSopenharmony_ci PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 2251cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); 2252cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 
2253cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2254cabdff1aSopenharmony_ci 2255cabdff1aSopenharmony_ci dst10_r = dst54_r; 2256cabdff1aSopenharmony_ci dst32_r = dst76_r; 2257cabdff1aSopenharmony_ci dst54_r = dst98_r; 2258cabdff1aSopenharmony_ci dst21_r = dst65_r; 2259cabdff1aSopenharmony_ci dst43_r = dst87_r; 2260cabdff1aSopenharmony_ci dst65_r = dst109_r; 2261cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 2262cabdff1aSopenharmony_ci } 2263cabdff1aSopenharmony_ci} 2264cabdff1aSopenharmony_ci 2265cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src, 2266cabdff1aSopenharmony_ci int32_t src_stride, 2267cabdff1aSopenharmony_ci uint8_t *dst, 2268cabdff1aSopenharmony_ci int32_t dst_stride, 2269cabdff1aSopenharmony_ci const int8_t *filter_x, 2270cabdff1aSopenharmony_ci const int8_t *filter_y, 2271cabdff1aSopenharmony_ci int32_t height, 2272cabdff1aSopenharmony_ci int32_t weight, 2273cabdff1aSopenharmony_ci int32_t offset, 2274cabdff1aSopenharmony_ci int32_t rnd_val) 2275cabdff1aSopenharmony_ci{ 2276cabdff1aSopenharmony_ci hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 2277cabdff1aSopenharmony_ci filter_x, filter_y, height, weight, 2278cabdff1aSopenharmony_ci offset, rnd_val, 16); 2279cabdff1aSopenharmony_ci} 2280cabdff1aSopenharmony_ci 2281cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src, 2282cabdff1aSopenharmony_ci int32_t src_stride, 2283cabdff1aSopenharmony_ci uint8_t *dst, 2284cabdff1aSopenharmony_ci int32_t dst_stride, 2285cabdff1aSopenharmony_ci const int8_t *filter_x, 2286cabdff1aSopenharmony_ci const int8_t *filter_y, 2287cabdff1aSopenharmony_ci int32_t height, 2288cabdff1aSopenharmony_ci int32_t weight, 2289cabdff1aSopenharmony_ci int32_t offset, 2290cabdff1aSopenharmony_ci int32_t rnd_val) 2291cabdff1aSopenharmony_ci{ 2292cabdff1aSopenharmony_ci hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 2293cabdff1aSopenharmony_ci filter_x, filter_y, 
height, weight, 2294cabdff1aSopenharmony_ci offset, rnd_val, 24); 2295cabdff1aSopenharmony_ci} 2296cabdff1aSopenharmony_ci 2297cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src, 2298cabdff1aSopenharmony_ci int32_t src_stride, 2299cabdff1aSopenharmony_ci uint8_t *dst, 2300cabdff1aSopenharmony_ci int32_t dst_stride, 2301cabdff1aSopenharmony_ci const int8_t *filter_x, 2302cabdff1aSopenharmony_ci const int8_t *filter_y, 2303cabdff1aSopenharmony_ci int32_t height, 2304cabdff1aSopenharmony_ci int32_t weight, 2305cabdff1aSopenharmony_ci int32_t offset, 2306cabdff1aSopenharmony_ci int32_t rnd_val) 2307cabdff1aSopenharmony_ci{ 2308cabdff1aSopenharmony_ci hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 2309cabdff1aSopenharmony_ci filter_x, filter_y, height, weight, 2310cabdff1aSopenharmony_ci offset, rnd_val, 32); 2311cabdff1aSopenharmony_ci} 2312cabdff1aSopenharmony_ci 2313cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src, 2314cabdff1aSopenharmony_ci int32_t src_stride, 2315cabdff1aSopenharmony_ci uint8_t *dst, 2316cabdff1aSopenharmony_ci int32_t dst_stride, 2317cabdff1aSopenharmony_ci const int8_t *filter_x, 2318cabdff1aSopenharmony_ci const int8_t *filter_y, 2319cabdff1aSopenharmony_ci int32_t height, 2320cabdff1aSopenharmony_ci int32_t weight, 2321cabdff1aSopenharmony_ci int32_t offset, 2322cabdff1aSopenharmony_ci int32_t rnd_val) 2323cabdff1aSopenharmony_ci{ 2324cabdff1aSopenharmony_ci hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 2325cabdff1aSopenharmony_ci filter_x, filter_y, height, weight, 2326cabdff1aSopenharmony_ci offset, rnd_val, 48); 2327cabdff1aSopenharmony_ci} 2328cabdff1aSopenharmony_ci 2329cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src, 2330cabdff1aSopenharmony_ci int32_t src_stride, 2331cabdff1aSopenharmony_ci uint8_t *dst, 2332cabdff1aSopenharmony_ci int32_t dst_stride, 2333cabdff1aSopenharmony_ci const int8_t *filter_x, 
2334cabdff1aSopenharmony_ci const int8_t *filter_y, 2335cabdff1aSopenharmony_ci int32_t height, 2336cabdff1aSopenharmony_ci int32_t weight, 2337cabdff1aSopenharmony_ci int32_t offset, 2338cabdff1aSopenharmony_ci int32_t rnd_val) 2339cabdff1aSopenharmony_ci{ 2340cabdff1aSopenharmony_ci hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride, 2341cabdff1aSopenharmony_ci filter_x, filter_y, height, weight, 2342cabdff1aSopenharmony_ci offset, rnd_val, 64); 2343cabdff1aSopenharmony_ci} 2344cabdff1aSopenharmony_ci 2345cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src, 2346cabdff1aSopenharmony_ci int32_t src_stride, 2347cabdff1aSopenharmony_ci uint8_t *dst, 2348cabdff1aSopenharmony_ci int32_t dst_stride, 2349cabdff1aSopenharmony_ci const int8_t *filter, 2350cabdff1aSopenharmony_ci int32_t weight, 2351cabdff1aSopenharmony_ci int32_t offset, 2352cabdff1aSopenharmony_ci int32_t rnd_val) 2353cabdff1aSopenharmony_ci{ 2354cabdff1aSopenharmony_ci v16u8 out; 2355cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2356cabdff1aSopenharmony_ci v16i8 src0, src1, vec0, vec1; 2357cabdff1aSopenharmony_ci v16i8 mask1; 2358cabdff1aSopenharmony_ci v8i16 dst0; 2359cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l; 2360cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2361cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2362cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2363cabdff1aSopenharmony_ci 2364cabdff1aSopenharmony_ci src -= 1; 2365cabdff1aSopenharmony_ci 2366cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2367cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2368cabdff1aSopenharmony_ci 2369cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2370cabdff1aSopenharmony_ci 2371cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2372cabdff1aSopenharmony_ci 2373cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2374cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 
2375cabdff1aSopenharmony_ci 2376cabdff1aSopenharmony_ci weight *= 128; 2377cabdff1aSopenharmony_ci rnd_val -= 6; 2378cabdff1aSopenharmony_ci 2379cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2380cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2381cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2382cabdff1aSopenharmony_ci 2383cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2384cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2385cabdff1aSopenharmony_ci 2386cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 2387cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 2388cabdff1aSopenharmony_ci 2389cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2390cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2391cabdff1aSopenharmony_ci 2392cabdff1aSopenharmony_ci ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l); 2393cabdff1aSopenharmony_ci DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l); 2394cabdff1aSopenharmony_ci SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 2395cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 2396cabdff1aSopenharmony_ci dst0 = __msa_adds_s_h(dst0, offset_vec); 2397cabdff1aSopenharmony_ci CLIP_SH_0_255(dst0); 2398cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); 2399cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 2400cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2401cabdff1aSopenharmony_ci} 2402cabdff1aSopenharmony_ci 2403cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src, 2404cabdff1aSopenharmony_ci int32_t src_stride, 2405cabdff1aSopenharmony_ci uint8_t *dst, 2406cabdff1aSopenharmony_ci int32_t dst_stride, 2407cabdff1aSopenharmony_ci const int8_t *filter, 2408cabdff1aSopenharmony_ci int32_t weight, 2409cabdff1aSopenharmony_ci int32_t offset, 2410cabdff1aSopenharmony_ci int32_t rnd_val) 
2411cabdff1aSopenharmony_ci{ 2412cabdff1aSopenharmony_ci v16u8 out; 2413cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2414cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2415cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1, vec2, vec3; 2416cabdff1aSopenharmony_ci v8i16 dst0, dst1; 2417cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2418cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2419cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2420cabdff1aSopenharmony_ci 2421cabdff1aSopenharmony_ci src -= 1; 2422cabdff1aSopenharmony_ci 2423cabdff1aSopenharmony_ci /* rearranging filter */ 2424cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2425cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2426cabdff1aSopenharmony_ci 2427cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2428cabdff1aSopenharmony_ci 2429cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2430cabdff1aSopenharmony_ci 2431cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2432cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2433cabdff1aSopenharmony_ci 2434cabdff1aSopenharmony_ci weight *= 128; 2435cabdff1aSopenharmony_ci rnd_val -= 6; 2436cabdff1aSopenharmony_ci 2437cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2438cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2439cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2440cabdff1aSopenharmony_ci 2441cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2442cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2443cabdff1aSopenharmony_ci 2444cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2445cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2446cabdff1aSopenharmony_ci 2447cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2448cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3); 
2449cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2450cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2451cabdff1aSopenharmony_ci 2452cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec, 2453cabdff1aSopenharmony_ci dst0, dst1); 2454cabdff1aSopenharmony_ci 2455cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 2456cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2457cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2458cabdff1aSopenharmony_ci} 2459cabdff1aSopenharmony_ci 2460cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src, 2461cabdff1aSopenharmony_ci int32_t src_stride, 2462cabdff1aSopenharmony_ci uint8_t *dst, 2463cabdff1aSopenharmony_ci int32_t dst_stride, 2464cabdff1aSopenharmony_ci const int8_t *filter, 2465cabdff1aSopenharmony_ci int32_t height, 2466cabdff1aSopenharmony_ci int32_t weight, 2467cabdff1aSopenharmony_ci int32_t offset, 2468cabdff1aSopenharmony_ci int32_t rnd_val) 2469cabdff1aSopenharmony_ci{ 2470cabdff1aSopenharmony_ci uint32_t loop_cnt; 2471cabdff1aSopenharmony_ci v16u8 out0, out1; 2472cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2473cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2474cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2475cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2476cabdff1aSopenharmony_ci v8i16 filter_vec; 2477cabdff1aSopenharmony_ci v8i16 weight_vec_h, offset_vec, denom_vec; 2478cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2479cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2480cabdff1aSopenharmony_ci 2481cabdff1aSopenharmony_ci src -= 1; 2482cabdff1aSopenharmony_ci 2483cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2484cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2485cabdff1aSopenharmony_ci 2486cabdff1aSopenharmony_ci 
weight = weight & 0x0000FFFF; 2487cabdff1aSopenharmony_ci 2488cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2489cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2490cabdff1aSopenharmony_ci 2491cabdff1aSopenharmony_ci weight *= 128; 2492cabdff1aSopenharmony_ci rnd_val -= 6; 2493cabdff1aSopenharmony_ci 2494cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2495cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2496cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2497cabdff1aSopenharmony_ci 2498cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2499cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2500cabdff1aSopenharmony_ci 2501cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2502cabdff1aSopenharmony_ci 2503cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 2504cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 2505cabdff1aSopenharmony_ci src += (8 * src_stride); 2506cabdff1aSopenharmony_ci 2507cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2508cabdff1aSopenharmony_ci 2509cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2510cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3); 2511cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5); 2512cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7); 2513cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2514cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2515cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2516cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2517cabdff1aSopenharmony_ci 2518cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 
2519cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2520cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2521cabdff1aSopenharmony_ci 2522cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2523cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 2524cabdff1aSopenharmony_ci dst += (8 * dst_stride); 2525cabdff1aSopenharmony_ci } 2526cabdff1aSopenharmony_ci} 2527cabdff1aSopenharmony_ci 2528cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src, 2529cabdff1aSopenharmony_ci int32_t src_stride, 2530cabdff1aSopenharmony_ci uint8_t *dst, 2531cabdff1aSopenharmony_ci int32_t dst_stride, 2532cabdff1aSopenharmony_ci const int8_t *filter, 2533cabdff1aSopenharmony_ci int32_t height, 2534cabdff1aSopenharmony_ci int32_t weight, 2535cabdff1aSopenharmony_ci int32_t offset, 2536cabdff1aSopenharmony_ci int32_t rnd_val) 2537cabdff1aSopenharmony_ci{ 2538cabdff1aSopenharmony_ci if (2 == height) { 2539cabdff1aSopenharmony_ci hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, 2540cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 2541cabdff1aSopenharmony_ci } else if (4 == height) { 2542cabdff1aSopenharmony_ci hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, 2543cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 2544cabdff1aSopenharmony_ci } else if (8 == height || 16 == height) { 2545cabdff1aSopenharmony_ci hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, 2546cabdff1aSopenharmony_ci filter, height, weight, 2547cabdff1aSopenharmony_ci offset, rnd_val); 2548cabdff1aSopenharmony_ci } 2549cabdff1aSopenharmony_ci} 2550cabdff1aSopenharmony_ci 2551cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src, 2552cabdff1aSopenharmony_ci int32_t src_stride, 2553cabdff1aSopenharmony_ci uint8_t *dst, 2554cabdff1aSopenharmony_ci int32_t dst_stride, 2555cabdff1aSopenharmony_ci const int8_t *filter, 2556cabdff1aSopenharmony_ci int32_t height, 
2557cabdff1aSopenharmony_ci int32_t weight, 2558cabdff1aSopenharmony_ci int32_t offset, 2559cabdff1aSopenharmony_ci int32_t rnd_val) 2560cabdff1aSopenharmony_ci{ 2561cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 2562cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2563cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2564cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2565cabdff1aSopenharmony_ci v16i8 mask1; 2566cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2567cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2568cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2569cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2570cabdff1aSopenharmony_ci 2571cabdff1aSopenharmony_ci src -= 1; 2572cabdff1aSopenharmony_ci 2573cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2574cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2575cabdff1aSopenharmony_ci 2576cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2577cabdff1aSopenharmony_ci 2578cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2579cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2580cabdff1aSopenharmony_ci 2581cabdff1aSopenharmony_ci weight *= 128; 2582cabdff1aSopenharmony_ci rnd_val -= 6; 2583cabdff1aSopenharmony_ci 2584cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2585cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2586cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2587cabdff1aSopenharmony_ci 2588cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2589cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2590cabdff1aSopenharmony_ci 2591cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2592cabdff1aSopenharmony_ci 2593cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 2594cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, 
src1, src2, src3, src4, src5, src6, src7); 2595cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2596cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2597cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2598cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2599cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2600cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2601cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2602cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2603cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 2604cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3); 2605cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5); 2606cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7); 2607cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2608cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2609cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2610cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2611cabdff1aSopenharmony_ci 2612cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2613cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2614cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2615cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 2616cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2617cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 2618cabdff1aSopenharmony_ci 2619cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2620cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, 
out3); 2621cabdff1aSopenharmony_ci ST_W2(out0, 0, 2, dst, dst_stride); 2622cabdff1aSopenharmony_ci ST_H2(out0, 2, 6, dst + 4, dst_stride); 2623cabdff1aSopenharmony_ci ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 2624cabdff1aSopenharmony_ci ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2625cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2626cabdff1aSopenharmony_ci ST_W2(out2, 0, 2, dst, dst_stride); 2627cabdff1aSopenharmony_ci ST_H2(out2, 2, 6, dst + 4, dst_stride); 2628cabdff1aSopenharmony_ci ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride); 2629cabdff1aSopenharmony_ci ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2630cabdff1aSopenharmony_ci} 2631cabdff1aSopenharmony_ci 2632cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src, 2633cabdff1aSopenharmony_ci int32_t src_stride, 2634cabdff1aSopenharmony_ci uint8_t *dst, 2635cabdff1aSopenharmony_ci int32_t dst_stride, 2636cabdff1aSopenharmony_ci const int8_t *filter, 2637cabdff1aSopenharmony_ci int32_t weight, 2638cabdff1aSopenharmony_ci int32_t offset, 2639cabdff1aSopenharmony_ci int32_t rnd_val) 2640cabdff1aSopenharmony_ci{ 2641cabdff1aSopenharmony_ci v16u8 out; 2642cabdff1aSopenharmony_ci v8i16 filt0, filt1, dst0, dst1; 2643cabdff1aSopenharmony_ci v16i8 src0, src1; 2644cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2645cabdff1aSopenharmony_ci v16i8 mask1; 2646cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 2647cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2648cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2649cabdff1aSopenharmony_ci 2650cabdff1aSopenharmony_ci src -= 1; 2651cabdff1aSopenharmony_ci 2652cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2653cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2654cabdff1aSopenharmony_ci 2655cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2656cabdff1aSopenharmony_ci 2657cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 
2658cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2659cabdff1aSopenharmony_ci 2660cabdff1aSopenharmony_ci weight *= 128; 2661cabdff1aSopenharmony_ci rnd_val -= 6; 2662cabdff1aSopenharmony_ci 2663cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2664cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2665cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2666cabdff1aSopenharmony_ci 2667cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2668cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2669cabdff1aSopenharmony_ci 2670cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2671cabdff1aSopenharmony_ci 2672cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src1); 2673cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 2674cabdff1aSopenharmony_ci 2675cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2676cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2677cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2678cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2679cabdff1aSopenharmony_ci 2680cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec, 2681cabdff1aSopenharmony_ci dst0, dst1); 2682cabdff1aSopenharmony_ci 2683cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 2684cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, dst_stride); 2685cabdff1aSopenharmony_ci} 2686cabdff1aSopenharmony_ci 2687cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src, 2688cabdff1aSopenharmony_ci int32_t src_stride, 2689cabdff1aSopenharmony_ci uint8_t *dst, 2690cabdff1aSopenharmony_ci int32_t dst_stride, 2691cabdff1aSopenharmony_ci const int8_t *filter, 2692cabdff1aSopenharmony_ci int32_t weight, 2693cabdff1aSopenharmony_ci int32_t offset, 2694cabdff1aSopenharmony_ci int32_t rnd_val) 
2695cabdff1aSopenharmony_ci{ 2696cabdff1aSopenharmony_ci v16u8 out0, out1; 2697cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2698cabdff1aSopenharmony_ci v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2699cabdff1aSopenharmony_ci v8i16 filt0, filt1, dst0, dst1, dst2, dst3; 2700cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2701cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2702cabdff1aSopenharmony_ci 2703cabdff1aSopenharmony_ci src -= 1; 2704cabdff1aSopenharmony_ci 2705cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2706cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2707cabdff1aSopenharmony_ci 2708cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2709cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2710cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2711cabdff1aSopenharmony_ci 2712cabdff1aSopenharmony_ci weight *= 128; 2713cabdff1aSopenharmony_ci rnd_val -= 6; 2714cabdff1aSopenharmony_ci 2715cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2716cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2717cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2718cabdff1aSopenharmony_ci 2719cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2720cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2721cabdff1aSopenharmony_ci 2722cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2723cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2724cabdff1aSopenharmony_ci 2725cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2726cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2727cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2728cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2729cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 
2730cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2731cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2732cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2733cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2734cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2735cabdff1aSopenharmony_ci 2736cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2737cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2738cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2739cabdff1aSopenharmony_ci 2740cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2741cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2742cabdff1aSopenharmony_ci} 2743cabdff1aSopenharmony_ci 2744cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src, 2745cabdff1aSopenharmony_ci int32_t src_stride, 2746cabdff1aSopenharmony_ci uint8_t *dst, 2747cabdff1aSopenharmony_ci int32_t dst_stride, 2748cabdff1aSopenharmony_ci const int8_t *filter, 2749cabdff1aSopenharmony_ci int32_t weight, 2750cabdff1aSopenharmony_ci int32_t offset, 2751cabdff1aSopenharmony_ci int32_t rnd_val) 2752cabdff1aSopenharmony_ci{ 2753cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 2754cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2755cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 2756cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2757cabdff1aSopenharmony_ci v16i8 mask1; 2758cabdff1aSopenharmony_ci v16i8 vec11; 2759cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 2760cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2761cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2762cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2763cabdff1aSopenharmony_ci 2764cabdff1aSopenharmony_ci src -= 1; 
2765cabdff1aSopenharmony_ci 2766cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2767cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2768cabdff1aSopenharmony_ci 2769cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2770cabdff1aSopenharmony_ci 2771cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2772cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2773cabdff1aSopenharmony_ci 2774cabdff1aSopenharmony_ci weight *= 128; 2775cabdff1aSopenharmony_ci rnd_val -= 6; 2776cabdff1aSopenharmony_ci 2777cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2778cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2779cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2780cabdff1aSopenharmony_ci 2781cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2782cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2783cabdff1aSopenharmony_ci 2784cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2785cabdff1aSopenharmony_ci 2786cabdff1aSopenharmony_ci LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); 2787cabdff1aSopenharmony_ci XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); 2788cabdff1aSopenharmony_ci 2789cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2790cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2791cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2792cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2793cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 2794cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11); 2795cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2796cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2797cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, 
filt1); 2798cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2799cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 2800cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 2801cabdff1aSopenharmony_ci 2802cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2803cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2804cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2805cabdff1aSopenharmony_ci 2806cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec, 2807cabdff1aSopenharmony_ci dst4, dst5); 2808cabdff1aSopenharmony_ci 2809cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 2810cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2811cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 2812cabdff1aSopenharmony_ci} 2813cabdff1aSopenharmony_ci 2814cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src, 2815cabdff1aSopenharmony_ci int32_t src_stride, 2816cabdff1aSopenharmony_ci uint8_t *dst, 2817cabdff1aSopenharmony_ci int32_t dst_stride, 2818cabdff1aSopenharmony_ci const int8_t *filter, 2819cabdff1aSopenharmony_ci int32_t height, 2820cabdff1aSopenharmony_ci int32_t weight, 2821cabdff1aSopenharmony_ci int32_t offset, 2822cabdff1aSopenharmony_ci int32_t rnd_val) 2823cabdff1aSopenharmony_ci{ 2824cabdff1aSopenharmony_ci uint32_t loop_cnt; 2825cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2826cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 2827cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2828cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2829cabdff1aSopenharmony_ci v16i8 mask1; 2830cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2831cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 
2832cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2833cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2834cabdff1aSopenharmony_ci 2835cabdff1aSopenharmony_ci src -= 1; 2836cabdff1aSopenharmony_ci 2837cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2838cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2839cabdff1aSopenharmony_ci 2840cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2841cabdff1aSopenharmony_ci 2842cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2843cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2844cabdff1aSopenharmony_ci 2845cabdff1aSopenharmony_ci weight *= 128; 2846cabdff1aSopenharmony_ci rnd_val -= 6; 2847cabdff1aSopenharmony_ci 2848cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2849cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2850cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2851cabdff1aSopenharmony_ci 2852cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2853cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2854cabdff1aSopenharmony_ci 2855cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2856cabdff1aSopenharmony_ci 2857cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 2858cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 2859cabdff1aSopenharmony_ci src += (8 * src_stride); 2860cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2861cabdff1aSopenharmony_ci 2862cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2863cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2864cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2865cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2866cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, 
vec1, filt0, filt1); 2867cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2868cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2869cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2870cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 2871cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3); 2872cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5); 2873cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7); 2874cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2875cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2876cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2877cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2878cabdff1aSopenharmony_ci 2879cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2880cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2881cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2882cabdff1aSopenharmony_ci 2883cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 2884cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2885cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 2886cabdff1aSopenharmony_ci 2887cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 2888cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 2889cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2890cabdff1aSopenharmony_ci dst += (8 * dst_stride); 2891cabdff1aSopenharmony_ci } 2892cabdff1aSopenharmony_ci} 2893cabdff1aSopenharmony_ci 2894cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src, 2895cabdff1aSopenharmony_ci int32_t src_stride, 2896cabdff1aSopenharmony_ci uint8_t *dst, 
2897cabdff1aSopenharmony_ci int32_t dst_stride, 2898cabdff1aSopenharmony_ci const int8_t *filter, 2899cabdff1aSopenharmony_ci int32_t height, 2900cabdff1aSopenharmony_ci int32_t weight, 2901cabdff1aSopenharmony_ci int32_t offset, 2902cabdff1aSopenharmony_ci int32_t rnd_val) 2903cabdff1aSopenharmony_ci{ 2904cabdff1aSopenharmony_ci if (2 == height) { 2905cabdff1aSopenharmony_ci hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, 2906cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 2907cabdff1aSopenharmony_ci } else if (4 == height) { 2908cabdff1aSopenharmony_ci hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride, 2909cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 2910cabdff1aSopenharmony_ci } else if (6 == height) { 2911cabdff1aSopenharmony_ci hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, 2912cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 2913cabdff1aSopenharmony_ci } else { 2914cabdff1aSopenharmony_ci hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride, 2915cabdff1aSopenharmony_ci filter, height, weight, offset, 2916cabdff1aSopenharmony_ci rnd_val); 2917cabdff1aSopenharmony_ci } 2918cabdff1aSopenharmony_ci} 2919cabdff1aSopenharmony_ci 2920cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src, 2921cabdff1aSopenharmony_ci int32_t src_stride, 2922cabdff1aSopenharmony_ci uint8_t *dst, 2923cabdff1aSopenharmony_ci int32_t dst_stride, 2924cabdff1aSopenharmony_ci const int8_t *filter, 2925cabdff1aSopenharmony_ci int32_t height, 2926cabdff1aSopenharmony_ci int32_t weight, 2927cabdff1aSopenharmony_ci int32_t offset, 2928cabdff1aSopenharmony_ci int32_t rnd_val) 2929cabdff1aSopenharmony_ci{ 2930cabdff1aSopenharmony_ci uint32_t loop_cnt; 2931cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 2932cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2933cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2934cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 
2935cabdff1aSopenharmony_ci v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 2936cabdff1aSopenharmony_ci }; 2937cabdff1aSopenharmony_ci v16i8 mask1; 2938cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 2939cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2940cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 2941cabdff1aSopenharmony_ci v16i8 mask3, vec11; 2942cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 2943cabdff1aSopenharmony_ci 2944cabdff1aSopenharmony_ci src -= 1; 2945cabdff1aSopenharmony_ci 2946cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2947cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2948cabdff1aSopenharmony_ci 2949cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 2950cabdff1aSopenharmony_ci 2951cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 2952cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 2953cabdff1aSopenharmony_ci 2954cabdff1aSopenharmony_ci weight *= 128; 2955cabdff1aSopenharmony_ci rnd_val -= 6; 2956cabdff1aSopenharmony_ci 2957cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 2958cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 2959cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 2960cabdff1aSopenharmony_ci 2961cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 2962cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 2963cabdff1aSopenharmony_ci 2964cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2965cabdff1aSopenharmony_ci mask3 = mask2 + 2; 2966cabdff1aSopenharmony_ci 2967cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2968cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 2969cabdff1aSopenharmony_ci src += (4 * src_stride); 2970cabdff1aSopenharmony_ci 2971cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2972cabdff1aSopenharmony_ci 
2973cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2974cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 2975cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 2976cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 2977cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9); 2978cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11); 2979cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 2980cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 2981cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 2982cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 2983cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 2984cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 2985cabdff1aSopenharmony_ci 2986cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 2987cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 2988cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 2989cabdff1aSopenharmony_ci 2990cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 2991cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 2992cabdff1aSopenharmony_ci 2993cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 2994cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2995cabdff1aSopenharmony_ci ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 2996cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2997cabdff1aSopenharmony_ci } 2998cabdff1aSopenharmony_ci} 2999cabdff1aSopenharmony_ci 3000cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src, 3001cabdff1aSopenharmony_ci int32_t src_stride, 3002cabdff1aSopenharmony_ci uint8_t 
*dst, 3003cabdff1aSopenharmony_ci int32_t dst_stride, 3004cabdff1aSopenharmony_ci const int8_t *filter, 3005cabdff1aSopenharmony_ci int32_t height, 3006cabdff1aSopenharmony_ci int32_t weight, 3007cabdff1aSopenharmony_ci int32_t offset, 3008cabdff1aSopenharmony_ci int32_t rnd_val) 3009cabdff1aSopenharmony_ci{ 3010cabdff1aSopenharmony_ci uint32_t loop_cnt; 3011cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 3012cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 3013cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3014cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3015cabdff1aSopenharmony_ci v16i8 mask1; 3016cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3017cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3018cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3019cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3020cabdff1aSopenharmony_ci 3021cabdff1aSopenharmony_ci src -= 1; 3022cabdff1aSopenharmony_ci 3023cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3024cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3025cabdff1aSopenharmony_ci 3026cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3027cabdff1aSopenharmony_ci 3028cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3029cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3030cabdff1aSopenharmony_ci 3031cabdff1aSopenharmony_ci weight *= 128; 3032cabdff1aSopenharmony_ci rnd_val -= 6; 3033cabdff1aSopenharmony_ci 3034cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3035cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3036cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3037cabdff1aSopenharmony_ci 3038cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3039cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3040cabdff1aSopenharmony_ci 
3041cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3042cabdff1aSopenharmony_ci 3043cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3044cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 3045cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 3046cabdff1aSopenharmony_ci src += (4 * src_stride); 3047cabdff1aSopenharmony_ci 3048cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 3049cabdff1aSopenharmony_ci 3050cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3051cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3052cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3053cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 3054cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3055cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3056cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3057cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3058cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 3059cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3); 3060cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5); 3061cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7); 3062cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3063cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3064cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3065cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3066cabdff1aSopenharmony_ci 3067cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 
3068cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3069cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3070cabdff1aSopenharmony_ci 3071cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 3072cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3073cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 3074cabdff1aSopenharmony_ci 3075cabdff1aSopenharmony_ci PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 3076cabdff1aSopenharmony_ci out0, out1, out2, out3); 3077cabdff1aSopenharmony_ci 3078cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, dst_stride); 3079cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3080cabdff1aSopenharmony_ci } 3081cabdff1aSopenharmony_ci} 3082cabdff1aSopenharmony_ci 3083cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src, 3084cabdff1aSopenharmony_ci int32_t src_stride, 3085cabdff1aSopenharmony_ci uint8_t *dst, 3086cabdff1aSopenharmony_ci int32_t dst_stride, 3087cabdff1aSopenharmony_ci const int8_t *filter, 3088cabdff1aSopenharmony_ci int32_t height, 3089cabdff1aSopenharmony_ci int32_t weight, 3090cabdff1aSopenharmony_ci int32_t offset, 3091cabdff1aSopenharmony_ci int32_t rnd_val) 3092cabdff1aSopenharmony_ci{ 3093cabdff1aSopenharmony_ci uint32_t loop_cnt; 3094cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 3095cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 3096cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3097cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 3098cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3099cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 3100cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3101cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3102cabdff1aSopenharmony_ci 3103cabdff1aSopenharmony_ci src -= 1; 3104cabdff1aSopenharmony_ci 3105cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3106cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, 
filt1); 3107cabdff1aSopenharmony_ci 3108cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3109cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3110cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3111cabdff1aSopenharmony_ci 3112cabdff1aSopenharmony_ci weight *= 128; 3113cabdff1aSopenharmony_ci rnd_val -= 6; 3114cabdff1aSopenharmony_ci 3115cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3116cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3117cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3118cabdff1aSopenharmony_ci 3119cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3120cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3121cabdff1aSopenharmony_ci 3122cabdff1aSopenharmony_ci mask0 = LD_SB(&ff_hevc_mask_arr[0]); 3123cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3124cabdff1aSopenharmony_ci mask2 = mask0 + 8; 3125cabdff1aSopenharmony_ci mask3 = mask0 + 10; 3126cabdff1aSopenharmony_ci 3127cabdff1aSopenharmony_ci for (loop_cnt = 16; loop_cnt--;) { 3128cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src0, src2); 3129cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src1, src3); 3130cabdff1aSopenharmony_ci src += (2 * src_stride); 3131cabdff1aSopenharmony_ci 3132cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 3133cabdff1aSopenharmony_ci 3134cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3135cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3); 3136cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3137cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7); 3138cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3139cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3140cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 
3141cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3142cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 3143cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3); 3144cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3145cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3146cabdff1aSopenharmony_ci 3147cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3148cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3149cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3150cabdff1aSopenharmony_ci 3151cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 3152cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 3153cabdff1aSopenharmony_ci 3154cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 3155cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, dst_stride); 3156cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 16, dst_stride); 3157cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3158cabdff1aSopenharmony_ci } 3159cabdff1aSopenharmony_ci} 3160cabdff1aSopenharmony_ci 3161cabdff1aSopenharmony_cistatic void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src, 3162cabdff1aSopenharmony_ci int32_t src_stride, 3163cabdff1aSopenharmony_ci uint8_t *dst, 3164cabdff1aSopenharmony_ci int32_t dst_stride, 3165cabdff1aSopenharmony_ci const int8_t *filter, 3166cabdff1aSopenharmony_ci int32_t height, 3167cabdff1aSopenharmony_ci int32_t weight, 3168cabdff1aSopenharmony_ci int32_t offset, 3169cabdff1aSopenharmony_ci int32_t rnd_val) 3170cabdff1aSopenharmony_ci{ 3171cabdff1aSopenharmony_ci uint32_t loop_cnt; 3172cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 3173cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 3174cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3175cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 
3176cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 3177cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3178cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3179cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3180cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3181cabdff1aSopenharmony_ci 3182cabdff1aSopenharmony_ci src -= 1; 3183cabdff1aSopenharmony_ci 3184cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3185cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3186cabdff1aSopenharmony_ci 3187cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3188cabdff1aSopenharmony_ci 3189cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3190cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3191cabdff1aSopenharmony_ci 3192cabdff1aSopenharmony_ci weight *= 128; 3193cabdff1aSopenharmony_ci rnd_val -= 6; 3194cabdff1aSopenharmony_ci 3195cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3196cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3197cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3198cabdff1aSopenharmony_ci 3199cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3200cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3201cabdff1aSopenharmony_ci 3202cabdff1aSopenharmony_ci mask1 = mask0 + 2; 3203cabdff1aSopenharmony_ci mask2 = mask0 + 8; 3204cabdff1aSopenharmony_ci mask3 = mask0 + 10; 3205cabdff1aSopenharmony_ci 3206cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 3207cabdff1aSopenharmony_ci LD_SB2(src, 16, src0, src1); 3208cabdff1aSopenharmony_ci src2 = LD_SB(src + 24); 3209cabdff1aSopenharmony_ci src += src_stride; 3210cabdff1aSopenharmony_ci LD_SB2(src, 16, src3, src4); 3211cabdff1aSopenharmony_ci src5 = LD_SB(src + 24); 3212cabdff1aSopenharmony_ci src += src_stride; 3213cabdff1aSopenharmony_ci XORI_B6_128_SB(src0, src1, src2, src3, 
src4, src5); 3214cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3215cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3); 3216cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5); 3217cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7); 3218cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3219cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3220cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3221cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3222cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3223cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3); 3224cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5); 3225cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7); 3226cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 3227cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 3228cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 3229cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 3230cabdff1aSopenharmony_ci 3231cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3232cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3233cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3234cabdff1aSopenharmony_ci 3235cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 3236cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3237cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 3238cabdff1aSopenharmony_ci 3239cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3240cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, 
out3); 3241cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 3242cabdff1aSopenharmony_ci dst += dst_stride; 3243cabdff1aSopenharmony_ci ST_UB2(out2, out3, dst, 16); 3244cabdff1aSopenharmony_ci dst += dst_stride; 3245cabdff1aSopenharmony_ci } 3246cabdff1aSopenharmony_ci} 3247cabdff1aSopenharmony_ci 3248cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src, 3249cabdff1aSopenharmony_ci int32_t src_stride, 3250cabdff1aSopenharmony_ci uint8_t *dst, 3251cabdff1aSopenharmony_ci int32_t dst_stride, 3252cabdff1aSopenharmony_ci const int8_t *filter, 3253cabdff1aSopenharmony_ci int32_t weight, 3254cabdff1aSopenharmony_ci int32_t offset, 3255cabdff1aSopenharmony_ci int32_t rnd_val) 3256cabdff1aSopenharmony_ci{ 3257cabdff1aSopenharmony_ci v16u8 out; 3258cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3259cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3260cabdff1aSopenharmony_ci v16i8 src2110, src4332; 3261cabdff1aSopenharmony_ci v8i16 dst0; 3262cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l; 3263cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3264cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3265cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3266cabdff1aSopenharmony_ci 3267cabdff1aSopenharmony_ci src -= src_stride; 3268cabdff1aSopenharmony_ci 3269cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3270cabdff1aSopenharmony_ci 3271cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3272cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3273cabdff1aSopenharmony_ci 3274cabdff1aSopenharmony_ci weight *= 128; 3275cabdff1aSopenharmony_ci rnd_val -= 6; 3276cabdff1aSopenharmony_ci 3277cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3278cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3279cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3280cabdff1aSopenharmony_ci 3281cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 
3282cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3283cabdff1aSopenharmony_ci 3284cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3285cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3286cabdff1aSopenharmony_ci 3287cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3288cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3289cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3290cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 3291cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 3292cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3293cabdff1aSopenharmony_ci ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l); 3294cabdff1aSopenharmony_ci DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l); 3295cabdff1aSopenharmony_ci SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); 3296cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 3297cabdff1aSopenharmony_ci dst0 = __msa_adds_s_h(dst0, offset_vec); 3298cabdff1aSopenharmony_ci CLIP_SH_0_255(dst0); 3299cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); 3300cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 3301cabdff1aSopenharmony_ci} 3302cabdff1aSopenharmony_ci 3303cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src, 3304cabdff1aSopenharmony_ci int32_t src_stride, 3305cabdff1aSopenharmony_ci uint8_t *dst, 3306cabdff1aSopenharmony_ci int32_t dst_stride, 3307cabdff1aSopenharmony_ci const int8_t *filter, 3308cabdff1aSopenharmony_ci int32_t weight, 3309cabdff1aSopenharmony_ci int32_t offset, 3310cabdff1aSopenharmony_ci int32_t rnd_val) 3311cabdff1aSopenharmony_ci{ 3312cabdff1aSopenharmony_ci v16u8 out; 3313cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 3314cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, 
src21_r, src43_r, src65_r; 3315cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554; 3316cabdff1aSopenharmony_ci v8i16 dst0, dst1; 3317cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3318cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3319cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3320cabdff1aSopenharmony_ci 3321cabdff1aSopenharmony_ci src -= src_stride; 3322cabdff1aSopenharmony_ci 3323cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3324cabdff1aSopenharmony_ci 3325cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3326cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3327cabdff1aSopenharmony_ci 3328cabdff1aSopenharmony_ci weight *= 128; 3329cabdff1aSopenharmony_ci rnd_val -= 6; 3330cabdff1aSopenharmony_ci 3331cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3332cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3333cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3334cabdff1aSopenharmony_ci 3335cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3336cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3337cabdff1aSopenharmony_ci 3338cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3339cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3340cabdff1aSopenharmony_ci 3341cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 3342cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3343cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3344cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 3345cabdff1aSopenharmony_ci ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 3346cabdff1aSopenharmony_ci src2110, src4332, src6554); 3347cabdff1aSopenharmony_ci XORI_B3_128_SB(src2110, src4332, src6554); 3348cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 
3349cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3350cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec, 3351cabdff1aSopenharmony_ci dst0, dst1); 3352cabdff1aSopenharmony_ci 3353cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 3354cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 3355cabdff1aSopenharmony_ci} 3356cabdff1aSopenharmony_ci 3357cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src, 3358cabdff1aSopenharmony_ci int32_t src_stride, 3359cabdff1aSopenharmony_ci uint8_t *dst, 3360cabdff1aSopenharmony_ci int32_t dst_stride, 3361cabdff1aSopenharmony_ci const int8_t *filter, 3362cabdff1aSopenharmony_ci int32_t height, 3363cabdff1aSopenharmony_ci int32_t weight, 3364cabdff1aSopenharmony_ci int32_t offset, 3365cabdff1aSopenharmony_ci int32_t rnd_val) 3366cabdff1aSopenharmony_ci{ 3367cabdff1aSopenharmony_ci int32_t loop_cnt; 3368cabdff1aSopenharmony_ci v16u8 out0, out1; 3369cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3370cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 3371cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 3372cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776; 3373cabdff1aSopenharmony_ci v16i8 src10998; 3374cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, filt0, filt1; 3375cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3376cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3377cabdff1aSopenharmony_ci 3378cabdff1aSopenharmony_ci src -= src_stride; 3379cabdff1aSopenharmony_ci 3380cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3381cabdff1aSopenharmony_ci 3382cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3383cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3384cabdff1aSopenharmony_ci 3385cabdff1aSopenharmony_ci 
weight *= 128; 3386cabdff1aSopenharmony_ci rnd_val -= 6; 3387cabdff1aSopenharmony_ci 3388cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3389cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3390cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3391cabdff1aSopenharmony_ci 3392cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3393cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3394cabdff1aSopenharmony_ci 3395cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3396cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3397cabdff1aSopenharmony_ci 3398cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3399cabdff1aSopenharmony_ci src += (3 * src_stride); 3400cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3401cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 3402cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 3403cabdff1aSopenharmony_ci 3404cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 3405cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 3406cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 3407cabdff1aSopenharmony_ci src += (8 * src_stride); 3408cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3409cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 3410cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3411cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3412cabdff1aSopenharmony_ci ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, 3413cabdff1aSopenharmony_ci src109_r, src98_r, src4332, src6554, src8776, src10998); 3414cabdff1aSopenharmony_ci XORI_B4_128_SB(src4332, src6554, src8776, src10998); 3415cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 
3416cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3417cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1); 3418cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1); 3419cabdff1aSopenharmony_ci 3420cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3421cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3422cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3423cabdff1aSopenharmony_ci 3424cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3425cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 3426cabdff1aSopenharmony_ci dst += (8 * dst_stride); 3427cabdff1aSopenharmony_ci 3428cabdff1aSopenharmony_ci src2 = src10; 3429cabdff1aSopenharmony_ci src2110 = src10998; 3430cabdff1aSopenharmony_ci } 3431cabdff1aSopenharmony_ci} 3432cabdff1aSopenharmony_ci 3433cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src, 3434cabdff1aSopenharmony_ci int32_t src_stride, 3435cabdff1aSopenharmony_ci uint8_t *dst, 3436cabdff1aSopenharmony_ci int32_t dst_stride, 3437cabdff1aSopenharmony_ci const int8_t *filter, 3438cabdff1aSopenharmony_ci int32_t height, 3439cabdff1aSopenharmony_ci int32_t weight, 3440cabdff1aSopenharmony_ci int32_t offset, 3441cabdff1aSopenharmony_ci int32_t rnd_val) 3442cabdff1aSopenharmony_ci{ 3443cabdff1aSopenharmony_ci if (2 == height) { 3444cabdff1aSopenharmony_ci hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, 3445cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 3446cabdff1aSopenharmony_ci } else if (4 == height) { 3447cabdff1aSopenharmony_ci hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, 3448cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 3449cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 3450cabdff1aSopenharmony_ci hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, 3451cabdff1aSopenharmony_ci filter, 
height, weight, offset, 3452cabdff1aSopenharmony_ci rnd_val); 3453cabdff1aSopenharmony_ci } 3454cabdff1aSopenharmony_ci} 3455cabdff1aSopenharmony_ci 3456cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src, 3457cabdff1aSopenharmony_ci int32_t src_stride, 3458cabdff1aSopenharmony_ci uint8_t *dst, 3459cabdff1aSopenharmony_ci int32_t dst_stride, 3460cabdff1aSopenharmony_ci const int8_t *filter, 3461cabdff1aSopenharmony_ci int32_t height, 3462cabdff1aSopenharmony_ci int32_t weight, 3463cabdff1aSopenharmony_ci int32_t offset, 3464cabdff1aSopenharmony_ci int32_t rnd_val) 3465cabdff1aSopenharmony_ci{ 3466cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 3467cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3468cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3469cabdff1aSopenharmony_ci v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r; 3470cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3471cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3472cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3473cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3474cabdff1aSopenharmony_ci 3475cabdff1aSopenharmony_ci src -= src_stride; 3476cabdff1aSopenharmony_ci 3477cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3478cabdff1aSopenharmony_ci 3479cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3480cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3481cabdff1aSopenharmony_ci 3482cabdff1aSopenharmony_ci weight *= 128; 3483cabdff1aSopenharmony_ci rnd_val -= 6; 3484cabdff1aSopenharmony_ci 3485cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3486cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3487cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3488cabdff1aSopenharmony_ci 3489cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3490cabdff1aSopenharmony_ci 
offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3491cabdff1aSopenharmony_ci 3492cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3493cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3494cabdff1aSopenharmony_ci 3495cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3496cabdff1aSopenharmony_ci src += (3 * src_stride); 3497cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 3498cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3499cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3500cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3501cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3502cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 3503cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3504cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3505cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3506cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3507cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3508cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3509cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1); 3510cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 3511cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 3512cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 3513cabdff1aSopenharmony_ci 3514cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, 3515cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3516cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 3517cabdff1aSopenharmony_ci 
HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, 3518cabdff1aSopenharmony_ci weight_vec, offset_vec, rnd_vec, 3519cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7); 3520cabdff1aSopenharmony_ci 3521cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3522cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 3523cabdff1aSopenharmony_ci ST_W2(out0, 0, 2, dst, dst_stride); 3524cabdff1aSopenharmony_ci ST_H2(out0, 2, 6, dst + 4, dst_stride); 3525cabdff1aSopenharmony_ci ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 3526cabdff1aSopenharmony_ci ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 3527cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3528cabdff1aSopenharmony_ci ST_W2(out2, 0, 2, dst, dst_stride); 3529cabdff1aSopenharmony_ci ST_H2(out2, 2, 6, dst + 4, dst_stride); 3530cabdff1aSopenharmony_ci ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride); 3531cabdff1aSopenharmony_ci ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 3532cabdff1aSopenharmony_ci} 3533cabdff1aSopenharmony_ci 3534cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src, 3535cabdff1aSopenharmony_ci int32_t src_stride, 3536cabdff1aSopenharmony_ci uint8_t *dst, 3537cabdff1aSopenharmony_ci int32_t dst_stride, 3538cabdff1aSopenharmony_ci const int8_t *filter, 3539cabdff1aSopenharmony_ci int32_t weight, 3540cabdff1aSopenharmony_ci int32_t offset, 3541cabdff1aSopenharmony_ci int32_t rnd_val) 3542cabdff1aSopenharmony_ci{ 3543cabdff1aSopenharmony_ci v16u8 out; 3544cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3545cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3546cabdff1aSopenharmony_ci v8i16 dst0, dst1; 3547cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3548cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3549cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3550cabdff1aSopenharmony_ci 3551cabdff1aSopenharmony_ci src -= src_stride; 3552cabdff1aSopenharmony_ci 
3553cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3554cabdff1aSopenharmony_ci 3555cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3556cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3557cabdff1aSopenharmony_ci 3558cabdff1aSopenharmony_ci weight *= 128; 3559cabdff1aSopenharmony_ci rnd_val -= 6; 3560cabdff1aSopenharmony_ci 3561cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3562cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3563cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3564cabdff1aSopenharmony_ci 3565cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3566cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3567cabdff1aSopenharmony_ci 3568cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3569cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3570cabdff1aSopenharmony_ci 3571cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3572cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3573cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3574cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3575cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3576cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3577cabdff1aSopenharmony_ci 3578cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec, 3579cabdff1aSopenharmony_ci dst0, dst1); 3580cabdff1aSopenharmony_ci 3581cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 3582cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, dst_stride); 3583cabdff1aSopenharmony_ci} 3584cabdff1aSopenharmony_ci 3585cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src, 3586cabdff1aSopenharmony_ci int32_t src_stride, 3587cabdff1aSopenharmony_ci uint8_t *dst, 
3588cabdff1aSopenharmony_ci int32_t dst_stride, 3589cabdff1aSopenharmony_ci const int8_t *filter, 3590cabdff1aSopenharmony_ci int32_t weight, 3591cabdff1aSopenharmony_ci int32_t offset, 3592cabdff1aSopenharmony_ci int32_t rnd_val) 3593cabdff1aSopenharmony_ci{ 3594cabdff1aSopenharmony_ci v16u8 out0, out1; 3595cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3596cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3597cabdff1aSopenharmony_ci v16i8 src5, src6, src54_r, src65_r; 3598cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3599cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 3600cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3601cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3602cabdff1aSopenharmony_ci 3603cabdff1aSopenharmony_ci src -= src_stride; 3604cabdff1aSopenharmony_ci 3605cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3606cabdff1aSopenharmony_ci 3607cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3608cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3609cabdff1aSopenharmony_ci 3610cabdff1aSopenharmony_ci weight *= 128; 3611cabdff1aSopenharmony_ci rnd_val -= 6; 3612cabdff1aSopenharmony_ci 3613cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3614cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3615cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3616cabdff1aSopenharmony_ci 3617cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3618cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3619cabdff1aSopenharmony_ci 3620cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3621cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3622cabdff1aSopenharmony_ci 3623cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 3624cabdff1aSopenharmony_ci src += (3 * src_stride); 3625cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, 
src6); 3626cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3627cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3628cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 3629cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3630cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3631cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3632cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3633cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 3634cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 3635cabdff1aSopenharmony_ci dst3); 3636cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3637cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 3638cabdff1aSopenharmony_ci} 3639cabdff1aSopenharmony_ci 3640cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src, 3641cabdff1aSopenharmony_ci int32_t src_stride, 3642cabdff1aSopenharmony_ci uint8_t *dst, 3643cabdff1aSopenharmony_ci int32_t dst_stride, 3644cabdff1aSopenharmony_ci const int8_t *filter, 3645cabdff1aSopenharmony_ci int32_t weight, 3646cabdff1aSopenharmony_ci int32_t offset, 3647cabdff1aSopenharmony_ci int32_t rnd_val) 3648cabdff1aSopenharmony_ci{ 3649cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 3650cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3651cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r; 3652cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r; 3653cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 3654cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3655cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3656cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3657cabdff1aSopenharmony_ci 
3658cabdff1aSopenharmony_ci src -= src_stride; 3659cabdff1aSopenharmony_ci 3660cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3661cabdff1aSopenharmony_ci 3662cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3663cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3664cabdff1aSopenharmony_ci 3665cabdff1aSopenharmony_ci weight *= 128; 3666cabdff1aSopenharmony_ci rnd_val -= 6; 3667cabdff1aSopenharmony_ci 3668cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3669cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3670cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3671cabdff1aSopenharmony_ci 3672cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3673cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3674cabdff1aSopenharmony_ci 3675cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3676cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3677cabdff1aSopenharmony_ci 3678cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3679cabdff1aSopenharmony_ci src += (3 * src_stride); 3680cabdff1aSopenharmony_ci LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8); 3681cabdff1aSopenharmony_ci 3682cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3683cabdff1aSopenharmony_ci XORI_B6_128_SB(src3, src4, src5, src6, src7, src8); 3684cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3685cabdff1aSopenharmony_ci src32_r, src43_r); 3686cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 3687cabdff1aSopenharmony_ci src76_r, src87_r); 3688cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3689cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3690cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3691cabdff1aSopenharmony_ci dst3 = 
HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3692cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1); 3693cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 3694cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 3695cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, dst3); 3696cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec, 3697cabdff1aSopenharmony_ci dst4, dst5); 3698cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 3699cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 3700cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 3701cabdff1aSopenharmony_ci} 3702cabdff1aSopenharmony_ci 3703cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src, 3704cabdff1aSopenharmony_ci int32_t src_stride, 3705cabdff1aSopenharmony_ci uint8_t *dst, 3706cabdff1aSopenharmony_ci int32_t dst_stride, 3707cabdff1aSopenharmony_ci const int8_t *filter, 3708cabdff1aSopenharmony_ci int32_t height, 3709cabdff1aSopenharmony_ci int32_t weight, 3710cabdff1aSopenharmony_ci int32_t offset, 3711cabdff1aSopenharmony_ci int32_t rnd_val) 3712cabdff1aSopenharmony_ci{ 3713cabdff1aSopenharmony_ci int32_t loop_cnt; 3714cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 3715cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3716cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3717cabdff1aSopenharmony_ci v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r; 3718cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3719cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3720cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3721cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3722cabdff1aSopenharmony_ci 
3723cabdff1aSopenharmony_ci src -= src_stride; 3724cabdff1aSopenharmony_ci 3725cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3726cabdff1aSopenharmony_ci 3727cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3728cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3729cabdff1aSopenharmony_ci 3730cabdff1aSopenharmony_ci weight *= 128; 3731cabdff1aSopenharmony_ci rnd_val -= 6; 3732cabdff1aSopenharmony_ci 3733cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3734cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3735cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3736cabdff1aSopenharmony_ci 3737cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3738cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3739cabdff1aSopenharmony_ci 3740cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3741cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3742cabdff1aSopenharmony_ci 3743cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3744cabdff1aSopenharmony_ci src += (3 * src_stride); 3745cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3746cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3747cabdff1aSopenharmony_ci 3748cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 3749cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 3750cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 3751cabdff1aSopenharmony_ci src += (8 * src_stride); 3752cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3753cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3754cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 3755cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3756cabdff1aSopenharmony_ci ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3757cabdff1aSopenharmony_ci 
dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3758cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3759cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3760cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3761cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1); 3762cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 3763cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 3764cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 3765cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 3766cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 3767cabdff1aSopenharmony_ci dst3); 3768cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 3769cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 3770cabdff1aSopenharmony_ci dst7); 3771cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 3772cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 3773cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 3774cabdff1aSopenharmony_ci dst += (8 * dst_stride); 3775cabdff1aSopenharmony_ci 3776cabdff1aSopenharmony_ci src2 = src10; 3777cabdff1aSopenharmony_ci src10_r = src98_r; 3778cabdff1aSopenharmony_ci src21_r = src109_r; 3779cabdff1aSopenharmony_ci } 3780cabdff1aSopenharmony_ci} 3781cabdff1aSopenharmony_ci 3782cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src, 3783cabdff1aSopenharmony_ci int32_t src_stride, 3784cabdff1aSopenharmony_ci uint8_t *dst, 3785cabdff1aSopenharmony_ci int32_t dst_stride, 3786cabdff1aSopenharmony_ci const int8_t *filter, 3787cabdff1aSopenharmony_ci int32_t height, 3788cabdff1aSopenharmony_ci int32_t weight, 3789cabdff1aSopenharmony_ci 
int32_t offset, 3790cabdff1aSopenharmony_ci int32_t rnd_val) 3791cabdff1aSopenharmony_ci{ 3792cabdff1aSopenharmony_ci if (2 == height) { 3793cabdff1aSopenharmony_ci hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, 3794cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 3795cabdff1aSopenharmony_ci } else if (4 == height) { 3796cabdff1aSopenharmony_ci hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride, 3797cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 3798cabdff1aSopenharmony_ci } else if (6 == height) { 3799cabdff1aSopenharmony_ci hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, 3800cabdff1aSopenharmony_ci filter, weight, offset, rnd_val); 3801cabdff1aSopenharmony_ci } else { 3802cabdff1aSopenharmony_ci hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride, 3803cabdff1aSopenharmony_ci filter, height, weight, offset, 3804cabdff1aSopenharmony_ci rnd_val); 3805cabdff1aSopenharmony_ci } 3806cabdff1aSopenharmony_ci} 3807cabdff1aSopenharmony_ci 3808cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src, 3809cabdff1aSopenharmony_ci int32_t src_stride, 3810cabdff1aSopenharmony_ci uint8_t *dst, 3811cabdff1aSopenharmony_ci int32_t dst_stride, 3812cabdff1aSopenharmony_ci const int8_t *filter, 3813cabdff1aSopenharmony_ci int32_t height, 3814cabdff1aSopenharmony_ci int32_t weight, 3815cabdff1aSopenharmony_ci int32_t offset, 3816cabdff1aSopenharmony_ci int32_t rnd_val) 3817cabdff1aSopenharmony_ci{ 3818cabdff1aSopenharmony_ci int32_t loop_cnt; 3819cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5; 3820cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3821cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3822cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 3823cabdff1aSopenharmony_ci v16i8 src2110, src4332; 3824cabdff1aSopenharmony_ci v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r; 
3825cabdff1aSopenharmony_ci v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998; 3826cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3827cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 3828cabdff1aSopenharmony_ci v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec; 3829cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3830cabdff1aSopenharmony_ci 3831cabdff1aSopenharmony_ci src -= (1 * src_stride); 3832cabdff1aSopenharmony_ci 3833cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3834cabdff1aSopenharmony_ci 3835cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3836cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3837cabdff1aSopenharmony_ci 3838cabdff1aSopenharmony_ci weight *= 128; 3839cabdff1aSopenharmony_ci rnd_val -= 6; 3840cabdff1aSopenharmony_ci 3841cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3842cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3843cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3844cabdff1aSopenharmony_ci 3845cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3846cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3847cabdff1aSopenharmony_ci 3848cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3849cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3850cabdff1aSopenharmony_ci 3851cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3852cabdff1aSopenharmony_ci src += (3 * src_stride); 3853cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3854cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3855cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3856cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); 3857cabdff1aSopenharmony_ci 3858cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 3859cabdff1aSopenharmony_ci LD_SB8(src, 
src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 3860cabdff1aSopenharmony_ci src += (8 * src_stride); 3861cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3862cabdff1aSopenharmony_ci ILVRL_B2_SB(src3, src2, src32_r, src32_l); 3863cabdff1aSopenharmony_ci ILVRL_B2_SB(src4, src3, src43_r, src43_l); 3864cabdff1aSopenharmony_ci ILVRL_B2_SB(src5, src4, src54_r, src54_l); 3865cabdff1aSopenharmony_ci ILVRL_B2_SB(src6, src5, src65_r, src65_l); 3866cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); 3867cabdff1aSopenharmony_ci src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); 3868cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3869cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3870cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3871cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3872cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1); 3873cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1); 3874cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 3875cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 3876cabdff1aSopenharmony_ci dst3); 3877cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, 3878cabdff1aSopenharmony_ci rnd_vec, dst4, dst5); 3879cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 3880cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 3881cabdff1aSopenharmony_ci ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 3882cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3883cabdff1aSopenharmony_ci 3884cabdff1aSopenharmony_ci ILVRL_B2_SB(src7, src6, src76_r, src76_l); 3885cabdff1aSopenharmony_ci ILVRL_B2_SB(src8, src7, src87_r, 
src87_l); 3886cabdff1aSopenharmony_ci ILVRL_B2_SB(src9, src8, src98_r, src98_l); 3887cabdff1aSopenharmony_ci ILVRL_B2_SB(src10, src9, src109_r, src109_l); 3888cabdff1aSopenharmony_ci src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l); 3889cabdff1aSopenharmony_ci src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l); 3890cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1); 3891cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 3892cabdff1aSopenharmony_ci dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 3893cabdff1aSopenharmony_ci dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 3894cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1); 3895cabdff1aSopenharmony_ci dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1); 3896cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec, 3897cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst6, dst7, dst8, 3898cabdff1aSopenharmony_ci dst9); 3899cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec, 3900cabdff1aSopenharmony_ci rnd_vec, dst10, dst11); 3901cabdff1aSopenharmony_ci PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); 3902cabdff1aSopenharmony_ci ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride); 3903cabdff1aSopenharmony_ci ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride); 3904cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3905cabdff1aSopenharmony_ci 3906cabdff1aSopenharmony_ci src2 = src10; 3907cabdff1aSopenharmony_ci src10_r = src98_r; 3908cabdff1aSopenharmony_ci src21_r = src109_r; 3909cabdff1aSopenharmony_ci src2110 = src10998; 3910cabdff1aSopenharmony_ci } 3911cabdff1aSopenharmony_ci} 3912cabdff1aSopenharmony_ci 3913cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src, 3914cabdff1aSopenharmony_ci int32_t src_stride, 3915cabdff1aSopenharmony_ci uint8_t *dst, 
3916cabdff1aSopenharmony_ci int32_t dst_stride, 3917cabdff1aSopenharmony_ci const int8_t *filter, 3918cabdff1aSopenharmony_ci int32_t height, 3919cabdff1aSopenharmony_ci int32_t weight, 3920cabdff1aSopenharmony_ci int32_t offset, 3921cabdff1aSopenharmony_ci int32_t rnd_val) 3922cabdff1aSopenharmony_ci{ 3923cabdff1aSopenharmony_ci int32_t loop_cnt; 3924cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 3925cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 3926cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3927cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src21_l, src43_l; 3928cabdff1aSopenharmony_ci v16i8 src54_r, src54_l, src65_r, src65_l, src6; 3929cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3930cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3931cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 3932cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 3933cabdff1aSopenharmony_ci 3934cabdff1aSopenharmony_ci src -= src_stride; 3935cabdff1aSopenharmony_ci 3936cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 3937cabdff1aSopenharmony_ci 3938cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 3939cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 3940cabdff1aSopenharmony_ci 3941cabdff1aSopenharmony_ci weight *= 128; 3942cabdff1aSopenharmony_ci rnd_val -= 6; 3943cabdff1aSopenharmony_ci 3944cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 3945cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 3946cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 3947cabdff1aSopenharmony_ci 3948cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 3949cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 3950cabdff1aSopenharmony_ci 3951cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3952cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3953cabdff1aSopenharmony_ci 
3954cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 3955cabdff1aSopenharmony_ci src += (3 * src_stride); 3956cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3957cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3958cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3959cabdff1aSopenharmony_ci 3960cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3961cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 3962cabdff1aSopenharmony_ci src += (4 * src_stride); 3963cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 3964cabdff1aSopenharmony_ci ILVRL_B2_SB(src3, src2, src32_r, src32_l); 3965cabdff1aSopenharmony_ci ILVRL_B2_SB(src4, src3, src43_r, src43_l); 3966cabdff1aSopenharmony_ci ILVRL_B2_SB(src5, src4, src54_r, src54_l); 3967cabdff1aSopenharmony_ci ILVRL_B2_SB(src6, src5, src65_r, src65_l); 3968cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 3969cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 3970cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 3971cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 3972cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 3973cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 3974cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1); 3975cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1); 3976cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 3977cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 3978cabdff1aSopenharmony_ci dst3); 3979cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 3980cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 
3981cabdff1aSopenharmony_ci dst7); 3982cabdff1aSopenharmony_ci PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1, 3983cabdff1aSopenharmony_ci out2, out3); 3984cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, dst_stride); 3985cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3986cabdff1aSopenharmony_ci 3987cabdff1aSopenharmony_ci src2 = src6; 3988cabdff1aSopenharmony_ci src10_r = src54_r; 3989cabdff1aSopenharmony_ci src21_r = src65_r; 3990cabdff1aSopenharmony_ci src10_l = src54_l; 3991cabdff1aSopenharmony_ci src21_l = src65_l; 3992cabdff1aSopenharmony_ci } 3993cabdff1aSopenharmony_ci} 3994cabdff1aSopenharmony_ci 3995cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src, 3996cabdff1aSopenharmony_ci int32_t src_stride, 3997cabdff1aSopenharmony_ci uint8_t *dst, 3998cabdff1aSopenharmony_ci int32_t dst_stride, 3999cabdff1aSopenharmony_ci const int8_t *filter, 4000cabdff1aSopenharmony_ci int32_t height, 4001cabdff1aSopenharmony_ci int32_t weight, 4002cabdff1aSopenharmony_ci int32_t offset, 4003cabdff1aSopenharmony_ci int32_t rnd_val) 4004cabdff1aSopenharmony_ci{ 4005cabdff1aSopenharmony_ci uint32_t loop_cnt; 4006cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5; 4007cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 4008cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10, src11, src12, src13; 4009cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 4010cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 4011cabdff1aSopenharmony_ci v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r; 4012cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4013cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10; 4014cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11; 4015cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 4016cabdff1aSopenharmony_ci 
4017cabdff1aSopenharmony_ci src -= src_stride; 4018cabdff1aSopenharmony_ci 4019cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 4020cabdff1aSopenharmony_ci 4021cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4022cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4023cabdff1aSopenharmony_ci 4024cabdff1aSopenharmony_ci weight *= 128; 4025cabdff1aSopenharmony_ci rnd_val -= 6; 4026cabdff1aSopenharmony_ci 4027cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 4028cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4029cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 4030cabdff1aSopenharmony_ci 4031cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 4032cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 4033cabdff1aSopenharmony_ci 4034cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 4035cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4036cabdff1aSopenharmony_ci 4037cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 4038cabdff1aSopenharmony_ci LD_SB3(src + 16, src_stride, src7, src8, src9); 4039cabdff1aSopenharmony_ci src += (3 * src_stride); 4040cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4041cabdff1aSopenharmony_ci XORI_B3_128_SB(src7, src8, src9); 4042cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4043cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4044cabdff1aSopenharmony_ci ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r); 4045cabdff1aSopenharmony_ci 4046cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 4047cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 4048cabdff1aSopenharmony_ci LD_SB4(src + 16, src_stride, src10, src11, src12, src13); 4049cabdff1aSopenharmony_ci src += (4 * src_stride); 4050cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 4051cabdff1aSopenharmony_ci XORI_B4_128_SB(src10, src11, 
src12, src13); 4052cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4053cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4054cabdff1aSopenharmony_ci ILVRL_B2_SB(src5, src4, src54_r, src54_l); 4055cabdff1aSopenharmony_ci ILVRL_B2_SB(src6, src5, src65_r, src65_l); 4056cabdff1aSopenharmony_ci ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r); 4057cabdff1aSopenharmony_ci ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r); 4058cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4059cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4060cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1); 4061cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1); 4062cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4063cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4064cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1); 4065cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1); 4066cabdff1aSopenharmony_ci dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1); 4067cabdff1aSopenharmony_ci dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1); 4068cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1); 4069cabdff1aSopenharmony_ci dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1); 4070cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 4071cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 4072cabdff1aSopenharmony_ci dst3); 4073cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 4074cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 4075cabdff1aSopenharmony_ci dst7); 4076cabdff1aSopenharmony_ci 
HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec, 4077cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst8, dst9, dst10, 4078cabdff1aSopenharmony_ci dst11); 4079cabdff1aSopenharmony_ci PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1, 4080cabdff1aSopenharmony_ci out2, out3); 4081cabdff1aSopenharmony_ci PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5); 4082cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, dst_stride); 4083cabdff1aSopenharmony_ci ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride); 4084cabdff1aSopenharmony_ci dst += (4 * dst_stride); 4085cabdff1aSopenharmony_ci 4086cabdff1aSopenharmony_ci src2 = src6; 4087cabdff1aSopenharmony_ci src9 = src13; 4088cabdff1aSopenharmony_ci src10_r = src54_r; 4089cabdff1aSopenharmony_ci src21_r = src65_r; 4090cabdff1aSopenharmony_ci src10_l = src54_l; 4091cabdff1aSopenharmony_ci src21_l = src65_l; 4092cabdff1aSopenharmony_ci src87_r = src1211_r; 4093cabdff1aSopenharmony_ci src98_r = src1312_r; 4094cabdff1aSopenharmony_ci } 4095cabdff1aSopenharmony_ci} 4096cabdff1aSopenharmony_ci 4097cabdff1aSopenharmony_cistatic void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src, 4098cabdff1aSopenharmony_ci int32_t src_stride, 4099cabdff1aSopenharmony_ci uint8_t *dst, 4100cabdff1aSopenharmony_ci int32_t dst_stride, 4101cabdff1aSopenharmony_ci const int8_t *filter, 4102cabdff1aSopenharmony_ci int32_t height, 4103cabdff1aSopenharmony_ci int32_t weight, 4104cabdff1aSopenharmony_ci int32_t offset, 4105cabdff1aSopenharmony_ci int32_t rnd_val) 4106cabdff1aSopenharmony_ci{ 4107cabdff1aSopenharmony_ci uint32_t loop_cnt; 4108cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 4109cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 4110cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r; 4111cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r; 4112cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 
4113cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src76_l, src98_l; 4114cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l; 4115cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4116cabdff1aSopenharmony_ci v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec; 4117cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 4118cabdff1aSopenharmony_ci 4119cabdff1aSopenharmony_ci src -= src_stride; 4120cabdff1aSopenharmony_ci 4121cabdff1aSopenharmony_ci weight = weight & 0x0000FFFF; 4122cabdff1aSopenharmony_ci 4123cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4124cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4125cabdff1aSopenharmony_ci 4126cabdff1aSopenharmony_ci weight *= 128; 4127cabdff1aSopenharmony_ci rnd_val -= 6; 4128cabdff1aSopenharmony_ci 4129cabdff1aSopenharmony_ci weight_vec_h = __msa_fill_h(weight); 4130cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4131cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val); 4132cabdff1aSopenharmony_ci 4133cabdff1aSopenharmony_ci weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec); 4134cabdff1aSopenharmony_ci offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h); 4135cabdff1aSopenharmony_ci 4136cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 4137cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4138cabdff1aSopenharmony_ci 4139cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 4140cabdff1aSopenharmony_ci LD_SB3(src + 16, src_stride, src5, src6, src7); 4141cabdff1aSopenharmony_ci src += (3 * src_stride); 4142cabdff1aSopenharmony_ci XORI_B6_128_SB(src0, src1, src2, src5, src6, src7); 4143cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 4144cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 4145cabdff1aSopenharmony_ci ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r); 4146cabdff1aSopenharmony_ci ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l); 4147cabdff1aSopenharmony_ci 
4148cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 4149cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src3, src4); 4150cabdff1aSopenharmony_ci LD_SB2(src + 16, src_stride, src8, src9); 4151cabdff1aSopenharmony_ci src += (2 * src_stride); 4152cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src8, src9); 4153cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 4154cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 4155cabdff1aSopenharmony_ci ILVRL_B2_SB(src8, src7, src87_r, src87_l); 4156cabdff1aSopenharmony_ci ILVRL_B2_SB(src9, src8, src98_r, src98_l); 4157cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1); 4158cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1); 4159cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1); 4160cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1); 4161cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1); 4162cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1); 4163cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1); 4164cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1); 4165cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec, 4166cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst0, dst1, dst2, 4167cabdff1aSopenharmony_ci dst3); 4168cabdff1aSopenharmony_ci HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec, 4169cabdff1aSopenharmony_ci offset_vec, rnd_vec, dst4, dst5, dst6, 4170cabdff1aSopenharmony_ci dst7); 4171cabdff1aSopenharmony_ci PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1, 4172cabdff1aSopenharmony_ci out2, out3); 4173cabdff1aSopenharmony_ci ST_UB2(out0, out2, dst, 16); 4174cabdff1aSopenharmony_ci dst += dst_stride; 4175cabdff1aSopenharmony_ci 
ST_UB2(out1, out3, dst, 16); 4176cabdff1aSopenharmony_ci dst += dst_stride; 4177cabdff1aSopenharmony_ci 4178cabdff1aSopenharmony_ci src2 = src4; 4179cabdff1aSopenharmony_ci src7 = src9; 4180cabdff1aSopenharmony_ci src10_r = src32_r; 4181cabdff1aSopenharmony_ci src21_r = src43_r; 4182cabdff1aSopenharmony_ci src10_l = src32_l; 4183cabdff1aSopenharmony_ci src21_l = src43_l; 4184cabdff1aSopenharmony_ci src65_r = src87_r; 4185cabdff1aSopenharmony_ci src76_r = src98_r; 4186cabdff1aSopenharmony_ci src65_l = src87_l; 4187cabdff1aSopenharmony_ci src76_l = src98_l; 4188cabdff1aSopenharmony_ci } 4189cabdff1aSopenharmony_ci} 4190cabdff1aSopenharmony_ci 4191cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src, 4192cabdff1aSopenharmony_ci int32_t src_stride, 4193cabdff1aSopenharmony_ci uint8_t *dst, 4194cabdff1aSopenharmony_ci int32_t dst_stride, 4195cabdff1aSopenharmony_ci const int8_t *filter_x, 4196cabdff1aSopenharmony_ci const int8_t *filter_y, 4197cabdff1aSopenharmony_ci int32_t weight, 4198cabdff1aSopenharmony_ci int32_t offset, 4199cabdff1aSopenharmony_ci int32_t rnd_val) 4200cabdff1aSopenharmony_ci{ 4201cabdff1aSopenharmony_ci v16u8 out; 4202cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 4203cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4204cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4205cabdff1aSopenharmony_ci v16i8 mask1; 4206cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filter_vec, tmp; 4207cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 4208cabdff1aSopenharmony_ci v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43; 4209cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 4210cabdff1aSopenharmony_ci v4i32 dst0, dst1, weight_vec, rnd_vec; 4211cabdff1aSopenharmony_ci 4212cabdff1aSopenharmony_ci src -= (src_stride + 1); 4213cabdff1aSopenharmony_ci 4214cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4215cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 
4216cabdff1aSopenharmony_ci 4217cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4218cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4219cabdff1aSopenharmony_ci 4220cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4221cabdff1aSopenharmony_ci 4222cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4223cabdff1aSopenharmony_ci 4224cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4225cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4226cabdff1aSopenharmony_ci 4227cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4228cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4229cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4230cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4231cabdff1aSopenharmony_ci 4232cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 4233cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4234cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1); 4235cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3); 4236cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5); 4237cabdff1aSopenharmony_ci dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4238cabdff1aSopenharmony_ci dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4239cabdff1aSopenharmony_ci dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4240cabdff1aSopenharmony_ci ILVRL_H2_SH(dst31, dst20, dst10, dst32); 4241cabdff1aSopenharmony_ci ILVRL_H2_SH(dst42, dst31, dst21, dst43); 4242cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 4243cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 4244cabdff1aSopenharmony_ci dst0 >>= 6; 4245cabdff1aSopenharmony_ci dst1 >>= 6; 4246cabdff1aSopenharmony_ci MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1); 4247cabdff1aSopenharmony_ci SRAR_W2_SW(dst0, 
dst1, rnd_vec); 4248cabdff1aSopenharmony_ci tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 4249cabdff1aSopenharmony_ci tmp += offset_vec; 4250cabdff1aSopenharmony_ci CLIP_SH_0_255(tmp); 4251cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); 4252cabdff1aSopenharmony_ci ST_W2(out, 0, 1, dst, dst_stride); 4253cabdff1aSopenharmony_ci} 4254cabdff1aSopenharmony_ci 4255cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src, 4256cabdff1aSopenharmony_ci int32_t src_stride, 4257cabdff1aSopenharmony_ci uint8_t *dst, 4258cabdff1aSopenharmony_ci int32_t dst_stride, 4259cabdff1aSopenharmony_ci const int8_t *filter_x, 4260cabdff1aSopenharmony_ci const int8_t *filter_y, 4261cabdff1aSopenharmony_ci int32_t weight, 4262cabdff1aSopenharmony_ci int32_t offset, 4263cabdff1aSopenharmony_ci int32_t rnd_val) 4264cabdff1aSopenharmony_ci{ 4265cabdff1aSopenharmony_ci v16u8 out; 4266cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 4267cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4268cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1; 4269cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4270cabdff1aSopenharmony_ci v16i8 mask1; 4271cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4272cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65; 4273cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 4274cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec; 4275cabdff1aSopenharmony_ci 4276cabdff1aSopenharmony_ci src -= (src_stride + 1); 4277cabdff1aSopenharmony_ci 4278cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4279cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4280cabdff1aSopenharmony_ci 4281cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4282cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4283cabdff1aSopenharmony_ci 
4284cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4285cabdff1aSopenharmony_ci 4286cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4287cabdff1aSopenharmony_ci 4288cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4289cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4290cabdff1aSopenharmony_ci 4291cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4292cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4293cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4294cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4295cabdff1aSopenharmony_ci 4296cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 4297cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 4298cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1); 4299cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3); 4300cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5); 4301cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7); 4302cabdff1aSopenharmony_ci dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4303cabdff1aSopenharmony_ci dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4304cabdff1aSopenharmony_ci dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4305cabdff1aSopenharmony_ci dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4306cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 4307cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 4308cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 4309cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 4310cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 4311cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1); 4312cabdff1aSopenharmony_ci dst3 = 
HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1); 4313cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 4314cabdff1aSopenharmony_ci MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1); 4315cabdff1aSopenharmony_ci MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3); 4316cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 4317cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); 4318cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 4319cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp0, tmp1); 4320cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 4321cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 4322cabdff1aSopenharmony_ci} 4323cabdff1aSopenharmony_ci 4324cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src, 4325cabdff1aSopenharmony_ci int32_t src_stride, 4326cabdff1aSopenharmony_ci uint8_t *dst, 4327cabdff1aSopenharmony_ci int32_t dst_stride, 4328cabdff1aSopenharmony_ci const int8_t *filter_x, 4329cabdff1aSopenharmony_ci const int8_t *filter_y, 4330cabdff1aSopenharmony_ci int32_t height, 4331cabdff1aSopenharmony_ci int32_t weight, 4332cabdff1aSopenharmony_ci int32_t offset, 4333cabdff1aSopenharmony_ci int32_t rnd_val) 4334cabdff1aSopenharmony_ci{ 4335cabdff1aSopenharmony_ci uint32_t loop_cnt; 4336cabdff1aSopenharmony_ci v16u8 out0, out1; 4337cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4338cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4339cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 4340cabdff1aSopenharmony_ci v16i8 mask1; 4341cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3; 4342cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4343cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 4344cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 
4345cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst65_r, dst87_r; 4346cabdff1aSopenharmony_ci v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec; 4347cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec; 4348cabdff1aSopenharmony_ci 4349cabdff1aSopenharmony_ci src -= (src_stride + 1); 4350cabdff1aSopenharmony_ci 4351cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4352cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4353cabdff1aSopenharmony_ci 4354cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4355cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4356cabdff1aSopenharmony_ci 4357cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4358cabdff1aSopenharmony_ci 4359cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4360cabdff1aSopenharmony_ci 4361cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4362cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4363cabdff1aSopenharmony_ci 4364cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4365cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4366cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4367cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4368cabdff1aSopenharmony_ci 4369cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 4370cabdff1aSopenharmony_ci src += (3 * src_stride); 4371cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4372cabdff1aSopenharmony_ci 4373cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 4374cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3); 4375cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4376cabdff1aSopenharmony_ci dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4377cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 4378cabdff1aSopenharmony_ci dst22 = (v8i16) 
__msa_splati_d((v2i64) dst21, 1); 4379cabdff1aSopenharmony_ci 4380cabdff1aSopenharmony_ci for (loop_cnt = height >> 3; loop_cnt--;) { 4381cabdff1aSopenharmony_ci LD_SB8(src, src_stride, 4382cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 4383cabdff1aSopenharmony_ci src += (8 * src_stride); 4384cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4385cabdff1aSopenharmony_ci 4386cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1); 4387cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3); 4388cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5); 4389cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7); 4390cabdff1aSopenharmony_ci dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4391cabdff1aSopenharmony_ci dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4392cabdff1aSopenharmony_ci dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4393cabdff1aSopenharmony_ci dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4394cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 4395cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 4396cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4397cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4398cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 4399cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 4400cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4401cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4402cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4403cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4404cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 
4405cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4406cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4407cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4408cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 4409cabdff1aSopenharmony_ci SRA_4V(dst4, dst5, dst6, dst7, 6); 4410cabdff1aSopenharmony_ci MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1); 4411cabdff1aSopenharmony_ci MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3); 4412cabdff1aSopenharmony_ci MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5); 4413cabdff1aSopenharmony_ci MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7); 4414cabdff1aSopenharmony_ci SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); 4415cabdff1aSopenharmony_ci SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); 4416cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, 4417cabdff1aSopenharmony_ci tmp2, tmp3); 4418cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 4419cabdff1aSopenharmony_ci ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); 4420cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4421cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4422cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4423cabdff1aSopenharmony_ci dst += (8 * dst_stride); 4424cabdff1aSopenharmony_ci 4425cabdff1aSopenharmony_ci dst10_r = dst98_r; 4426cabdff1aSopenharmony_ci dst21_r = dst109_r; 4427cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4428cabdff1aSopenharmony_ci } 4429cabdff1aSopenharmony_ci} 4430cabdff1aSopenharmony_ci 4431cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src, 4432cabdff1aSopenharmony_ci int32_t src_stride, 4433cabdff1aSopenharmony_ci uint8_t *dst, 4434cabdff1aSopenharmony_ci int32_t dst_stride, 4435cabdff1aSopenharmony_ci const int8_t 
*filter_x, 4436cabdff1aSopenharmony_ci const int8_t *filter_y, 4437cabdff1aSopenharmony_ci int32_t height, 4438cabdff1aSopenharmony_ci int32_t weight, 4439cabdff1aSopenharmony_ci int32_t offset, 4440cabdff1aSopenharmony_ci int32_t rnd_val) 4441cabdff1aSopenharmony_ci{ 4442cabdff1aSopenharmony_ci if (2 == height) { 4443cabdff1aSopenharmony_ci hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, 4444cabdff1aSopenharmony_ci filter_x, filter_y, weight, 4445cabdff1aSopenharmony_ci offset, rnd_val); 4446cabdff1aSopenharmony_ci } else if (4 == height) { 4447cabdff1aSopenharmony_ci hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, 4448cabdff1aSopenharmony_ci filter_x,filter_y, weight, 4449cabdff1aSopenharmony_ci offset, rnd_val); 4450cabdff1aSopenharmony_ci } else if (0 == (height % 8)) { 4451cabdff1aSopenharmony_ci hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride, 4452cabdff1aSopenharmony_ci filter_x, filter_y, height, weight, 4453cabdff1aSopenharmony_ci offset, rnd_val); 4454cabdff1aSopenharmony_ci } 4455cabdff1aSopenharmony_ci} 4456cabdff1aSopenharmony_ci 4457cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src, 4458cabdff1aSopenharmony_ci int32_t src_stride, 4459cabdff1aSopenharmony_ci uint8_t *dst, 4460cabdff1aSopenharmony_ci int32_t dst_stride, 4461cabdff1aSopenharmony_ci const int8_t *filter_x, 4462cabdff1aSopenharmony_ci const int8_t *filter_y, 4463cabdff1aSopenharmony_ci int32_t height, 4464cabdff1aSopenharmony_ci int32_t weight, 4465cabdff1aSopenharmony_ci int32_t offset, 4466cabdff1aSopenharmony_ci int32_t rnd_val) 4467cabdff1aSopenharmony_ci{ 4468cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 4469cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4470cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4471cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4472cabdff1aSopenharmony_ci v16i8 mask1; 4473cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, 
filter_vec; 4474cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4475cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9; 4476cabdff1aSopenharmony_ci v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4477cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r; 4478cabdff1aSopenharmony_ci v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l; 4479cabdff1aSopenharmony_ci v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l; 4480cabdff1aSopenharmony_ci v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l; 4481cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 4482cabdff1aSopenharmony_ci v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; 4483cabdff1aSopenharmony_ci v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec; 4484cabdff1aSopenharmony_ci 4485cabdff1aSopenharmony_ci src -= (src_stride + 1); 4486cabdff1aSopenharmony_ci 4487cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4488cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4489cabdff1aSopenharmony_ci 4490cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4491cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4492cabdff1aSopenharmony_ci 4493cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4494cabdff1aSopenharmony_ci 4495cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4496cabdff1aSopenharmony_ci 4497cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4498cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4499cabdff1aSopenharmony_ci 4500cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4501cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4502cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4503cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4504cabdff1aSopenharmony_ci 4505cabdff1aSopenharmony_ci 
LD_SB3(src, src_stride, src0, src1, src2); 4506cabdff1aSopenharmony_ci src += (3 * src_stride); 4507cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4508cabdff1aSopenharmony_ci 4509cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4510cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4511cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4512cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4513cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4514cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4515cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 4516cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 4517cabdff1aSopenharmony_ci 4518cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 4519cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4520cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4521cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4522cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4523cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4524cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4525cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4526cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4527cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4528cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 4529cabdff1aSopenharmony_ci VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3); 4530cabdff1aSopenharmony_ci VSHF_B2_SB(src9, src9, src9, src9, 
mask0, mask1, vec4, vec5); 4531cabdff1aSopenharmony_ci VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7); 4532cabdff1aSopenharmony_ci dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4533cabdff1aSopenharmony_ci dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4534cabdff1aSopenharmony_ci dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4535cabdff1aSopenharmony_ci dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4536cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 4537cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 4538cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 4539cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 4540cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 4541cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 4542cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l); 4543cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l); 4544cabdff1aSopenharmony_ci PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l); 4545cabdff1aSopenharmony_ci PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l); 4546cabdff1aSopenharmony_ci dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l); 4547cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4548cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4549cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4550cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4551cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4552cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4553cabdff1aSopenharmony_ci dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4554cabdff1aSopenharmony_ci dst7_r = 
HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4555cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1); 4556cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1); 4557cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1); 4558cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1); 4559cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 4560cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 4561cabdff1aSopenharmony_ci SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); 4562cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 4563cabdff1aSopenharmony_ci MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 4564cabdff1aSopenharmony_ci MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r); 4565cabdff1aSopenharmony_ci MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r); 4566cabdff1aSopenharmony_ci MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l); 4567cabdff1aSopenharmony_ci MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l); 4568cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); 4569cabdff1aSopenharmony_ci SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec); 4570cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec); 4571cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); 4572cabdff1aSopenharmony_ci PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3); 4573cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5); 4574cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 4575cabdff1aSopenharmony_ci ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); 4576cabdff1aSopenharmony_ci ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5); 4577cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 
4578cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp4, tmp5); 4579cabdff1aSopenharmony_ci PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2); 4580cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4581cabdff1aSopenharmony_ci ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); 4582cabdff1aSopenharmony_ci} 4583cabdff1aSopenharmony_ci 4584cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src, 4585cabdff1aSopenharmony_ci int32_t src_stride, 4586cabdff1aSopenharmony_ci uint8_t *dst, 4587cabdff1aSopenharmony_ci int32_t dst_stride, 4588cabdff1aSopenharmony_ci const int8_t *filter_x, 4589cabdff1aSopenharmony_ci const int8_t *filter_y, 4590cabdff1aSopenharmony_ci int32_t weight, 4591cabdff1aSopenharmony_ci int32_t offset, 4592cabdff1aSopenharmony_ci int32_t rnd_val) 4593cabdff1aSopenharmony_ci{ 4594cabdff1aSopenharmony_ci v16u8 out; 4595cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 4596cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4597cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filter_vec; 4598cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4599cabdff1aSopenharmony_ci v16i8 mask1; 4600cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 4601cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4; 4602cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 4603cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 4604cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 4605cabdff1aSopenharmony_ci v8i16 tmp0, tmp1; 4606cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 4607cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 4608cabdff1aSopenharmony_ci 4609cabdff1aSopenharmony_ci src -= (src_stride + 1); 4610cabdff1aSopenharmony_ci 4611cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4612cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4613cabdff1aSopenharmony_ci 
4614cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4615cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4616cabdff1aSopenharmony_ci 4617cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4618cabdff1aSopenharmony_ci 4619cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4620cabdff1aSopenharmony_ci 4621cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4622cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4623cabdff1aSopenharmony_ci 4624cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4625cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4626cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4627cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4628cabdff1aSopenharmony_ci 4629cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 4630cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4631cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4632cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4633cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4634cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 4635cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 4636cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4637cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4638cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4639cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4640cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 4641cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4642cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4643cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, 
dst32_l); 4644cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4645cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4646cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4647cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4648cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4649cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4650cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 4651cabdff1aSopenharmony_ci MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l); 4652cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); 4653cabdff1aSopenharmony_ci PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); 4654cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 4655cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp0, tmp1); 4656cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 4657cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, dst_stride); 4658cabdff1aSopenharmony_ci} 4659cabdff1aSopenharmony_ci 4660cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src, 4661cabdff1aSopenharmony_ci int32_t src_stride, 4662cabdff1aSopenharmony_ci uint8_t *dst, 4663cabdff1aSopenharmony_ci int32_t dst_stride, 4664cabdff1aSopenharmony_ci const int8_t *filter_x, 4665cabdff1aSopenharmony_ci const int8_t *filter_y, 4666cabdff1aSopenharmony_ci int32_t width8mult, 4667cabdff1aSopenharmony_ci int32_t weight, 4668cabdff1aSopenharmony_ci int32_t offset, 4669cabdff1aSopenharmony_ci int32_t rnd_val) 4670cabdff1aSopenharmony_ci{ 4671cabdff1aSopenharmony_ci uint32_t cnt; 4672cabdff1aSopenharmony_ci v16u8 out0, out1; 4673cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1; 4674cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 
4675cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec; 4676cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3; 4677cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4678cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4679cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 4680cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4681cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 4682cabdff1aSopenharmony_ci 4683cabdff1aSopenharmony_ci src -= (src_stride + 1); 4684cabdff1aSopenharmony_ci 4685cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4686cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4687cabdff1aSopenharmony_ci 4688cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4689cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4690cabdff1aSopenharmony_ci 4691cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4692cabdff1aSopenharmony_ci 4693cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 4694cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4695cabdff1aSopenharmony_ci 4696cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4697cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4698cabdff1aSopenharmony_ci 4699cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4700cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4701cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4702cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4703cabdff1aSopenharmony_ci 4704cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 4705cabdff1aSopenharmony_ci LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 4706cabdff1aSopenharmony_ci src += 8; 4707cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 
4708cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4709cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4710cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4711cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4712cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4713cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4714cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4715cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4716cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4717cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4718cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4719cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4720cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4721cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4722cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4723cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4724cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4725cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4726cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4727cabdff1aSopenharmony_ci ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4728cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4729cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4730cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4731cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4732cabdff1aSopenharmony_ci 
dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4733cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4734cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4735cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4736cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4737cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4738cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 4739cabdff1aSopenharmony_ci MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 4740cabdff1aSopenharmony_ci MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l); 4741cabdff1aSopenharmony_ci MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l); 4742cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); 4743cabdff1aSopenharmony_ci SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec); 4744cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4745cabdff1aSopenharmony_ci dst3_r, tmp0, tmp1, tmp2, tmp3); 4746cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 4747cabdff1aSopenharmony_ci ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); 4748cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4749cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4750cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 4751cabdff1aSopenharmony_ci dst += 8; 4752cabdff1aSopenharmony_ci } 4753cabdff1aSopenharmony_ci} 4754cabdff1aSopenharmony_ci 4755cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src, 4756cabdff1aSopenharmony_ci int32_t src_stride, 4757cabdff1aSopenharmony_ci uint8_t *dst, 4758cabdff1aSopenharmony_ci int32_t dst_stride, 4759cabdff1aSopenharmony_ci const int8_t *filter_x, 4760cabdff1aSopenharmony_ci const int8_t *filter_y, 
4761cabdff1aSopenharmony_ci int32_t weight, 4762cabdff1aSopenharmony_ci int32_t offset, 4763cabdff1aSopenharmony_ci int32_t rnd_val) 4764cabdff1aSopenharmony_ci{ 4765cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 4766cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4767cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4768cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filter_vec; 4769cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4770cabdff1aSopenharmony_ci v16i8 mask1; 4771cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 4772cabdff1aSopenharmony_ci v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 4773cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 4774cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4775cabdff1aSopenharmony_ci v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec; 4776cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst10_l, dst32_l; 4777cabdff1aSopenharmony_ci v8i16 dst21_r, dst43_r, dst21_l, dst43_l; 4778cabdff1aSopenharmony_ci v8i16 dst54_r, dst54_l, dst65_r, dst65_l; 4779cabdff1aSopenharmony_ci v8i16 dst76_r, dst76_l, dst87_r, dst87_l; 4780cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4781cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 4782cabdff1aSopenharmony_ci 4783cabdff1aSopenharmony_ci src -= (src_stride + 1); 4784cabdff1aSopenharmony_ci 4785cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4786cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4787cabdff1aSopenharmony_ci 4788cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4789cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4790cabdff1aSopenharmony_ci 4791cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4792cabdff1aSopenharmony_ci 4793cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4794cabdff1aSopenharmony_ci 
4795cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4796cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4797cabdff1aSopenharmony_ci 4798cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4799cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4800cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4801cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4802cabdff1aSopenharmony_ci 4803cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 4804cabdff1aSopenharmony_ci src += (5 * src_stride); 4805cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src5, src6, src7, src8); 4806cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4807cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 4808cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4809cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4810cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4811cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 4812cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 4813cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11); 4814cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13); 4815cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15); 4816cabdff1aSopenharmony_ci VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17); 4817cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4818cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4819cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4820cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4821cabdff1aSopenharmony_ci dst4 = 
HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 4822cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 4823cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1); 4824cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1); 4825cabdff1aSopenharmony_ci dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1); 4826cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4827cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4828cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4829cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4830cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4831cabdff1aSopenharmony_ci ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4832cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 4833cabdff1aSopenharmony_ci ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 4834cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4835cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4836cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4837cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4838cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4839cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4840cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4841cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4842cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4843cabdff1aSopenharmony_ci dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); 4844cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4845cabdff1aSopenharmony_ci dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, 
filt_h1); 4846cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4847cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4848cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6); 4849cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 4850cabdff1aSopenharmony_ci MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 4851cabdff1aSopenharmony_ci MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r); 4852cabdff1aSopenharmony_ci MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l); 4853cabdff1aSopenharmony_ci MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l); 4854cabdff1aSopenharmony_ci MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l); 4855cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); 4856cabdff1aSopenharmony_ci SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec); 4857cabdff1aSopenharmony_ci SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec); 4858cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r, 4859cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4860cabdff1aSopenharmony_ci PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5); 4861cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 4862cabdff1aSopenharmony_ci ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); 4863cabdff1aSopenharmony_ci ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5); 4864cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4865cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp4, tmp5); 4866cabdff1aSopenharmony_ci PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2); 4867cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 4868cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 4869cabdff1aSopenharmony_ci} 4870cabdff1aSopenharmony_ci 4871cabdff1aSopenharmony_cistatic void 
hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src, 4872cabdff1aSopenharmony_ci int32_t src_stride, 4873cabdff1aSopenharmony_ci uint8_t *dst, 4874cabdff1aSopenharmony_ci int32_t dst_stride, 4875cabdff1aSopenharmony_ci const int8_t *filter_x, 4876cabdff1aSopenharmony_ci const int8_t *filter_y, 4877cabdff1aSopenharmony_ci int32_t height, 4878cabdff1aSopenharmony_ci int32_t weight, 4879cabdff1aSopenharmony_ci int32_t offset, 4880cabdff1aSopenharmony_ci int32_t rnd_val, 4881cabdff1aSopenharmony_ci int32_t width8mult) 4882cabdff1aSopenharmony_ci{ 4883cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 4884cabdff1aSopenharmony_ci uint8_t *src_tmp; 4885cabdff1aSopenharmony_ci uint8_t *dst_tmp; 4886cabdff1aSopenharmony_ci v16u8 out0, out1; 4887cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 4888cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4889cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filter_vec; 4890cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4891cabdff1aSopenharmony_ci v16i8 mask1; 4892cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4893cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3; 4894cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4895cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4896cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 4897cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 4898cabdff1aSopenharmony_ci v4i32 dst2_r, dst2_l, dst3_r, dst3_l; 4899cabdff1aSopenharmony_ci v4i32 weight_vec, rnd_vec; 4900cabdff1aSopenharmony_ci 4901cabdff1aSopenharmony_ci src -= (src_stride + 1); 4902cabdff1aSopenharmony_ci 4903cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4904cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4905cabdff1aSopenharmony_ci 4906cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4907cabdff1aSopenharmony_ci 
UNPCK_R_SB_SH(filter_vec, filter_vec); 4908cabdff1aSopenharmony_ci 4909cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4910cabdff1aSopenharmony_ci 4911cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4912cabdff1aSopenharmony_ci 4913cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 4914cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 4915cabdff1aSopenharmony_ci 4916cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 4917cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 4918cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 4919cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 4920cabdff1aSopenharmony_ci 4921cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 4922cabdff1aSopenharmony_ci src_tmp = src; 4923cabdff1aSopenharmony_ci dst_tmp = dst; 4924cabdff1aSopenharmony_ci 4925cabdff1aSopenharmony_ci LD_SB3(src_tmp, src_stride, src0, src1, src2); 4926cabdff1aSopenharmony_ci src_tmp += (3 * src_stride); 4927cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4928cabdff1aSopenharmony_ci 4929cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4930cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4931cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4932cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4933cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4934cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4935cabdff1aSopenharmony_ci 4936cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4937cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4938cabdff1aSopenharmony_ci 4939cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 4940cabdff1aSopenharmony_ci LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); 4941cabdff1aSopenharmony_ci src_tmp += 
(4 * src_stride); 4942cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 4943cabdff1aSopenharmony_ci 4944cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4945cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4946cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4947cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4948cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4949cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4950cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4951cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4952cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4953cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4954cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4955cabdff1aSopenharmony_ci ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4956cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4957cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4958cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4959cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4960cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4961cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4962cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4963cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4964cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4965cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4966cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, 
weight_vec, dst0_r, dst1_r); 4967cabdff1aSopenharmony_ci MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 4968cabdff1aSopenharmony_ci MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l); 4969cabdff1aSopenharmony_ci MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l); 4970cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); 4971cabdff1aSopenharmony_ci SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec); 4972cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4973cabdff1aSopenharmony_ci dst3_r, tmp0, tmp1, tmp2, tmp3); 4974cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 4975cabdff1aSopenharmony_ci ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); 4976cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4977cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4978cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 4979cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 4980cabdff1aSopenharmony_ci 4981cabdff1aSopenharmony_ci dst10_r = dst54_r; 4982cabdff1aSopenharmony_ci dst10_l = dst54_l; 4983cabdff1aSopenharmony_ci dst21_r = dst65_r; 4984cabdff1aSopenharmony_ci dst21_l = dst65_l; 4985cabdff1aSopenharmony_ci dst2 = dst6; 4986cabdff1aSopenharmony_ci } 4987cabdff1aSopenharmony_ci 4988cabdff1aSopenharmony_ci src += 8; 4989cabdff1aSopenharmony_ci dst += 8; 4990cabdff1aSopenharmony_ci } 4991cabdff1aSopenharmony_ci} 4992cabdff1aSopenharmony_ci 4993cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src, 4994cabdff1aSopenharmony_ci int32_t src_stride, 4995cabdff1aSopenharmony_ci uint8_t *dst, 4996cabdff1aSopenharmony_ci int32_t dst_stride, 4997cabdff1aSopenharmony_ci const int8_t *filter_x, 4998cabdff1aSopenharmony_ci const int8_t *filter_y, 4999cabdff1aSopenharmony_ci int32_t height, 5000cabdff1aSopenharmony_ci int32_t weight, 5001cabdff1aSopenharmony_ci int32_t offset, 
5002cabdff1aSopenharmony_ci int32_t rnd_val) 5003cabdff1aSopenharmony_ci{ 5004cabdff1aSopenharmony_ci 5005cabdff1aSopenharmony_ci if (2 == height) { 5006cabdff1aSopenharmony_ci hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, 5007cabdff1aSopenharmony_ci filter_x, filter_y, weight, 5008cabdff1aSopenharmony_ci offset, rnd_val); 5009cabdff1aSopenharmony_ci } else if (4 == height) { 5010cabdff1aSopenharmony_ci hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride, 5011cabdff1aSopenharmony_ci filter_x, filter_y, 1, weight, 5012cabdff1aSopenharmony_ci offset, rnd_val); 5013cabdff1aSopenharmony_ci } else if (6 == height) { 5014cabdff1aSopenharmony_ci hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, 5015cabdff1aSopenharmony_ci filter_x, filter_y, weight, 5016cabdff1aSopenharmony_ci offset, rnd_val); 5017cabdff1aSopenharmony_ci } else if (0 == (height % 4)) { 5018cabdff1aSopenharmony_ci hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 5019cabdff1aSopenharmony_ci filter_x, filter_y, height, weight, 5020cabdff1aSopenharmony_ci offset, rnd_val, 1); 5021cabdff1aSopenharmony_ci } 5022cabdff1aSopenharmony_ci} 5023cabdff1aSopenharmony_ci 5024cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, 5025cabdff1aSopenharmony_ci int32_t src_stride, 5026cabdff1aSopenharmony_ci uint8_t *dst, 5027cabdff1aSopenharmony_ci int32_t dst_stride, 5028cabdff1aSopenharmony_ci const int8_t *filter_x, 5029cabdff1aSopenharmony_ci const int8_t *filter_y, 5030cabdff1aSopenharmony_ci int32_t height, 5031cabdff1aSopenharmony_ci int32_t weight, 5032cabdff1aSopenharmony_ci int32_t offset, 5033cabdff1aSopenharmony_ci int32_t rnd_val) 5034cabdff1aSopenharmony_ci{ 5035cabdff1aSopenharmony_ci uint32_t loop_cnt; 5036cabdff1aSopenharmony_ci uint8_t *src_tmp, *dst_tmp; 5037cabdff1aSopenharmony_ci v16u8 out0, out1; 5038cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 5039cabdff1aSopenharmony_ci 
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 5040cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 5041cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3; 5042cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6; 5043cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 5044cabdff1aSopenharmony_ci v8i16 dst76_r, dst98_r, dst87_r, dst109_r; 5045cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 5046cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 5047cabdff1aSopenharmony_ci v8i16 offset_vec, const_128, denom_vec; 5048cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 5049cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec; 5050cabdff1aSopenharmony_ci 5051cabdff1aSopenharmony_ci src -= (src_stride + 1); 5052cabdff1aSopenharmony_ci 5053cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 5054cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 5055cabdff1aSopenharmony_ci 5056cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 5057cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 5058cabdff1aSopenharmony_ci 5059cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 5060cabdff1aSopenharmony_ci 5061cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 5062cabdff1aSopenharmony_ci mask1 = mask0 + 2; 5063cabdff1aSopenharmony_ci 5064cabdff1aSopenharmony_ci weight_vec = __msa_fill_w(weight); 5065cabdff1aSopenharmony_ci rnd_vec = __msa_fill_w(rnd_val); 5066cabdff1aSopenharmony_ci 5067cabdff1aSopenharmony_ci offset_vec = __msa_fill_h(offset); 5068cabdff1aSopenharmony_ci denom_vec = __msa_fill_h(rnd_val - 6); 5069cabdff1aSopenharmony_ci const_128 = __msa_fill_h((128 * weight)); 5070cabdff1aSopenharmony_ci offset_vec += __msa_srar_h(const_128, denom_vec); 
5071cabdff1aSopenharmony_ci 5072cabdff1aSopenharmony_ci src_tmp = src; 5073cabdff1aSopenharmony_ci dst_tmp = dst; 5074cabdff1aSopenharmony_ci 5075cabdff1aSopenharmony_ci LD_SB3(src_tmp, src_stride, src0, src1, src2); 5076cabdff1aSopenharmony_ci src_tmp += (3 * src_stride); 5077cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 5078cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 5079cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 5080cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 5081cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5082cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5083cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5084cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 5085cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 5086cabdff1aSopenharmony_ci 5087cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 5088cabdff1aSopenharmony_ci LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); 5089cabdff1aSopenharmony_ci src_tmp += (4 * src_stride); 5090cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 5091cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 5092cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 5093cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 5094cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 5095cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5096cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5097cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5098cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 
5099cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 5100cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 5101cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 5102cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 5103cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 5104cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 5105cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 5106cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 5107cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 5108cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 5109cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 5110cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 5111cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 5112cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 5113cabdff1aSopenharmony_ci MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r); 5114cabdff1aSopenharmony_ci MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r); 5115cabdff1aSopenharmony_ci MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l); 5116cabdff1aSopenharmony_ci MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l); 5117cabdff1aSopenharmony_ci SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); 5118cabdff1aSopenharmony_ci SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec); 5119cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 5120cabdff1aSopenharmony_ci dst3_r, tmp0, tmp1, tmp2, tmp3); 5121cabdff1aSopenharmony_ci ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); 5122cabdff1aSopenharmony_ci ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); 
5123cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 5124cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 5125cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 5126cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 5127cabdff1aSopenharmony_ci 5128cabdff1aSopenharmony_ci dst10_r = dst54_r; 5129cabdff1aSopenharmony_ci dst10_l = dst54_l; 5130cabdff1aSopenharmony_ci dst21_r = dst65_r; 5131cabdff1aSopenharmony_ci dst21_l = dst65_l; 5132cabdff1aSopenharmony_ci dsth2 = dsth6; 5133cabdff1aSopenharmony_ci } 5134cabdff1aSopenharmony_ci 5135cabdff1aSopenharmony_ci src += 8; 5136cabdff1aSopenharmony_ci dst += 8; 5137cabdff1aSopenharmony_ci 5138cabdff1aSopenharmony_ci mask2 = LD_SB(ff_hevc_mask_arr + 16); 5139cabdff1aSopenharmony_ci mask3 = mask2 + 2; 5140cabdff1aSopenharmony_ci 5141cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 5142cabdff1aSopenharmony_ci src += (3 * src_stride); 5143cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 5144cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 5145cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3); 5146cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5147cabdff1aSopenharmony_ci dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5148cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 5149cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 5150cabdff1aSopenharmony_ci 5151cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 5152cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, 5153cabdff1aSopenharmony_ci src10); 5154cabdff1aSopenharmony_ci src += (8 * src_stride); 5155cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 5156cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1); 
5157cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3); 5158cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5); 5159cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7); 5160cabdff1aSopenharmony_ci dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 5161cabdff1aSopenharmony_ci dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 5162cabdff1aSopenharmony_ci dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 5163cabdff1aSopenharmony_ci dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 5164cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 5165cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 5166cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 5167cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 5168cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 5169cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 5170cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 5171cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 5172cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 5173cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 5174cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 5175cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 5176cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 5177cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 5178cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 5179cabdff1aSopenharmony_ci SRA_4V(dst4, dst5, dst6, dst7, 6); 5180cabdff1aSopenharmony_ci MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1); 5181cabdff1aSopenharmony_ci MUL2(dst2, weight_vec, dst3, 
weight_vec, dst2, dst3);
5182cabdff1aSopenharmony_ci        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
5183cabdff1aSopenharmony_ci        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
5184cabdff1aSopenharmony_ci        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5185cabdff1aSopenharmony_ci        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5186cabdff1aSopenharmony_ci        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5187cabdff1aSopenharmony_ci                    tmp2, tmp3);
5188cabdff1aSopenharmony_ci        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
5189cabdff1aSopenharmony_ci        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
5190cabdff1aSopenharmony_ci        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5191cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5192cabdff1aSopenharmony_ci        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5193cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
5194cabdff1aSopenharmony_ci
5195cabdff1aSopenharmony_ci        dst10_r = dst98_r;
5196cabdff1aSopenharmony_ci        dst21_r = dst109_r;
5197cabdff1aSopenharmony_ci        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5198cabdff1aSopenharmony_ci    }
5199cabdff1aSopenharmony_ci}
5200cabdff1aSopenharmony_ci
/*
 * Entry point for the "16w" (by its name, 16-pixel-wide) variant of the
 * horizontal+vertical 4-tap uni-weighted filter.
 *
 * The function does no filtering itself. It forwards src, src_stride, dst,
 * dst_stride, filter_x, filter_y, weight, offset and rnd_val unchanged to one
 * of two helpers:
 *   - height == 4: hevc_hv_uniwgt_4t_8multx4_msa(), which is given the
 *     constant 2 in the argument slot between filter_y and weight (height is
 *     not passed on in this branch);
 *   - any other height: hevc_hv_uniwgt_4t_8multx4mult_msa(), which is given
 *     height plus a trailing constant 2.
 *
 * NOTE(review): the constant 2 is presumably the number of 8-pixel-wide
 * columns to process (16 / 8), as the "8mult" helper names suggest. Confirm
 * against the helper signatures, which are defined outside this part of the
 * file.
 */
5201cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
5202cabdff1aSopenharmony_ci                                      int32_t src_stride,
5203cabdff1aSopenharmony_ci                                      uint8_t *dst,
5204cabdff1aSopenharmony_ci                                      int32_t dst_stride,
5205cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
5206cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
5207cabdff1aSopenharmony_ci                                      int32_t height,
5208cabdff1aSopenharmony_ci                                      int32_t weight,
5209cabdff1aSopenharmony_ci                                      int32_t offset,
5210cabdff1aSopenharmony_ci                                      int32_t rnd_val)
5211cabdff1aSopenharmony_ci{
5212cabdff1aSopenharmony_ci    if (4 == height) {
5213cabdff1aSopenharmony_ci        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
5214cabdff1aSopenharmony_ci                                      filter_x, filter_y, 2, weight, offset,
5215cabdff1aSopenharmony_ci                                      rnd_val);
5216cabdff1aSopenharmony_ci    } else {
5217cabdff1aSopenharmony_ci        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5218cabdff1aSopenharmony_ci                                          filter_x, filter_y, height, weight,
5219cabdff1aSopenharmony_ci                                          offset, rnd_val, 2);
5220cabdff1aSopenharmony_ci    }
5221cabdff1aSopenharmony_ci}
5222cabdff1aSopenharmony_ci
/*
 * Entry point for the "24w" variant of the horizontal+vertical 4-tap
 * uni-weighted filter.
 *
 * Unconditionally forwards all of its arguments to
 * hevc_hv_uniwgt_4t_8multx4mult_msa() and appends the constant 3 as the last
 * argument.
 *
 * NOTE(review): 3 is presumably the count of 8-pixel-wide columns (24 / 8).
 * Confirm against the helper's signature.
 */
5223cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
5224cabdff1aSopenharmony_ci                                      int32_t src_stride,
5225cabdff1aSopenharmony_ci                                      uint8_t *dst,
5226cabdff1aSopenharmony_ci                                      int32_t dst_stride,
5227cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
5228cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
5229cabdff1aSopenharmony_ci                                      int32_t height,
5230cabdff1aSopenharmony_ci                                      int32_t weight,
5231cabdff1aSopenharmony_ci                                      int32_t offset,
5232cabdff1aSopenharmony_ci                                      int32_t rnd_val)
5233cabdff1aSopenharmony_ci{
5234cabdff1aSopenharmony_ci    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5235cabdff1aSopenharmony_ci                                      filter_x, filter_y, height, weight,
5236cabdff1aSopenharmony_ci                                      offset, rnd_val, 3);
5237cabdff1aSopenharmony_ci}
5238cabdff1aSopenharmony_ci
/*
 * Entry point for the "32w" variant of the horizontal+vertical 4-tap
 * uni-weighted filter.
 *
 * Unconditionally forwards all of its arguments to
 * hevc_hv_uniwgt_4t_8multx4mult_msa() and appends the constant 4 as the last
 * argument.
 *
 * NOTE(review): 4 is presumably the count of 8-pixel-wide columns (32 / 8).
 * Confirm against the helper's signature.
 */
5239cabdff1aSopenharmony_cistatic void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
5240cabdff1aSopenharmony_ci                                      int32_t src_stride,
5241cabdff1aSopenharmony_ci                                      uint8_t *dst,
5242cabdff1aSopenharmony_ci                                      int32_t dst_stride,
5243cabdff1aSopenharmony_ci                                      const int8_t *filter_x,
5244cabdff1aSopenharmony_ci                                      const int8_t *filter_y,
5245cabdff1aSopenharmony_ci                                      int32_t height,
5246cabdff1aSopenharmony_ci                                      int32_t weight,
5247cabdff1aSopenharmony_ci                                      int32_t offset,
5248cabdff1aSopenharmony_ci                                      int32_t rnd_val)
5249cabdff1aSopenharmony_ci{
5250cabdff1aSopenharmony_ci    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5251cabdff1aSopenharmony_ci                                      filter_x, filter_y, height, weight,
5252cabdff1aSopenharmony_ci                                      offset, rnd_val, 4);
5253cabdff1aSopenharmony_ci}
5254cabdff1aSopenharmony_ci
/*
 * UNIWGT_MC_COPY(WIDTH) defines the non-static function
 * ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa().
 *
 * The generated function:
 *   - receives dst/dst_stride before src/src_stride, but passes src and
 *     src_stride first when calling the static helper
 *     hevc_uniwgt_copy_##WIDTH##w_msa();
 *   - computes shift = denom + 14 - 8 and passes it as the helper's last
 *     argument (the parameter named rnd_val in hevc_uniwgt_copy_4w_msa());
 *   - passes height, weight and offset through unchanged;
 *   - does not use mx, my or width.
 *
 * NOTE(review): "14 - 8" presumably stands for the 14-bit intermediate
 * precision minus the 8-bit sample depth indicated by "_8_" in the function
 * name. Confirm against the HEVC weighted-prediction definition.
 */
5255cabdff1aSopenharmony_ci#define UNIWGT_MC_COPY(WIDTH)                                                \
5256cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
5257cabdff1aSopenharmony_ci                                                      ptrdiff_t dst_stride,  \
5258cabdff1aSopenharmony_ci                                                      uint8_t *src,          \
5259cabdff1aSopenharmony_ci                                                      ptrdiff_t src_stride,  \
5260cabdff1aSopenharmony_ci                                                      int height,            \
5261cabdff1aSopenharmony_ci                                                      int denom,             \
5262cabdff1aSopenharmony_ci                                                      int weight,            \
5263cabdff1aSopenharmony_ci                                                      int offset,            \
5264cabdff1aSopenharmony_ci                                                      intptr_t mx,           \
5265cabdff1aSopenharmony_ci                                                      intptr_t my,           \
5266cabdff1aSopenharmony_ci                                                      int width)             \
5267cabdff1aSopenharmony_ci{                                                                            \
5268cabdff1aSopenharmony_ci    int shift = denom + 14 - 8;                                              \
5269cabdff1aSopenharmony_ci    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
5270cabdff1aSopenharmony_ci                                    height, weight, offset, shift);          \
5271cabdff1aSopenharmony_ci}
5272cabdff1aSopenharmony_ci
/*
 * Instantiate the copy wrapper for widths 4, 6, 8, 12, 16, 24, 32, 48 and 64.
 * Each invocation expands to a complete function definition, so the ';' that
 * follows it is an empty declaration at file scope.
 */
5273cabdff1aSopenharmony_ciUNIWGT_MC_COPY(4);
5274cabdff1aSopenharmony_ciUNIWGT_MC_COPY(6);
5275cabdff1aSopenharmony_ciUNIWGT_MC_COPY(8);
5276cabdff1aSopenharmony_ciUNIWGT_MC_COPY(12);
5277cabdff1aSopenharmony_ciUNIWGT_MC_COPY(16);
5278cabdff1aSopenharmony_ciUNIWGT_MC_COPY(24);
5279cabdff1aSopenharmony_ciUNIWGT_MC_COPY(32);
5280cabdff1aSopenharmony_ciUNIWGT_MC_COPY(48);
5281cabdff1aSopenharmony_ciUNIWGT_MC_COPY(64);
5282cabdff1aSopenharmony_ci
5283cabdff1aSopenharmony_ci#undef UNIWGT_MC_COPY
5284cabdff1aSopenharmony_ci
/*
 * UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) defines the non-static
 * function ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(), which
 * filters in a single direction.
 *
 *   PEL      - selects the coefficient table ff_hevc_##PEL##_filters and is
 *              part of the public name (qpel or epel below)
 *   DIR      - direction letter in the public name (h or v below)
 *   WIDTH    - width used in both the public name and the helper name
 *   TAP      - tap count used in the helper name (8 or 4 below)
 *   DIR1     - direction tag used in the helper name (hz or vt below)
 *   FILT_DIR - the parameter that indexes the coefficient table
 *              (mx or my below)
 *
 * The generated function selects filter = ff_hevc_##PEL##_filters[FILT_DIR - 1],
 * computes shift = denom + 14 - 8, and calls
 * hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,
 * dst_stride, filter, height, weight, offset, shift). The width parameter
 * and whichever of mx/my is not FILT_DIR are unused.
 *
 * NOTE(review): because of the "- 1", a FILT_DIR value of 0 would read the
 * table at index -1. Presumably callers only dispatch here for a non-zero
 * fractional position; confirm at the call sites.
 */
5285cabdff1aSopenharmony_ci#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
5286cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \
5287cabdff1aSopenharmony_ci                                                         ptrdiff_t            \
5288cabdff1aSopenharmony_ci                                                         dst_stride,          \
5289cabdff1aSopenharmony_ci                                                         uint8_t *src,        \
5290cabdff1aSopenharmony_ci                                                         ptrdiff_t            \
5291cabdff1aSopenharmony_ci                                                         src_stride,          \
5292cabdff1aSopenharmony_ci                                                         int height,          \
5293cabdff1aSopenharmony_ci                                                         int denom,           \
5294cabdff1aSopenharmony_ci                                                         int weight,          \
5295cabdff1aSopenharmony_ci                                                         int offset,          \
5296cabdff1aSopenharmony_ci                                                         intptr_t mx,         \
5297cabdff1aSopenharmony_ci                                                         intptr_t my,         \
5298cabdff1aSopenharmony_ci                                                         int width)           \
5299cabdff1aSopenharmony_ci{                                                                             \
5300cabdff1aSopenharmony_ci    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
5301cabdff1aSopenharmony_ci    int shift = denom + 14 - 8;                                               \
5302cabdff1aSopenharmony_ci                                                                              \
5303cabdff1aSopenharmony_ci    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,        \
5304cabdff1aSopenharmony_ci                                                 dst_stride, filter, height,  \
5305cabdff1aSopenharmony_ci                                                 weight, offset, shift);      \
5306cabdff1aSopenharmony_ci}
5307cabdff1aSopenharmony_ci
/* qpel, 8-tap, horizontal ("hz" helpers); coefficients indexed by mx. */
5308cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 4, 8, hz, mx);
5309cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 8, 8, hz, mx);
5310cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 12, 8, hz, mx);
5311cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 16, 8, hz, mx);
5312cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 24, 8, hz, mx);
5313cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 32, 8, hz, mx);
5314cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 48, 8, hz, mx);
5315cabdff1aSopenharmony_ciUNI_W_MC(qpel, h, 64, 8, hz, mx);
5316cabdff1aSopenharmony_ci
/* qpel, 8-tap, vertical ("vt" helpers); coefficients indexed by my. */
5317cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 4, 8, vt, my);
5318cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 8, 8, vt, my);
5319cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 12, 8, vt, my);
5320cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 16, 8, vt, my);
5321cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 24, 8, vt, my);
5322cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 32, 8, vt, my);
5323cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 48, 8, vt, my);
5324cabdff1aSopenharmony_ciUNI_W_MC(qpel, v, 64, 8, vt, my);
5325cabdff1aSopenharmony_ci
/* epel, 4-tap, horizontal ("hz" helpers); coefficients indexed by mx. */
5326cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 4, 4, hz, mx);
5327cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 6, 4, hz, mx);
5328cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 8, 4, hz, mx);
5329cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 12, 4, hz, mx);
5330cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 16, 4, hz, mx);
5331cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 24, 4, hz, mx);
5332cabdff1aSopenharmony_ciUNI_W_MC(epel, h, 32, 4, hz, mx);
5333cabdff1aSopenharmony_ci
/* epel, 4-tap, vertical ("vt" helpers); coefficients indexed by my. */
5334cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 4, 4, vt, my);
5335cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 6, 4, vt, my);
5336cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 8, 4, vt, my);
5337cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 12, 4, vt, my);
5338cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 16, 4, vt, my);
5339cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 24, 4, vt, my);
5340cabdff1aSopenharmony_ciUNI_W_MC(epel, v, 32, 4, vt, my);
5341cabdff1aSopenharmony_ci
5342cabdff1aSopenharmony_ci#undef UNI_W_MC
5343cabdff1aSopenharmony_ci
/*
 * UNI_W_MC_HV(PEL, WIDTH, TAP) defines the non-static function
 * ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(), the combined
 * horizontal+vertical ("hv") counterpart of the single-direction wrappers.
 *
 * The generated function selects
 *   filter_x = ff_hevc_##PEL##_filters[mx - 1] and
 *   filter_y = ff_hevc_##PEL##_filters[my - 1],
 * computes shift = denom + 14 - 8, and calls
 * hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,
 * filter_x, filter_y, height, weight, offset, shift). The width parameter is
 * unused.
 *
 * NOTE(review): both table lookups subtract 1, so mx == 0 or my == 0 would
 * read index -1. Presumably callers use the hv variants only when both
 * fractional positions are non-zero; confirm at the call sites.
 */
5344cabdff1aSopenharmony_ci#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                         \
5345cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
5346cabdff1aSopenharmony_ci                                                      ptrdiff_t dst_stride,  \
5347cabdff1aSopenharmony_ci                                                      uint8_t *src,          \
5348cabdff1aSopenharmony_ci                                                      ptrdiff_t src_stride,  \
5349cabdff1aSopenharmony_ci                                                      int height,            \
5350cabdff1aSopenharmony_ci                                                      int denom,             \
5351cabdff1aSopenharmony_ci                                                      int weight,            \
5352cabdff1aSopenharmony_ci                                                      int offset,            \
5353cabdff1aSopenharmony_ci                                                      intptr_t mx,           \
5354cabdff1aSopenharmony_ci                                                      intptr_t my,           \
5355cabdff1aSopenharmony_ci                                                      int width)             \
5356cabdff1aSopenharmony_ci{                                                                            \
5357cabdff1aSopenharmony_ci    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                \
5358cabdff1aSopenharmony_ci    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                \
5359cabdff1aSopenharmony_ci    int shift = denom + 14 - 8;                                              \
5360cabdff1aSopenharmony_ci                                                                             \
5361cabdff1aSopenharmony_ci    hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
5362cabdff1aSopenharmony_ci                                           filter_x, filter_y, height,       \
5363cabdff1aSopenharmony_ci                                           weight, offset, shift);           \
5364cabdff1aSopenharmony_ci}
5365cabdff1aSopenharmony_ci
/* qpel, 8-tap, hv wrappers for widths 4 through 64. */
5366cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 4, 8);
5367cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 8, 8);
5368cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 12, 8);
5369cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 16, 8);
5370cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 24, 8);
5371cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 32, 8);
5372cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 48, 8);
5373cabdff1aSopenharmony_ciUNI_W_MC_HV(qpel, 64, 8);
5374cabdff1aSopenharmony_ci
/* epel, 4-tap, hv wrappers for widths 4 through 32. */
5375cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 4, 4);
5376cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 6, 4);
5377cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 8, 4);
5378cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 12, 4);
5379cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 16, 4);
5380cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 24, 4);
5381cabdff1aSopenharmony_ciUNI_W_MC_HV(epel, 32, 4);
5382cabdff1aSopenharmony_ci
5383cabdff1aSopenharmony_ci#undef UNI_W_MC_HV
5384