1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "libavcodec/mips/hevcdsp_mips.h" 23cabdff1aSopenharmony_ci#include "libavcodec/mips/hevc_macros_msa.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_cistatic const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { 26cabdff1aSopenharmony_ci /* 8 width cases */ 27cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 29cabdff1aSopenharmony_ci}; 30cabdff1aSopenharmony_ci 31cabdff1aSopenharmony_ci#define HEVC_BI_RND_CLIP2(in0, in1, vec0, 
vec1, rnd_val, out0, out1) \ 32cabdff1aSopenharmony_ci{ \ 33cabdff1aSopenharmony_ci ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ 34cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, rnd_val); \ 35cabdff1aSopenharmony_ci CLIP_SH2_0_255(out0, out1); \ 36cabdff1aSopenharmony_ci} 37cabdff1aSopenharmony_ci 38cabdff1aSopenharmony_ci#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \ 39cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3, rnd_val, \ 40cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 41cabdff1aSopenharmony_ci{ \ 42cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \ 43cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \ 44cabdff1aSopenharmony_ci} 45cabdff1aSopenharmony_ci 46cabdff1aSopenharmony_ci#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, \ 47cabdff1aSopenharmony_ci out0, out1) \ 48cabdff1aSopenharmony_ci{ \ 49cabdff1aSopenharmony_ci ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ 50cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, rnd_val); \ 51cabdff1aSopenharmony_ci CLIP_SH2_0_255(out0, out1); \ 52cabdff1aSopenharmony_ci} 53cabdff1aSopenharmony_ci 54cabdff1aSopenharmony_ci#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ 55cabdff1aSopenharmony_ci vec3, rnd_val, out0, out1, out2, out3) \ 56cabdff1aSopenharmony_ci{ \ 57cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \ 58cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \ 59cabdff1aSopenharmony_ci} 60cabdff1aSopenharmony_ci 61cabdff1aSopenharmony_cistatic void hevc_bi_copy_4w_msa(uint8_t *src0_ptr, 62cabdff1aSopenharmony_ci int32_t src_stride, 63cabdff1aSopenharmony_ci int16_t *src1_ptr, 64cabdff1aSopenharmony_ci int32_t src2_stride, 65cabdff1aSopenharmony_ci uint8_t *dst, 66cabdff1aSopenharmony_ci int32_t dst_stride, 67cabdff1aSopenharmony_ci int32_t height) 68cabdff1aSopenharmony_ci{ 
69cabdff1aSopenharmony_ci uint32_t loop_cnt, tp0, tp1, tp2, tp3; 70cabdff1aSopenharmony_ci uint64_t tpd0, tpd1, tpd2, tpd3; 71cabdff1aSopenharmony_ci v16i8 src0 = { 0 }, src1 = { 0 }; 72cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 73cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 74cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 75cabdff1aSopenharmony_ci 76cabdff1aSopenharmony_ci if (2 == height) { 77cabdff1aSopenharmony_ci LW2(src0_ptr, src_stride, tp0, tp1); 78cabdff1aSopenharmony_ci INSERT_W2_SB(tp0, tp1, src0); 79cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tpd0, tpd1); 80cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in0); 81cabdff1aSopenharmony_ci 82cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_ilvr_b(zero, src0); 83cabdff1aSopenharmony_ci dst0 <<= 6; 84cabdff1aSopenharmony_ci dst0 += in0; 85cabdff1aSopenharmony_ci dst0 = __msa_srari_h(dst0, 7); 86cabdff1aSopenharmony_ci CLIP_SH_0_255(dst0); 87cabdff1aSopenharmony_ci 88cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); 89cabdff1aSopenharmony_ci ST_W2(dst0, 0, 1, dst, dst_stride); 90cabdff1aSopenharmony_ci } else if (4 == height) { 91cabdff1aSopenharmony_ci LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 92cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); 93cabdff1aSopenharmony_ci LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); 94cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in0); 95cabdff1aSopenharmony_ci INSERT_D2_SH(tpd2, tpd3, in1); 96cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 97cabdff1aSopenharmony_ci SLLI_2V(dst0, dst1, 6); 98cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1); 99cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 100cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); 101cabdff1aSopenharmony_ci } else if (0 == height % 8) { 102cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 
103cabdff1aSopenharmony_ci LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 104cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 105cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); 106cabdff1aSopenharmony_ci LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 107cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 108cabdff1aSopenharmony_ci INSERT_W4_SB(tp0, tp1, tp2, tp3, src1); 109cabdff1aSopenharmony_ci LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); 110cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 111cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in0); 112cabdff1aSopenharmony_ci INSERT_D2_SH(tpd2, tpd3, in1); 113cabdff1aSopenharmony_ci LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3); 114cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 115cabdff1aSopenharmony_ci INSERT_D2_SH(tpd0, tpd1, in2); 116cabdff1aSopenharmony_ci INSERT_D2_SH(tpd2, tpd3, in3); 117cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 118cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 119cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 120cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, 121cabdff1aSopenharmony_ci dst3, 7, dst0, dst1, dst2, dst3); 122cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 123cabdff1aSopenharmony_ci ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 124cabdff1aSopenharmony_ci dst += (8 * dst_stride); 125cabdff1aSopenharmony_ci } 126cabdff1aSopenharmony_ci } 127cabdff1aSopenharmony_ci} 128cabdff1aSopenharmony_ci 129cabdff1aSopenharmony_cistatic void hevc_bi_copy_6w_msa(uint8_t *src0_ptr, 130cabdff1aSopenharmony_ci int32_t src_stride, 131cabdff1aSopenharmony_ci int16_t *src1_ptr, 132cabdff1aSopenharmony_ci int32_t src2_stride, 133cabdff1aSopenharmony_ci uint8_t *dst, 134cabdff1aSopenharmony_ci int32_t dst_stride, 135cabdff1aSopenharmony_ci int32_t height) 136cabdff1aSopenharmony_ci{ 137cabdff1aSopenharmony_ci uint32_t loop_cnt; 
138cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 139cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 140cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 141cabdff1aSopenharmony_ci v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 142cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 143cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 144cabdff1aSopenharmony_ci 145cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 146cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 147cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 148cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 149cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 150cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 151cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 152cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src2); 153cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src3); 154cabdff1aSopenharmony_ci LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); 155cabdff1aSopenharmony_ci src1_ptr += (8 * src2_stride); 156cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 157cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 158cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 159cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 160cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 161cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 162cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 163cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 164cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7, 165cabdff1aSopenharmony_ci 7, dst4, dst5, dst6, dst7); 166cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 167cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 
168cabdff1aSopenharmony_ci ST_W2(out0, 0, 2, dst, dst_stride); 169cabdff1aSopenharmony_ci ST_H2(out0, 2, 6, dst + 4, dst_stride); 170cabdff1aSopenharmony_ci ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); 171cabdff1aSopenharmony_ci ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 172cabdff1aSopenharmony_ci dst += (4 * dst_stride); 173cabdff1aSopenharmony_ci ST_W2(out2, 0, 2, dst, dst_stride); 174cabdff1aSopenharmony_ci ST_H2(out2, 2, 6, dst + 4, dst_stride); 175cabdff1aSopenharmony_ci ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride); 176cabdff1aSopenharmony_ci ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 177cabdff1aSopenharmony_ci dst += (4 * dst_stride); 178cabdff1aSopenharmony_ci } 179cabdff1aSopenharmony_ci} 180cabdff1aSopenharmony_ci 181cabdff1aSopenharmony_cistatic void hevc_bi_copy_8w_msa(uint8_t *src0_ptr, 182cabdff1aSopenharmony_ci int32_t src_stride, 183cabdff1aSopenharmony_ci int16_t *src1_ptr, 184cabdff1aSopenharmony_ci int32_t src2_stride, 185cabdff1aSopenharmony_ci uint8_t *dst, 186cabdff1aSopenharmony_ci int32_t dst_stride, 187cabdff1aSopenharmony_ci int32_t height) 188cabdff1aSopenharmony_ci{ 189cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 190cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 191cabdff1aSopenharmony_ci v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 192cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 193cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 194cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 195cabdff1aSopenharmony_ci 196cabdff1aSopenharmony_ci if (2 == height) { 197cabdff1aSopenharmony_ci LD2(src0_ptr, src_stride, tp0, tp1); 198cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 199cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 200cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 201cabdff1aSopenharmony_ci SLLI_2V(dst0, dst1, 6); 202cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, 
dst0, dst1, 7, dst0, dst1); 203cabdff1aSopenharmony_ci out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 204cabdff1aSopenharmony_ci ST_D2(out0, 0, 1, dst, dst_stride); 205cabdff1aSopenharmony_ci } else if (4 == height) { 206cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 207cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 208cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 209cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 210cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 211cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 212cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 213cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 214cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 215cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 216cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 217cabdff1aSopenharmony_ci } else if (6 == height) { 218cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 219cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 220cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 221cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 222cabdff1aSopenharmony_ci LD2(src0_ptr, src_stride, tp0, tp1); 223cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src2); 224cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 225cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 226cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 227cabdff1aSopenharmony_ci LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 228cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 229cabdff1aSopenharmony_ci SLLI_2V(dst4, dst5, 6); 230cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 231cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 232cabdff1aSopenharmony_ci 
HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5); 233cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 234cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 235cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 236cabdff1aSopenharmony_ci } else if (0 == height % 8) { 237cabdff1aSopenharmony_ci uint32_t loop_cnt; 238cabdff1aSopenharmony_ci 239cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 240cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 241cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 242cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src0); 243cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src1); 244cabdff1aSopenharmony_ci LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); 245cabdff1aSopenharmony_ci src0_ptr += 4 * src_stride; 246cabdff1aSopenharmony_ci INSERT_D2_SB(tp0, tp1, src2); 247cabdff1aSopenharmony_ci INSERT_D2_SB(tp2, tp3, src3); 248cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 249cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 250cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 251cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 252cabdff1aSopenharmony_ci LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, 253cabdff1aSopenharmony_ci in7); 254cabdff1aSopenharmony_ci src1_ptr += (8 * src2_stride); 255cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 256cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 257cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, 258cabdff1aSopenharmony_ci dst3, 7, dst0, dst1, dst2, dst3); 259cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, 260cabdff1aSopenharmony_ci dst7, 7, dst4, dst5, dst6, dst7); 261cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 262cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, 
dst4, dst7, dst6, out2, out3); 263cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 264cabdff1aSopenharmony_ci dst += (8 * dst_stride); 265cabdff1aSopenharmony_ci } 266cabdff1aSopenharmony_ci } 267cabdff1aSopenharmony_ci} 268cabdff1aSopenharmony_ci 269cabdff1aSopenharmony_cistatic void hevc_bi_copy_12w_msa(uint8_t *src0_ptr, 270cabdff1aSopenharmony_ci int32_t src_stride, 271cabdff1aSopenharmony_ci int16_t *src1_ptr, 272cabdff1aSopenharmony_ci int32_t src2_stride, 273cabdff1aSopenharmony_ci uint8_t *dst, 274cabdff1aSopenharmony_ci int32_t dst_stride, 275cabdff1aSopenharmony_ci int32_t height) 276cabdff1aSopenharmony_ci{ 277cabdff1aSopenharmony_ci uint32_t loop_cnt; 278cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 279cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 280cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 281cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 282cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 283cabdff1aSopenharmony_ci 284cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 285cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 286cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 287cabdff1aSopenharmony_ci 288cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 289cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 290cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 291cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 292cabdff1aSopenharmony_ci ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1, 293cabdff1aSopenharmony_ci dst2, dst3); 294cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 295cabdff1aSopenharmony_ci ILVL_W2_SB(src1, src0, src3, src2, src0, src1); 296cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5); 297cabdff1aSopenharmony_ci SLLI_2V(dst4, dst5, 6); 298cabdff1aSopenharmony_ci 
HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 299cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 300cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5); 301cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 302cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 303cabdff1aSopenharmony_ci ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); 304cabdff1aSopenharmony_ci dst += (4 * dst_stride); 305cabdff1aSopenharmony_ci } 306cabdff1aSopenharmony_ci} 307cabdff1aSopenharmony_ci 308cabdff1aSopenharmony_cistatic void hevc_bi_copy_16w_msa(uint8_t *src0_ptr, 309cabdff1aSopenharmony_ci int32_t src_stride, 310cabdff1aSopenharmony_ci int16_t *src1_ptr, 311cabdff1aSopenharmony_ci int32_t src2_stride, 312cabdff1aSopenharmony_ci uint8_t *dst, 313cabdff1aSopenharmony_ci int32_t dst_stride, 314cabdff1aSopenharmony_ci int32_t height) 315cabdff1aSopenharmony_ci{ 316cabdff1aSopenharmony_ci uint32_t loop_cnt; 317cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 318cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 319cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 320cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l; 321cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 322cabdff1aSopenharmony_ci 323cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 324cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 325cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 326cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 327cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 328cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 329cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0_r, dst0_l); 330cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst1_r, dst1_l); 331cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, 
dst2_r, dst2_l); 332cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst3_r, dst3_l); 333cabdff1aSopenharmony_ci SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 334cabdff1aSopenharmony_ci SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); 335cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l, 336cabdff1aSopenharmony_ci dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l); 337cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l, 338cabdff1aSopenharmony_ci dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l); 339cabdff1aSopenharmony_ci PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1); 340cabdff1aSopenharmony_ci PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3); 341cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, dst_stride); 342cabdff1aSopenharmony_ci dst += (4 * dst_stride); 343cabdff1aSopenharmony_ci } 344cabdff1aSopenharmony_ci} 345cabdff1aSopenharmony_ci 346cabdff1aSopenharmony_cistatic void hevc_bi_copy_24w_msa(uint8_t *src0_ptr, 347cabdff1aSopenharmony_ci int32_t src_stride, 348cabdff1aSopenharmony_ci int16_t *src1_ptr, 349cabdff1aSopenharmony_ci int32_t src2_stride, 350cabdff1aSopenharmony_ci uint8_t *dst, 351cabdff1aSopenharmony_ci int32_t dst_stride, 352cabdff1aSopenharmony_ci int32_t height) 353cabdff1aSopenharmony_ci{ 354cabdff1aSopenharmony_ci uint32_t loop_cnt; 355cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5; 356cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 }; 357cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10; 358cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11; 359cabdff1aSopenharmony_ci 360cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 361cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5); 362cabdff1aSopenharmony_ci LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7); 
363cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 364cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 365cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 366cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11); 367cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 368cabdff1aSopenharmony_ci 369cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 370cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 371cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5); 372cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src4, dst6, dst7); 373cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src5, dst8, dst9); 374cabdff1aSopenharmony_ci ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11); 375cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 376cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 377cabdff1aSopenharmony_ci SLLI_4V(dst8, dst9, dst10, dst11, 6); 378cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3, 379cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 380cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7, 381cabdff1aSopenharmony_ci 7, dst4, dst5, dst6, dst7); 382cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10, 383cabdff1aSopenharmony_ci dst11, 7, dst8, dst9, dst10, dst11); 384cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 385cabdff1aSopenharmony_ci PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); 386cabdff1aSopenharmony_ci ST_UB4(out0, out1, out3, out4, dst, dst_stride); 387cabdff1aSopenharmony_ci ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride); 388cabdff1aSopenharmony_ci dst += (4 * dst_stride); 389cabdff1aSopenharmony_ci } 390cabdff1aSopenharmony_ci} 391cabdff1aSopenharmony_ci 392cabdff1aSopenharmony_cistatic void hevc_bi_copy_32w_msa(uint8_t 
*src0_ptr, 393cabdff1aSopenharmony_ci int32_t src_stride, 394cabdff1aSopenharmony_ci int16_t *src1_ptr, 395cabdff1aSopenharmony_ci int32_t src2_stride, 396cabdff1aSopenharmony_ci uint8_t *dst, 397cabdff1aSopenharmony_ci int32_t dst_stride, 398cabdff1aSopenharmony_ci int32_t height) 399cabdff1aSopenharmony_ci{ 400cabdff1aSopenharmony_ci uint32_t loop_cnt; 401cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 402cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 403cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 404cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 405cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 406cabdff1aSopenharmony_ci 407cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 408cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 409cabdff1aSopenharmony_ci src0_ptr += src_stride; 410cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src2, src3); 411cabdff1aSopenharmony_ci src0_ptr += src_stride; 412cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 413cabdff1aSopenharmony_ci src1_ptr += src2_stride; 414cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in4, in5, in6, in7); 415cabdff1aSopenharmony_ci src1_ptr += src2_stride; 416cabdff1aSopenharmony_ci 417cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 418cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 419cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 420cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 421cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 422cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 423cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 424cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 425cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7, 426cabdff1aSopenharmony_ci 7, dst4, dst5, dst6, dst7); 427cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, 
dst0, dst3, dst2, out0, out1); 428cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 429cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 430cabdff1aSopenharmony_ci dst += dst_stride; 431cabdff1aSopenharmony_ci ST_UB2(out2, out3, dst, 16); 432cabdff1aSopenharmony_ci dst += dst_stride; 433cabdff1aSopenharmony_ci } 434cabdff1aSopenharmony_ci} 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_cistatic void hevc_bi_copy_48w_msa(uint8_t *src0_ptr, 437cabdff1aSopenharmony_ci int32_t src_stride, 438cabdff1aSopenharmony_ci int16_t *src1_ptr, 439cabdff1aSopenharmony_ci int32_t src2_stride, 440cabdff1aSopenharmony_ci uint8_t *dst, 441cabdff1aSopenharmony_ci int32_t dst_stride, 442cabdff1aSopenharmony_ci int32_t height) 443cabdff1aSopenharmony_ci{ 444cabdff1aSopenharmony_ci uint32_t loop_cnt; 445cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3, out4, out5; 446cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 447cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 448cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10; 449cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11; 450cabdff1aSopenharmony_ci 451cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 452cabdff1aSopenharmony_ci LD_SB3(src0_ptr, 16, src0, src1, src2); 453cabdff1aSopenharmony_ci src0_ptr += src_stride; 454cabdff1aSopenharmony_ci LD_SB3(src0_ptr, 16, src3, src4, src5); 455cabdff1aSopenharmony_ci src0_ptr += src_stride; 456cabdff1aSopenharmony_ci 457cabdff1aSopenharmony_ci LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5); 458cabdff1aSopenharmony_ci src1_ptr += src2_stride; 459cabdff1aSopenharmony_ci LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11); 460cabdff1aSopenharmony_ci src1_ptr += src2_stride; 461cabdff1aSopenharmony_ci 462cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 463cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 
464cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 465cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 466cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src4, dst8, dst9); 467cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src5, dst10, dst11); 468cabdff1aSopenharmony_ci 469cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 470cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 471cabdff1aSopenharmony_ci SLLI_4V(dst8, dst9, dst10, dst11, 6); 472cabdff1aSopenharmony_ci 473cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 474cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 475cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7, 476cabdff1aSopenharmony_ci 7, dst4, dst5, dst6, dst7); 477cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10, 478cabdff1aSopenharmony_ci dst11, 7, dst8, dst9, dst10, dst11); 479cabdff1aSopenharmony_ci PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); 480cabdff1aSopenharmony_ci PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); 481cabdff1aSopenharmony_ci ST_UB2(out0, out1, dst, 16); 482cabdff1aSopenharmony_ci ST_UB(out2, dst + 32); 483cabdff1aSopenharmony_ci dst += dst_stride; 484cabdff1aSopenharmony_ci ST_UB2(out3, out4, dst, 16); 485cabdff1aSopenharmony_ci ST_UB(out5, dst + 32); 486cabdff1aSopenharmony_ci dst += dst_stride; 487cabdff1aSopenharmony_ci } 488cabdff1aSopenharmony_ci} 489cabdff1aSopenharmony_ci 490cabdff1aSopenharmony_cistatic void hevc_bi_copy_64w_msa(uint8_t *src0_ptr, 491cabdff1aSopenharmony_ci int32_t src_stride, 492cabdff1aSopenharmony_ci int16_t *src1_ptr, 493cabdff1aSopenharmony_ci int32_t src2_stride, 494cabdff1aSopenharmony_ci uint8_t *dst, 495cabdff1aSopenharmony_ci int32_t dst_stride, 496cabdff1aSopenharmony_ci int32_t height) 497cabdff1aSopenharmony_ci{ 498cabdff1aSopenharmony_ci uint32_t loop_cnt; 499cabdff1aSopenharmony_ci 
v16u8 out0, out1, out2, out3; 500cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 501cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 502cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 503cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 504cabdff1aSopenharmony_ci 505cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 506cabdff1aSopenharmony_ci LD_SB4(src0_ptr, 16, src0, src1, src2, src3); 507cabdff1aSopenharmony_ci src0_ptr += src_stride; 508cabdff1aSopenharmony_ci LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7); 509cabdff1aSopenharmony_ci src1_ptr += src2_stride; 510cabdff1aSopenharmony_ci 511cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src0, dst0, dst1); 512cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src1, dst2, dst3); 513cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src2, dst4, dst5); 514cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, src3, dst6, dst7); 515cabdff1aSopenharmony_ci SLLI_4V(dst0, dst1, dst2, dst3, 6); 516cabdff1aSopenharmony_ci SLLI_4V(dst4, dst5, dst6, dst7, 6); 517cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 518cabdff1aSopenharmony_ci 7, dst0, dst1, dst2, dst3); 519cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7, 520cabdff1aSopenharmony_ci 7, dst4, dst5, dst6, dst7); 521cabdff1aSopenharmony_ci PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); 522cabdff1aSopenharmony_ci PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3); 523cabdff1aSopenharmony_ci 524cabdff1aSopenharmony_ci ST_UB4(out0, out1, out2, out3, dst, 16); 525cabdff1aSopenharmony_ci dst += dst_stride; 526cabdff1aSopenharmony_ci } 527cabdff1aSopenharmony_ci} 528cabdff1aSopenharmony_ci 529cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr, 530cabdff1aSopenharmony_ci int32_t src_stride, 531cabdff1aSopenharmony_ci int16_t *src1_ptr, 532cabdff1aSopenharmony_ci int32_t src2_stride, 533cabdff1aSopenharmony_ci uint8_t *dst, 
534cabdff1aSopenharmony_ci int32_t dst_stride, 535cabdff1aSopenharmony_ci const int8_t *filter, 536cabdff1aSopenharmony_ci int32_t height) 537cabdff1aSopenharmony_ci{ 538cabdff1aSopenharmony_ci uint32_t loop_cnt; 539cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 540cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 541cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 542cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 543cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 544cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 545cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 546cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 547cabdff1aSopenharmony_ci 548cabdff1aSopenharmony_ci src0_ptr -= 3; 549cabdff1aSopenharmony_ci 550cabdff1aSopenharmony_ci /* rearranging filter */ 551cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 552cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 553cabdff1aSopenharmony_ci 554cabdff1aSopenharmony_ci mask1 = mask0 + 2; 555cabdff1aSopenharmony_ci mask2 = mask0 + 4; 556cabdff1aSopenharmony_ci mask3 = mask0 + 6; 557cabdff1aSopenharmony_ci 558cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 559cabdff1aSopenharmony_ci const_vec <<= 6; 560cabdff1aSopenharmony_ci 561cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 562cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3, 563cabdff1aSopenharmony_ci src4, src5, src6, src7); 564cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 565cabdff1aSopenharmony_ci LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); 566cabdff1aSopenharmony_ci src1_ptr += (8 * src2_stride); 567cabdff1aSopenharmony_ci 568cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 569cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 570cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, 
src6, src7); 571cabdff1aSopenharmony_ci 572cabdff1aSopenharmony_ci dst0 = const_vec; 573cabdff1aSopenharmony_ci dst1 = const_vec; 574cabdff1aSopenharmony_ci dst2 = const_vec; 575cabdff1aSopenharmony_ci dst3 = const_vec; 576cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 577cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3); 578cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 579cabdff1aSopenharmony_ci dst1, dst2, dst3); 580cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1); 581cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3); 582cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 583cabdff1aSopenharmony_ci dst1, dst2, dst3); 584cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1); 585cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3); 586cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 587cabdff1aSopenharmony_ci dst1, dst2, dst3); 588cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1); 589cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3); 590cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 591cabdff1aSopenharmony_ci dst1, dst2, dst3); 592cabdff1aSopenharmony_ci 593cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 594cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 595cabdff1aSopenharmony_ci 596cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 597cabdff1aSopenharmony_ci ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 598cabdff1aSopenharmony_ci dst += (8 * dst_stride); 599cabdff1aSopenharmony_ci } 600cabdff1aSopenharmony_ci} 
601cabdff1aSopenharmony_ci 602cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr, 603cabdff1aSopenharmony_ci int32_t src_stride, 604cabdff1aSopenharmony_ci int16_t *src1_ptr, 605cabdff1aSopenharmony_ci int32_t src2_stride, 606cabdff1aSopenharmony_ci uint8_t *dst, 607cabdff1aSopenharmony_ci int32_t dst_stride, 608cabdff1aSopenharmony_ci const int8_t *filter, 609cabdff1aSopenharmony_ci int32_t height) 610cabdff1aSopenharmony_ci{ 611cabdff1aSopenharmony_ci uint32_t loop_cnt; 612cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 613cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 614cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 615cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 616cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 617cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 618cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 619cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_ci src0_ptr -= 3; 622cabdff1aSopenharmony_ci 623cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 624cabdff1aSopenharmony_ci const_vec <<= 6; 625cabdff1aSopenharmony_ci 626cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 627cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 628cabdff1aSopenharmony_ci 629cabdff1aSopenharmony_ci mask1 = mask0 + 2; 630cabdff1aSopenharmony_ci mask2 = mask0 + 4; 631cabdff1aSopenharmony_ci mask3 = mask0 + 6; 632cabdff1aSopenharmony_ci 633cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 634cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 635cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 636cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 637cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 638cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 639cabdff1aSopenharmony_ci 640cabdff1aSopenharmony_ci 
dst0 = const_vec; 641cabdff1aSopenharmony_ci dst1 = const_vec; 642cabdff1aSopenharmony_ci dst2 = const_vec; 643cabdff1aSopenharmony_ci dst3 = const_vec; 644cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 645cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 646cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 647cabdff1aSopenharmony_ci dst1, dst2, dst3); 648cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 649cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 650cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 651cabdff1aSopenharmony_ci dst1, dst2, dst3); 652cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1); 653cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3); 654cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 655cabdff1aSopenharmony_ci dst1, dst2, dst3); 656cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1); 657cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3); 658cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 659cabdff1aSopenharmony_ci dst1, dst2, dst3); 660cabdff1aSopenharmony_ci 661cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 662cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 665cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 666cabdff1aSopenharmony_ci dst += (4 * dst_stride); 667cabdff1aSopenharmony_ci } 668cabdff1aSopenharmony_ci} 669cabdff1aSopenharmony_ci 670cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_12w_msa(uint8_t 
*src0_ptr, 671cabdff1aSopenharmony_ci int32_t src_stride, 672cabdff1aSopenharmony_ci int16_t *src1_ptr, 673cabdff1aSopenharmony_ci int32_t src2_stride, 674cabdff1aSopenharmony_ci uint8_t *dst, 675cabdff1aSopenharmony_ci int32_t dst_stride, 676cabdff1aSopenharmony_ci const int8_t *filter, 677cabdff1aSopenharmony_ci int32_t height) 678cabdff1aSopenharmony_ci{ 679cabdff1aSopenharmony_ci uint32_t loop_cnt; 680cabdff1aSopenharmony_ci int32_t tmp0, tmp1; 681cabdff1aSopenharmony_ci int64_t tmp2, tmp3; 682cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 683cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2; 684cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 685cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 686cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2; 687cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 688cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 689cabdff1aSopenharmony_ci 690cabdff1aSopenharmony_ci src0_ptr -= 3; 691cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 692cabdff1aSopenharmony_ci const_vec <<= 6; 693cabdff1aSopenharmony_ci 694cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 695cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 696cabdff1aSopenharmony_ci 697cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 698cabdff1aSopenharmony_ci mask1 = mask0 + 2; 699cabdff1aSopenharmony_ci mask2 = mask0 + 4; 700cabdff1aSopenharmony_ci mask3 = mask0 + 6; 701cabdff1aSopenharmony_ci mask4 = LD_SB(&ff_hevc_mask_arr[16]); 702cabdff1aSopenharmony_ci mask5 = mask4 + 2; 703cabdff1aSopenharmony_ci mask6 = mask4 + 4; 704cabdff1aSopenharmony_ci mask7 = mask4 + 6; 705cabdff1aSopenharmony_ci 706cabdff1aSopenharmony_ci for (loop_cnt = 8; loop_cnt--;) { 707cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 8, src0, src1); 708cabdff1aSopenharmony_ci src0_ptr += src_stride; 709cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 8, src2, src3); 710cabdff1aSopenharmony_ci src0_ptr += 
src_stride; 711cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in0, in1); 712cabdff1aSopenharmony_ci src1_ptr += src2_stride; 713cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in2, in3); 714cabdff1aSopenharmony_ci src1_ptr += src2_stride; 715cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 716cabdff1aSopenharmony_ci 717cabdff1aSopenharmony_ci dst0 = const_vec; 718cabdff1aSopenharmony_ci dst1 = const_vec; 719cabdff1aSopenharmony_ci dst2 = const_vec; 720cabdff1aSopenharmony_ci 721cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0, 722cabdff1aSopenharmony_ci vec0, vec1, vec2); 723cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1); 724cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0); 725cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1, 726cabdff1aSopenharmony_ci vec0, vec1, vec2); 727cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1); 728cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1); 729cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2, 730cabdff1aSopenharmony_ci vec0, vec1, vec2); 731cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1); 732cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2); 733cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3, 734cabdff1aSopenharmony_ci vec0, vec1, vec2); 735cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1); 736cabdff1aSopenharmony_ci dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3); 737cabdff1aSopenharmony_ci 738cabdff1aSopenharmony_ci in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1); 739cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); 740cabdff1aSopenharmony_ci dst2 = __msa_adds_s_h(in2, dst2); 741cabdff1aSopenharmony_ci dst2 = __msa_srari_h(dst2, 7); 
742cabdff1aSopenharmony_ci CLIP_SH_0_255(dst2); 743cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1); 744cabdff1aSopenharmony_ci 745cabdff1aSopenharmony_ci tmp2 = __msa_copy_s_d((v2i64) dst0, 0); 746cabdff1aSopenharmony_ci tmp0 = __msa_copy_s_w((v4i32) dst0, 2); 747cabdff1aSopenharmony_ci tmp3 = __msa_copy_s_d((v2i64) dst1, 0); 748cabdff1aSopenharmony_ci tmp1 = __msa_copy_s_w((v4i32) dst0, 3); 749cabdff1aSopenharmony_ci SD(tmp2, dst); 750cabdff1aSopenharmony_ci SW(tmp0, dst + 8); 751cabdff1aSopenharmony_ci dst += dst_stride; 752cabdff1aSopenharmony_ci SD(tmp3, dst); 753cabdff1aSopenharmony_ci SW(tmp1, dst + 8); 754cabdff1aSopenharmony_ci dst += dst_stride; 755cabdff1aSopenharmony_ci } 756cabdff1aSopenharmony_ci} 757cabdff1aSopenharmony_ci 758cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr, 759cabdff1aSopenharmony_ci int32_t src_stride, 760cabdff1aSopenharmony_ci int16_t *src1_ptr, 761cabdff1aSopenharmony_ci int32_t src2_stride, 762cabdff1aSopenharmony_ci uint8_t *dst, 763cabdff1aSopenharmony_ci int32_t dst_stride, 764cabdff1aSopenharmony_ci const int8_t *filter, 765cabdff1aSopenharmony_ci int32_t height) 766cabdff1aSopenharmony_ci{ 767cabdff1aSopenharmony_ci uint32_t loop_cnt; 768cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 769cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 770cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 771cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 772cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 773cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 774cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 775cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 776cabdff1aSopenharmony_ci 777cabdff1aSopenharmony_ci src0_ptr -= 3; 778cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 779cabdff1aSopenharmony_ci const_vec <<= 6; 780cabdff1aSopenharmony_ci 781cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 782cabdff1aSopenharmony_ci 
SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 783cabdff1aSopenharmony_ci 784cabdff1aSopenharmony_ci mask1 = mask0 + 2; 785cabdff1aSopenharmony_ci mask2 = mask0 + 4; 786cabdff1aSopenharmony_ci mask3 = mask0 + 6; 787cabdff1aSopenharmony_ci 788cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 789cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 8, src0, src1); 790cabdff1aSopenharmony_ci src0_ptr += src_stride; 791cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 8, src2, src3); 792cabdff1aSopenharmony_ci src0_ptr += src_stride; 793cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in0, in1); 794cabdff1aSopenharmony_ci src1_ptr += src2_stride; 795cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in2, in3); 796cabdff1aSopenharmony_ci src1_ptr += src2_stride; 797cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 798cabdff1aSopenharmony_ci 799cabdff1aSopenharmony_ci dst0 = const_vec; 800cabdff1aSopenharmony_ci dst1 = const_vec; 801cabdff1aSopenharmony_ci dst2 = const_vec; 802cabdff1aSopenharmony_ci dst3 = const_vec; 803cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 804cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 805cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 806cabdff1aSopenharmony_ci dst1, dst2, dst3); 807cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 808cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 809cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 810cabdff1aSopenharmony_ci dst1, dst2, dst3); 811cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1); 812cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3); 813cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 814cabdff1aSopenharmony_ci dst1, 
dst2, dst3); 815cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1); 816cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3); 817cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 818cabdff1aSopenharmony_ci dst1, dst2, dst3); 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 821cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 822cabdff1aSopenharmony_ci 823cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 824cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, dst_stride); 825cabdff1aSopenharmony_ci dst += (2 * dst_stride); 826cabdff1aSopenharmony_ci } 827cabdff1aSopenharmony_ci} 828cabdff1aSopenharmony_ci 829cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr, 830cabdff1aSopenharmony_ci int32_t src_stride, 831cabdff1aSopenharmony_ci int16_t *src1_ptr, 832cabdff1aSopenharmony_ci int32_t src2_stride, 833cabdff1aSopenharmony_ci uint8_t *dst, 834cabdff1aSopenharmony_ci int32_t dst_stride, 835cabdff1aSopenharmony_ci const int8_t *filter, 836cabdff1aSopenharmony_ci int32_t height) 837cabdff1aSopenharmony_ci{ 838cabdff1aSopenharmony_ci uint32_t loop_cnt; 839cabdff1aSopenharmony_ci uint64_t dst_val0; 840cabdff1aSopenharmony_ci v16i8 src0, src1, tmp0, tmp1; 841cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 842cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 843cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 844cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2; 845cabdff1aSopenharmony_ci v8i16 in0, in1, in2; 846cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 847cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 848cabdff1aSopenharmony_ci 849cabdff1aSopenharmony_ci src0_ptr = src0_ptr - 3; 850cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 851cabdff1aSopenharmony_ci const_vec <<= 6; 
852cabdff1aSopenharmony_ci 853cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 854cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 855cabdff1aSopenharmony_ci 856cabdff1aSopenharmony_ci mask1 = mask0 + 2; 857cabdff1aSopenharmony_ci mask2 = mask0 + 4; 858cabdff1aSopenharmony_ci mask3 = mask0 + 6; 859cabdff1aSopenharmony_ci mask4 = mask0 + 8; 860cabdff1aSopenharmony_ci mask5 = mask0 + 10; 861cabdff1aSopenharmony_ci mask6 = mask0 + 12; 862cabdff1aSopenharmony_ci mask7 = mask0 + 14; 863cabdff1aSopenharmony_ci 864cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 865cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 866cabdff1aSopenharmony_ci src0_ptr += src_stride; 867cabdff1aSopenharmony_ci LD_SH2(src1_ptr, 8, in0, in1); 868cabdff1aSopenharmony_ci in2 = LD_SH(src1_ptr + 16); 869cabdff1aSopenharmony_ci src1_ptr += src2_stride; 870cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 871cabdff1aSopenharmony_ci 872cabdff1aSopenharmony_ci dst0 = const_vec; 873cabdff1aSopenharmony_ci dst1 = const_vec; 874cabdff1aSopenharmony_ci dst2 = const_vec; 875cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1); 876cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3); 877cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0, 878cabdff1aSopenharmony_ci dst1, dst2, dst0); 879cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1); 880cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3); 881cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1, 882cabdff1aSopenharmony_ci dst2, dst0, dst1); 883cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1); 884cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3); 885cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, 
filt2, filt3, filt3, filt3, dst2, 886cabdff1aSopenharmony_ci dst0, dst1, dst2); 887cabdff1aSopenharmony_ci 888cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); 889cabdff1aSopenharmony_ci dst2 = __msa_adds_s_h(dst2, in2); 890cabdff1aSopenharmony_ci dst2 = __msa_srari_h(dst2, 7); 891cabdff1aSopenharmony_ci CLIP_SH_0_255(dst2); 892cabdff1aSopenharmony_ci 893cabdff1aSopenharmony_ci PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1); 894cabdff1aSopenharmony_ci dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0); 895cabdff1aSopenharmony_ci ST_SB(tmp0, dst); 896cabdff1aSopenharmony_ci SD(dst_val0, dst + 16); 897cabdff1aSopenharmony_ci dst += dst_stride; 898cabdff1aSopenharmony_ci } 899cabdff1aSopenharmony_ci} 900cabdff1aSopenharmony_ci 901cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr, 902cabdff1aSopenharmony_ci int32_t src_stride, 903cabdff1aSopenharmony_ci int16_t *src1_ptr, 904cabdff1aSopenharmony_ci int32_t src2_stride, 905cabdff1aSopenharmony_ci uint8_t *dst, 906cabdff1aSopenharmony_ci int32_t dst_stride, 907cabdff1aSopenharmony_ci const int8_t *filter, 908cabdff1aSopenharmony_ci int32_t height) 909cabdff1aSopenharmony_ci{ 910cabdff1aSopenharmony_ci uint32_t loop_cnt; 911cabdff1aSopenharmony_ci v16i8 src0, src1, src2, tmp0, tmp1; 912cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 913cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 914cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 915cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 916cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 917cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 918cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 919cabdff1aSopenharmony_ci 920cabdff1aSopenharmony_ci src0_ptr -= 3; 921cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 922cabdff1aSopenharmony_ci const_vec <<= 6; 923cabdff1aSopenharmony_ci 924cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 
925cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 926cabdff1aSopenharmony_ci 927cabdff1aSopenharmony_ci mask1 = mask0 + 2; 928cabdff1aSopenharmony_ci mask2 = mask0 + 4; 929cabdff1aSopenharmony_ci mask3 = mask0 + 6; 930cabdff1aSopenharmony_ci mask4 = mask0 + 8; 931cabdff1aSopenharmony_ci mask5 = mask0 + 10; 932cabdff1aSopenharmony_ci mask6 = mask0 + 12; 933cabdff1aSopenharmony_ci mask7 = mask0 + 14; 934cabdff1aSopenharmony_ci 935cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 936cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 937cabdff1aSopenharmony_ci src2 = LD_SB(src0_ptr + 24); 938cabdff1aSopenharmony_ci src0_ptr += src_stride; 939cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 940cabdff1aSopenharmony_ci src1_ptr += src2_stride; 941cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 942cabdff1aSopenharmony_ci 943cabdff1aSopenharmony_ci dst0 = const_vec; 944cabdff1aSopenharmony_ci dst1 = const_vec; 945cabdff1aSopenharmony_ci dst2 = const_vec; 946cabdff1aSopenharmony_ci dst3 = const_vec; 947cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1); 948cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3); 949cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 950cabdff1aSopenharmony_ci dst1, dst2, dst3); 951cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1); 952cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3); 953cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 954cabdff1aSopenharmony_ci dst1, dst2, dst3); 955cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1); 956cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3); 957cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, 
filt2, filt2, dst0, 958cabdff1aSopenharmony_ci dst1, dst2, dst3); 959cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1); 960cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3); 961cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 962cabdff1aSopenharmony_ci dst1, dst2, dst3); 963cabdff1aSopenharmony_ci 964cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 965cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 966cabdff1aSopenharmony_ci 967cabdff1aSopenharmony_ci PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1); 968cabdff1aSopenharmony_ci ST_SB2(tmp0, tmp1, dst, 16); 969cabdff1aSopenharmony_ci dst += dst_stride; 970cabdff1aSopenharmony_ci } 971cabdff1aSopenharmony_ci} 972cabdff1aSopenharmony_ci 973cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr, 974cabdff1aSopenharmony_ci int32_t src_stride, 975cabdff1aSopenharmony_ci int16_t *src1_ptr, 976cabdff1aSopenharmony_ci int32_t src2_stride, 977cabdff1aSopenharmony_ci uint8_t *dst, 978cabdff1aSopenharmony_ci int32_t dst_stride, 979cabdff1aSopenharmony_ci const int8_t *filter, 980cabdff1aSopenharmony_ci int32_t height) 981cabdff1aSopenharmony_ci{ 982cabdff1aSopenharmony_ci uint32_t loop_cnt; 983cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 984cabdff1aSopenharmony_ci v16i8 tmp0, tmp1, tmp2; 985cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 986cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 987cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 988cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 989cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 990cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 991cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 992cabdff1aSopenharmony_ci 993cabdff1aSopenharmony_ci src0_ptr -= 3; 994cabdff1aSopenharmony_ci 
995cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 996cabdff1aSopenharmony_ci const_vec <<= 6; 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 999cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1000cabdff1aSopenharmony_ci 1001cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1002cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1003cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1004cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1005cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1006cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1007cabdff1aSopenharmony_ci mask7 = mask0 + 14; 1008cabdff1aSopenharmony_ci 1009cabdff1aSopenharmony_ci for (loop_cnt = 64; loop_cnt--;) { 1010cabdff1aSopenharmony_ci LD_SB3(src0_ptr, 16, src0, src1, src2); 1011cabdff1aSopenharmony_ci src3 = LD_SB(src0_ptr + 40); 1012cabdff1aSopenharmony_ci src0_ptr += src_stride; 1013cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 1014cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1015cabdff1aSopenharmony_ci 1016cabdff1aSopenharmony_ci dst0 = const_vec; 1017cabdff1aSopenharmony_ci dst1 = const_vec; 1018cabdff1aSopenharmony_ci dst2 = const_vec; 1019cabdff1aSopenharmony_ci dst3 = const_vec; 1020cabdff1aSopenharmony_ci 1021cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1); 1022cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3); 1023cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 1024cabdff1aSopenharmony_ci dst1, dst2, dst3); 1025cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1); 1026cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3); 1027cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 1028cabdff1aSopenharmony_ci dst1, dst2, dst3); 1029cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, 
mask2, mask6, vec0, vec1); 1030cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3); 1031cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 1032cabdff1aSopenharmony_ci dst1, dst2, dst3); 1033cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1); 1034cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3); 1035cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 1036cabdff1aSopenharmony_ci dst1, dst2, dst3); 1037cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); 1038cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3); 1039cabdff1aSopenharmony_ci PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1); 1040cabdff1aSopenharmony_ci ST_SB(tmp0, dst); 1041cabdff1aSopenharmony_ci ST_SB(tmp1, dst + 16); 1042cabdff1aSopenharmony_ci 1043cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 32, 8, in4, in5); 1044cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1045cabdff1aSopenharmony_ci 1046cabdff1aSopenharmony_ci dst4 = const_vec; 1047cabdff1aSopenharmony_ci dst5 = const_vec; 1048cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1); 1049cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 1050cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, 1051cabdff1aSopenharmony_ci dst5, dst4, dst5); 1052cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1); 1053cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3); 1054cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4, 1055cabdff1aSopenharmony_ci dst5, dst4, dst5); 1056cabdff1aSopenharmony_ci 1057cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); 1058cabdff1aSopenharmony_ci 
1059cabdff1aSopenharmony_ci tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 1060cabdff1aSopenharmony_ci ST_SB(tmp2, dst + 32); 1061cabdff1aSopenharmony_ci dst += dst_stride; 1062cabdff1aSopenharmony_ci } 1063cabdff1aSopenharmony_ci} 1064cabdff1aSopenharmony_ci 1065cabdff1aSopenharmony_cistatic void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr, 1066cabdff1aSopenharmony_ci int32_t src_stride, 1067cabdff1aSopenharmony_ci int16_t *src1_ptr, 1068cabdff1aSopenharmony_ci int32_t src2_stride, 1069cabdff1aSopenharmony_ci uint8_t *dst, 1070cabdff1aSopenharmony_ci int32_t dst_stride, 1071cabdff1aSopenharmony_ci const int8_t *filter, 1072cabdff1aSopenharmony_ci int32_t height) 1073cabdff1aSopenharmony_ci{ 1074cabdff1aSopenharmony_ci uint32_t loop_cnt; 1075cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1; 1076cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1077cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 1078cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1079cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 1080cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 1081cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1082cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1083cabdff1aSopenharmony_ci 1084cabdff1aSopenharmony_ci src0_ptr -= 3; 1085cabdff1aSopenharmony_ci 1086cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1087cabdff1aSopenharmony_ci const_vec <<= 6; 1088cabdff1aSopenharmony_ci 1089cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1090cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1093cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1094cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1095cabdff1aSopenharmony_ci mask4 = mask0 + 8; 1096cabdff1aSopenharmony_ci mask5 = mask0 + 10; 1097cabdff1aSopenharmony_ci mask6 = mask0 + 12; 1098cabdff1aSopenharmony_ci mask7 = 
mask0 + 14; 1099cabdff1aSopenharmony_ci 1100cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1101cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 1102cabdff1aSopenharmony_ci src2 = LD_SB(src0_ptr + 24); 1103cabdff1aSopenharmony_ci LD_SB2(src0_ptr + 32, 16, src3, src4); 1104cabdff1aSopenharmony_ci src5 = LD_SB(src0_ptr + 56); 1105cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 1106cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1107cabdff1aSopenharmony_ci 1108cabdff1aSopenharmony_ci dst0 = const_vec; 1109cabdff1aSopenharmony_ci dst1 = const_vec; 1110cabdff1aSopenharmony_ci dst2 = const_vec; 1111cabdff1aSopenharmony_ci dst3 = const_vec; 1112cabdff1aSopenharmony_ci 1113cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1); 1114cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3); 1115cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 1116cabdff1aSopenharmony_ci dst1, dst2, dst3); 1117cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1); 1118cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3); 1119cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 1120cabdff1aSopenharmony_ci dst1, dst2, dst3); 1121cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1); 1122cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3); 1123cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 1124cabdff1aSopenharmony_ci dst1, dst2, dst3); 1125cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1); 1126cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3); 1127cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 
1128cabdff1aSopenharmony_ci dst1, dst2, dst3); 1129cabdff1aSopenharmony_ci 1130cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 1131cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, 1132cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 1133cabdff1aSopenharmony_ci 1134cabdff1aSopenharmony_ci PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1); 1135cabdff1aSopenharmony_ci ST_SB2(tmp0, tmp1, dst, 16); 1136cabdff1aSopenharmony_ci 1137cabdff1aSopenharmony_ci src0 = src3; 1138cabdff1aSopenharmony_ci src1 = src4; 1139cabdff1aSopenharmony_ci src2 = src5; 1140cabdff1aSopenharmony_ci 1141cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3); 1142cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1143cabdff1aSopenharmony_ci 1144cabdff1aSopenharmony_ci dst0 = const_vec; 1145cabdff1aSopenharmony_ci dst1 = const_vec; 1146cabdff1aSopenharmony_ci dst2 = const_vec; 1147cabdff1aSopenharmony_ci dst3 = const_vec; 1148cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1); 1149cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3); 1150cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 1151cabdff1aSopenharmony_ci dst1, dst2, dst3); 1152cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1); 1153cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3); 1154cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 1155cabdff1aSopenharmony_ci dst1, dst2, dst3); 1156cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1); 1157cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3); 1158cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 1159cabdff1aSopenharmony_ci dst1, dst2, dst3); 1160cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, 
vec1); 1161cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3); 1162cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 1163cabdff1aSopenharmony_ci dst1, dst2, dst3); 1164cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 1165cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, 1166cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 1167cabdff1aSopenharmony_ci PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1); 1168cabdff1aSopenharmony_ci ST_SB2(tmp0, tmp1, dst + 32, 16); 1169cabdff1aSopenharmony_ci src1_ptr += src2_stride; 1170cabdff1aSopenharmony_ci src0_ptr += src_stride; 1171cabdff1aSopenharmony_ci dst += dst_stride; 1172cabdff1aSopenharmony_ci } 1173cabdff1aSopenharmony_ci} 1174cabdff1aSopenharmony_ci 1175cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr, 1176cabdff1aSopenharmony_ci int32_t src_stride, 1177cabdff1aSopenharmony_ci int16_t *src1_ptr, 1178cabdff1aSopenharmony_ci int32_t src2_stride, 1179cabdff1aSopenharmony_ci uint8_t *dst, 1180cabdff1aSopenharmony_ci int32_t dst_stride, 1181cabdff1aSopenharmony_ci const int8_t *filter, 1182cabdff1aSopenharmony_ci int32_t height) 1183cabdff1aSopenharmony_ci{ 1184cabdff1aSopenharmony_ci int32_t loop_cnt; 1185cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 1186cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10; 1187cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 1188cabdff1aSopenharmony_ci v16i8 src11, src12, src13, src14; 1189cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1190cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1191cabdff1aSopenharmony_ci v16i8 src1110_r, src1211_r, src1312_r, src1413_r; 1192cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776, src10998; 1193cabdff1aSopenharmony_ci v16i8 src12111110, src14131312; 1194cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76; 
1195cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1196cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1197cabdff1aSopenharmony_ci 1198cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1199cabdff1aSopenharmony_ci 1200cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1201cabdff1aSopenharmony_ci const_vec <<= 6; 1202cabdff1aSopenharmony_ci 1203cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1204cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1205cabdff1aSopenharmony_ci 1206cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1207cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1208cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1209cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1210cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1211cabdff1aSopenharmony_ci ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 1212cabdff1aSopenharmony_ci src2110, src4332, src6554); 1213cabdff1aSopenharmony_ci XORI_B3_128_SB(src2110, src4332, src6554); 1214cabdff1aSopenharmony_ci 1215cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 1216cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, 1217cabdff1aSopenharmony_ci src7, src8, src9, src10, src11, src12, src13, src14); 1218cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 1219cabdff1aSopenharmony_ci LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7); 1220cabdff1aSopenharmony_ci src1_ptr += (8 * src2_stride); 1221cabdff1aSopenharmony_ci 1222cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 1223cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 1224cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1225cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1226cabdff1aSopenharmony_ci ILVR_B4_SB(src11, src10, src12, src11, 
src13, src12, src14, src13, 1227cabdff1aSopenharmony_ci src1110_r, src1211_r, src1312_r, src1413_r); 1228cabdff1aSopenharmony_ci ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r, 1229cabdff1aSopenharmony_ci src1413_r, src1312_r, 1230cabdff1aSopenharmony_ci src8776, src10998, src12111110, src14131312); 1231cabdff1aSopenharmony_ci XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); 1232cabdff1aSopenharmony_ci 1233cabdff1aSopenharmony_ci dst10 = const_vec; 1234cabdff1aSopenharmony_ci DPADD_SB4_SH(src2110, src4332, src6554, src8776, 1235cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10); 1236cabdff1aSopenharmony_ci dst32 = const_vec; 1237cabdff1aSopenharmony_ci DPADD_SB4_SH(src4332, src6554, src8776, src10998, 1238cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32); 1239cabdff1aSopenharmony_ci dst54 = const_vec; 1240cabdff1aSopenharmony_ci DPADD_SB4_SH(src6554, src8776, src10998, src12111110, 1241cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54); 1242cabdff1aSopenharmony_ci dst76 = const_vec; 1243cabdff1aSopenharmony_ci DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, 1244cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76); 1245cabdff1aSopenharmony_ci 1246cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 1247cabdff1aSopenharmony_ci dst10, dst32, dst54, dst76, 7, 1248cabdff1aSopenharmony_ci dst10, dst32, dst54, dst76); 1249cabdff1aSopenharmony_ci 1250cabdff1aSopenharmony_ci PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54); 1251cabdff1aSopenharmony_ci ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1252cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1253cabdff1aSopenharmony_ci 1254cabdff1aSopenharmony_ci src2110 = src10998; 1255cabdff1aSopenharmony_ci src4332 = src12111110; 1256cabdff1aSopenharmony_ci src6554 = src14131312; 1257cabdff1aSopenharmony_ci src6 = src14; 
1258cabdff1aSopenharmony_ci } 1259cabdff1aSopenharmony_ci} 1260cabdff1aSopenharmony_ci 1261cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr, 1262cabdff1aSopenharmony_ci int32_t src_stride, 1263cabdff1aSopenharmony_ci int16_t *src1_ptr, 1264cabdff1aSopenharmony_ci int32_t src2_stride, 1265cabdff1aSopenharmony_ci uint8_t *dst, 1266cabdff1aSopenharmony_ci int32_t dst_stride, 1267cabdff1aSopenharmony_ci const int8_t *filter, 1268cabdff1aSopenharmony_ci int32_t height) 1269cabdff1aSopenharmony_ci{ 1270cabdff1aSopenharmony_ci int32_t loop_cnt; 1271cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 1272cabdff1aSopenharmony_ci v16i8 src6, src7, src8, src9, src10; 1273cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1274cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1275cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1276cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 1277cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1278cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1281cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1282cabdff1aSopenharmony_ci const_vec <<= 6; 1283cabdff1aSopenharmony_ci 1284cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1285cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1286cabdff1aSopenharmony_ci 1287cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1288cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1289cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1290cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1291cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1292cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 
1293cabdff1aSopenharmony_ci 1294cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1295cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 1296cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 1297cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 1298cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 1299cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1300cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1301cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1302cabdff1aSopenharmony_ci 1303cabdff1aSopenharmony_ci dst0_r = const_vec; 1304cabdff1aSopenharmony_ci DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1305cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1306cabdff1aSopenharmony_ci dst0_r, dst0_r, dst0_r, dst0_r); 1307cabdff1aSopenharmony_ci dst1_r = const_vec; 1308cabdff1aSopenharmony_ci DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1309cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1310cabdff1aSopenharmony_ci dst1_r, dst1_r, dst1_r, dst1_r); 1311cabdff1aSopenharmony_ci dst2_r = const_vec; 1312cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, 1313cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1314cabdff1aSopenharmony_ci dst2_r, dst2_r, dst2_r, dst2_r); 1315cabdff1aSopenharmony_ci dst3_r = const_vec; 1316cabdff1aSopenharmony_ci DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, 1317cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1318cabdff1aSopenharmony_ci dst3_r, dst3_r, dst3_r, dst3_r); 1319cabdff1aSopenharmony_ci 1320cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 1321cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r, 7, 1322cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1323cabdff1aSopenharmony_ci 1324cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 1325cabdff1aSopenharmony_ci ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, 
dst, dst_stride); 1326cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1327cabdff1aSopenharmony_ci 1328cabdff1aSopenharmony_ci src10_r = src54_r; 1329cabdff1aSopenharmony_ci src32_r = src76_r; 1330cabdff1aSopenharmony_ci src54_r = src98_r; 1331cabdff1aSopenharmony_ci src21_r = src65_r; 1332cabdff1aSopenharmony_ci src43_r = src87_r; 1333cabdff1aSopenharmony_ci src65_r = src109_r; 1334cabdff1aSopenharmony_ci 1335cabdff1aSopenharmony_ci src6 = src10; 1336cabdff1aSopenharmony_ci } 1337cabdff1aSopenharmony_ci} 1338cabdff1aSopenharmony_ci 1339cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr, 1340cabdff1aSopenharmony_ci int32_t src_stride, 1341cabdff1aSopenharmony_ci int16_t *src1_ptr, 1342cabdff1aSopenharmony_ci int32_t src2_stride, 1343cabdff1aSopenharmony_ci uint8_t *dst, 1344cabdff1aSopenharmony_ci int32_t dst_stride, 1345cabdff1aSopenharmony_ci const int8_t *filter, 1346cabdff1aSopenharmony_ci int32_t height) 1347cabdff1aSopenharmony_ci{ 1348cabdff1aSopenharmony_ci int32_t loop_cnt; 1349cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1350cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 1351cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1352cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1353cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 1354cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; 1355cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; 1356cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554, src8776, src10998; 1357cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l; 1358cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1359cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1360cabdff1aSopenharmony_ci 1361cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1362cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 
1363cabdff1aSopenharmony_ci const_vec <<= 6; 1364cabdff1aSopenharmony_ci 1365cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1366cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1367cabdff1aSopenharmony_ci 1368cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1369cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1370cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1371cabdff1aSopenharmony_ci 1372cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1373cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1374cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1375cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1376cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1377cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1378cabdff1aSopenharmony_ci ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, 1379cabdff1aSopenharmony_ci src2110, src4332, src6554); 1380cabdff1aSopenharmony_ci 1381cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1382cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 1383cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 1384cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 1385cabdff1aSopenharmony_ci LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7); 1386cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 1387cabdff1aSopenharmony_ci 1388cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 1389cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1390cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1391cabdff1aSopenharmony_ci src76_r, src87_r, src98_r, src109_r); 1392cabdff1aSopenharmony_ci ILVL_B4_SB(src7, src6, src8, src7, src9, 
src8, src10, src9, 1393cabdff1aSopenharmony_ci src76_l, src87_l, src98_l, src109_l); 1394cabdff1aSopenharmony_ci ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998); 1395cabdff1aSopenharmony_ci 1396cabdff1aSopenharmony_ci dst0_r = const_vec; 1397cabdff1aSopenharmony_ci DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1398cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1399cabdff1aSopenharmony_ci dst0_r, dst0_r, dst0_r, dst0_r); 1400cabdff1aSopenharmony_ci dst1_r = const_vec; 1401cabdff1aSopenharmony_ci DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1402cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1403cabdff1aSopenharmony_ci dst1_r, dst1_r, dst1_r, dst1_r); 1404cabdff1aSopenharmony_ci dst2_r = const_vec; 1405cabdff1aSopenharmony_ci DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, 1406cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1407cabdff1aSopenharmony_ci dst2_r, dst2_r, dst2_r, dst2_r); 1408cabdff1aSopenharmony_ci dst3_r = const_vec; 1409cabdff1aSopenharmony_ci DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, 1410cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1411cabdff1aSopenharmony_ci dst3_r, dst3_r, dst3_r, dst3_r); 1412cabdff1aSopenharmony_ci dst0_l = const_vec; 1413cabdff1aSopenharmony_ci DPADD_SB4_SH(src2110, src4332, src6554, src8776, 1414cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1415cabdff1aSopenharmony_ci dst0_l, dst0_l, dst0_l, dst0_l); 1416cabdff1aSopenharmony_ci dst1_l = const_vec; 1417cabdff1aSopenharmony_ci DPADD_SB4_SH(src4332, src6554, src8776, src10998, 1418cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1419cabdff1aSopenharmony_ci dst1_l, dst1_l, dst1_l, dst1_l); 1420cabdff1aSopenharmony_ci 1421cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 1422cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r, 7, 1423cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 1424cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l); 
1425cabdff1aSopenharmony_ci 1426cabdff1aSopenharmony_ci 1427cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 1428cabdff1aSopenharmony_ci dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l); 1429cabdff1aSopenharmony_ci ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride); 1430cabdff1aSopenharmony_ci ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride); 1431cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1432cabdff1aSopenharmony_ci 1433cabdff1aSopenharmony_ci src10_r = src54_r; 1434cabdff1aSopenharmony_ci src32_r = src76_r; 1435cabdff1aSopenharmony_ci src54_r = src98_r; 1436cabdff1aSopenharmony_ci src21_r = src65_r; 1437cabdff1aSopenharmony_ci src43_r = src87_r; 1438cabdff1aSopenharmony_ci src65_r = src109_r; 1439cabdff1aSopenharmony_ci src2110 = src6554; 1440cabdff1aSopenharmony_ci src4332 = src8776; 1441cabdff1aSopenharmony_ci src6554 = src10998; 1442cabdff1aSopenharmony_ci src6 = src10; 1443cabdff1aSopenharmony_ci } 1444cabdff1aSopenharmony_ci} 1445cabdff1aSopenharmony_ci 1446cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr, 1447cabdff1aSopenharmony_ci int32_t src_stride, 1448cabdff1aSopenharmony_ci int16_t *src1_ptr, 1449cabdff1aSopenharmony_ci int32_t src2_stride, 1450cabdff1aSopenharmony_ci uint8_t *dst, 1451cabdff1aSopenharmony_ci int32_t dst_stride, 1452cabdff1aSopenharmony_ci const int8_t *filter, 1453cabdff1aSopenharmony_ci int32_t height, int32_t width) 1454cabdff1aSopenharmony_ci{ 1455cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 1456cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 1457cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1458cabdff1aSopenharmony_ci uint32_t loop_cnt; 1459cabdff1aSopenharmony_ci uint32_t cnt; 1460cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1461cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 1462cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r; 1463cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, 
src87_r; 1464cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r; 1465cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src76_l; 1466cabdff1aSopenharmony_ci v16i8 src21_l, src43_l, src65_l, src87_l; 1467cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l; 1468cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1469cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1470cabdff1aSopenharmony_ci 1471cabdff1aSopenharmony_ci src0_ptr -= (3 * src_stride); 1472cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1473cabdff1aSopenharmony_ci const_vec <<= 6; 1474cabdff1aSopenharmony_ci 1475cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 1476cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1477cabdff1aSopenharmony_ci 1478cabdff1aSopenharmony_ci for (cnt = (width >> 4); cnt--;) { 1479cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 1480cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 1481cabdff1aSopenharmony_ci dst_tmp = dst; 1482cabdff1aSopenharmony_ci 1483cabdff1aSopenharmony_ci LD_SB7(src0_ptr_tmp, src_stride, 1484cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6); 1485cabdff1aSopenharmony_ci src0_ptr_tmp += (7 * src_stride); 1486cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1487cabdff1aSopenharmony_ci 1488cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1489cabdff1aSopenharmony_ci src10_r, src32_r, src54_r, src21_r); 1490cabdff1aSopenharmony_ci ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1491cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1492cabdff1aSopenharmony_ci src10_l, src32_l, src54_l, src21_l); 1493cabdff1aSopenharmony_ci ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1494cabdff1aSopenharmony_ci 1495cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 1496cabdff1aSopenharmony_ci LD_SB2(src0_ptr_tmp, src_stride, src7, src8); 1497cabdff1aSopenharmony_ci src0_ptr_tmp += 
(2 * src_stride); 1498cabdff1aSopenharmony_ci LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); 1499cabdff1aSopenharmony_ci LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3); 1500cabdff1aSopenharmony_ci src1_ptr_tmp += (2 * src2_stride); 1501cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 1502cabdff1aSopenharmony_ci 1503cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 1504cabdff1aSopenharmony_ci ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 1505cabdff1aSopenharmony_ci 1506cabdff1aSopenharmony_ci dst0_r = const_vec; 1507cabdff1aSopenharmony_ci DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1508cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1509cabdff1aSopenharmony_ci dst0_r, dst0_r, dst0_r, dst0_r); 1510cabdff1aSopenharmony_ci dst1_r = const_vec; 1511cabdff1aSopenharmony_ci DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1512cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1513cabdff1aSopenharmony_ci dst1_r, dst1_r, dst1_r, dst1_r); 1514cabdff1aSopenharmony_ci dst0_l = const_vec; 1515cabdff1aSopenharmony_ci DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l, 1516cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1517cabdff1aSopenharmony_ci dst0_l, dst0_l, dst0_l, dst0_l); 1518cabdff1aSopenharmony_ci dst1_l = const_vec; 1519cabdff1aSopenharmony_ci DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l, 1520cabdff1aSopenharmony_ci filt0, filt1, filt2, filt3, 1521cabdff1aSopenharmony_ci dst1_l, dst1_l, dst1_l, dst1_l); 1522cabdff1aSopenharmony_ci 1523cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 1524cabdff1aSopenharmony_ci dst0_r, dst1_r, dst0_l, dst1_l, 7, 1525cabdff1aSopenharmony_ci dst0_r, dst1_r, dst0_l, dst1_l); 1526cabdff1aSopenharmony_ci 1527cabdff1aSopenharmony_ci PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 1528cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride); 1529cabdff1aSopenharmony_ci dst_tmp += (2 * dst_stride); 1530cabdff1aSopenharmony_ci 1531cabdff1aSopenharmony_ci 
src10_r = src32_r; 1532cabdff1aSopenharmony_ci src32_r = src54_r; 1533cabdff1aSopenharmony_ci src54_r = src76_r; 1534cabdff1aSopenharmony_ci src21_r = src43_r; 1535cabdff1aSopenharmony_ci src43_r = src65_r; 1536cabdff1aSopenharmony_ci src65_r = src87_r; 1537cabdff1aSopenharmony_ci src10_l = src32_l; 1538cabdff1aSopenharmony_ci src32_l = src54_l; 1539cabdff1aSopenharmony_ci src54_l = src76_l; 1540cabdff1aSopenharmony_ci src21_l = src43_l; 1541cabdff1aSopenharmony_ci src43_l = src65_l; 1542cabdff1aSopenharmony_ci src65_l = src87_l; 1543cabdff1aSopenharmony_ci src6 = src8; 1544cabdff1aSopenharmony_ci } 1545cabdff1aSopenharmony_ci 1546cabdff1aSopenharmony_ci src0_ptr += 16; 1547cabdff1aSopenharmony_ci src1_ptr += 16; 1548cabdff1aSopenharmony_ci dst += 16; 1549cabdff1aSopenharmony_ci } 1550cabdff1aSopenharmony_ci} 1551cabdff1aSopenharmony_ci 1552cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr, 1553cabdff1aSopenharmony_ci int32_t src_stride, 1554cabdff1aSopenharmony_ci int16_t *src1_ptr, 1555cabdff1aSopenharmony_ci int32_t src2_stride, 1556cabdff1aSopenharmony_ci uint8_t *dst, 1557cabdff1aSopenharmony_ci int32_t dst_stride, 1558cabdff1aSopenharmony_ci const int8_t *filter, 1559cabdff1aSopenharmony_ci int32_t height) 1560cabdff1aSopenharmony_ci{ 1561cabdff1aSopenharmony_ci hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 1562cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 16); 1563cabdff1aSopenharmony_ci} 1564cabdff1aSopenharmony_ci 1565cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr, 1566cabdff1aSopenharmony_ci int32_t src_stride, 1567cabdff1aSopenharmony_ci int16_t *src1_ptr, 1568cabdff1aSopenharmony_ci int32_t src2_stride, 1569cabdff1aSopenharmony_ci uint8_t *dst, 1570cabdff1aSopenharmony_ci int32_t dst_stride, 1571cabdff1aSopenharmony_ci const int8_t *filter, 1572cabdff1aSopenharmony_ci int32_t height) 1573cabdff1aSopenharmony_ci{ 1574cabdff1aSopenharmony_ci 
hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 1575cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 16); 1576cabdff1aSopenharmony_ci hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride, 1577cabdff1aSopenharmony_ci dst + 16, dst_stride, filter, height); 1578cabdff1aSopenharmony_ci} 1579cabdff1aSopenharmony_ci 1580cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr, 1581cabdff1aSopenharmony_ci int32_t src_stride, 1582cabdff1aSopenharmony_ci int16_t *src1_ptr, 1583cabdff1aSopenharmony_ci int32_t src2_stride, 1584cabdff1aSopenharmony_ci uint8_t *dst, 1585cabdff1aSopenharmony_ci int32_t dst_stride, 1586cabdff1aSopenharmony_ci const int8_t *filter, 1587cabdff1aSopenharmony_ci int32_t height) 1588cabdff1aSopenharmony_ci{ 1589cabdff1aSopenharmony_ci hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 1590cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 32); 1591cabdff1aSopenharmony_ci} 1592cabdff1aSopenharmony_ci 1593cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr, 1594cabdff1aSopenharmony_ci int32_t src_stride, 1595cabdff1aSopenharmony_ci int16_t *src1_ptr, 1596cabdff1aSopenharmony_ci int32_t src2_stride, 1597cabdff1aSopenharmony_ci uint8_t *dst, 1598cabdff1aSopenharmony_ci int32_t dst_stride, 1599cabdff1aSopenharmony_ci const int8_t *filter, 1600cabdff1aSopenharmony_ci int32_t height) 1601cabdff1aSopenharmony_ci{ 1602cabdff1aSopenharmony_ci hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 1603cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 48); 1604cabdff1aSopenharmony_ci} 1605cabdff1aSopenharmony_ci 1606cabdff1aSopenharmony_cistatic void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr, 1607cabdff1aSopenharmony_ci int32_t src_stride, 1608cabdff1aSopenharmony_ci int16_t *src1_ptr, 1609cabdff1aSopenharmony_ci int32_t src2_stride, 1610cabdff1aSopenharmony_ci uint8_t *dst, 1611cabdff1aSopenharmony_ci int32_t 
dst_stride, 1612cabdff1aSopenharmony_ci const int8_t *filter, 1613cabdff1aSopenharmony_ci int32_t height) 1614cabdff1aSopenharmony_ci{ 1615cabdff1aSopenharmony_ci hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 1616cabdff1aSopenharmony_ci dst, dst_stride, filter, height, 64); 1617cabdff1aSopenharmony_ci} 1618cabdff1aSopenharmony_ci 1619cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr, 1620cabdff1aSopenharmony_ci int32_t src_stride, 1621cabdff1aSopenharmony_ci int16_t *src1_ptr, 1622cabdff1aSopenharmony_ci int32_t src2_stride, 1623cabdff1aSopenharmony_ci uint8_t *dst, 1624cabdff1aSopenharmony_ci int32_t dst_stride, 1625cabdff1aSopenharmony_ci const int8_t *filter_x, 1626cabdff1aSopenharmony_ci const int8_t *filter_y, 1627cabdff1aSopenharmony_ci int32_t height) 1628cabdff1aSopenharmony_ci{ 1629cabdff1aSopenharmony_ci uint32_t loop_cnt; 1630cabdff1aSopenharmony_ci uint64_t tp0, tp1; 1631cabdff1aSopenharmony_ci v16u8 out; 1632cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1633cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }; 1634cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1635cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1636cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1637cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1638cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1639cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1640cabdff1aSopenharmony_ci v8i16 out0, out1; 1641cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1642cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109; 1643cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3; 1644cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1645cabdff1aSopenharmony_ci 1646cabdff1aSopenharmony_ci src0_ptr -= ((3 * 
src_stride) + 3); 1647cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1648cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1649cabdff1aSopenharmony_ci 1650cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1651cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1652cabdff1aSopenharmony_ci 1653cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1654cabdff1aSopenharmony_ci 1655cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1656cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1657cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1658cabdff1aSopenharmony_ci 1659cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1660cabdff1aSopenharmony_ci const_vec <<= 6; 1661cabdff1aSopenharmony_ci 1662cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 1663cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 1664cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1665cabdff1aSopenharmony_ci 1666cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1667cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1668cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1669cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1670cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1671cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1672cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1673cabdff1aSopenharmony_ci 1674cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1675cabdff1aSopenharmony_ci filt3); 1676cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1677cabdff1aSopenharmony_ci filt3); 1678cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 
1679cabdff1aSopenharmony_ci filt3); 1680cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 1681cabdff1aSopenharmony_ci filt3); 1682cabdff1aSopenharmony_ci 1683cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 1684cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 1685cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 1686cabdff1aSopenharmony_ci 1687cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1688cabdff1aSopenharmony_ci 1689cabdff1aSopenharmony_ci for (loop_cnt = height >> 2; loop_cnt--;) { 1690cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 1691cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 1692cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 1693cabdff1aSopenharmony_ci 1694cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 1695cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 1696cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 1697cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 1698cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 1699cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 1700cabdff1aSopenharmony_ci 1701cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3, 1702cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1703cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3, 1704cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1705cabdff1aSopenharmony_ci dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1706cabdff1aSopenharmony_ci filt3); 1707cabdff1aSopenharmony_ci dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1708cabdff1aSopenharmony_ci filt3); 1709cabdff1aSopenharmony_ci 1710cabdff1aSopenharmony_ci dst76 = __msa_ilvr_h(dst97, dst66); 1711cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87, dst109); 1712cabdff1aSopenharmony_ci dst66 = (v8i16) 
__msa_splati_d((v2i64) dst97, 1); 1713cabdff1aSopenharmony_ci dst98 = __msa_ilvr_h(dst66, dst108); 1714cabdff1aSopenharmony_ci 1715cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1, 1716cabdff1aSopenharmony_ci filt_h2, filt_h3); 1717cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1, 1718cabdff1aSopenharmony_ci filt_h2, filt_h3); 1719cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1, 1720cabdff1aSopenharmony_ci filt_h2, filt_h3); 1721cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1, 1722cabdff1aSopenharmony_ci filt_h2, filt_h3); 1723cabdff1aSopenharmony_ci 1724cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 1725cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1); 1726cabdff1aSopenharmony_ci ADDS_SH2_SH(out0, in0, out1, in1, out0, out1); 1727cabdff1aSopenharmony_ci ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1); 1728cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 7); 1729cabdff1aSopenharmony_ci CLIP_SH2_0_255(out0, out1); 1730cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0); 1731cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1732cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1733cabdff1aSopenharmony_ci 1734cabdff1aSopenharmony_ci dst10 = dst54; 1735cabdff1aSopenharmony_ci dst32 = dst76; 1736cabdff1aSopenharmony_ci dst54 = dst98; 1737cabdff1aSopenharmony_ci dst21 = dst65; 1738cabdff1aSopenharmony_ci dst43 = dst87; 1739cabdff1aSopenharmony_ci dst65 = dst109; 1740cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 1741cabdff1aSopenharmony_ci } 1742cabdff1aSopenharmony_ci} 1743cabdff1aSopenharmony_ci 1744cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr, 1745cabdff1aSopenharmony_ci int32_t src_stride, 1746cabdff1aSopenharmony_ci int16_t *src1_ptr, 
1747cabdff1aSopenharmony_ci int32_t src2_stride, 1748cabdff1aSopenharmony_ci uint8_t *dst, 1749cabdff1aSopenharmony_ci int32_t dst_stride, 1750cabdff1aSopenharmony_ci const int8_t *filter_x, 1751cabdff1aSopenharmony_ci const int8_t *filter_y, 1752cabdff1aSopenharmony_ci int32_t height, int32_t width) 1753cabdff1aSopenharmony_ci{ 1754cabdff1aSopenharmony_ci uint32_t loop_cnt; 1755cabdff1aSopenharmony_ci uint32_t cnt; 1756cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 1757cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 1758cabdff1aSopenharmony_ci uint8_t *dst_tmp; 1759cabdff1aSopenharmony_ci v16u8 out; 1760cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 1761cabdff1aSopenharmony_ci v8i16 in0, tmp; 1762cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3; 1763cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1764cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 1765cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 1766cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 1767cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1768cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1769cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1770cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l; 1771cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 1772cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 1773cabdff1aSopenharmony_ci 1774cabdff1aSopenharmony_ci src0_ptr -= ((3 * src_stride) + 3); 1775cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1776cabdff1aSopenharmony_ci const_vec <<= 6; 1777cabdff1aSopenharmony_ci 1778cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1779cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1780cabdff1aSopenharmony_ci 1781cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1782cabdff1aSopenharmony_ci 
UNPCK_R_SB_SH(filter_vec, filter_vec); 1783cabdff1aSopenharmony_ci 1784cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1785cabdff1aSopenharmony_ci 1786cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1787cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1788cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1789cabdff1aSopenharmony_ci 1790cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 1791cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 1792cabdff1aSopenharmony_ci dst_tmp = dst; 1793cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 1794cabdff1aSopenharmony_ci 1795cabdff1aSopenharmony_ci LD_SB7(src0_ptr_tmp, src_stride, 1796cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6); 1797cabdff1aSopenharmony_ci src0_ptr_tmp += (7 * src_stride); 1798cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1799cabdff1aSopenharmony_ci 1800cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1801cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1802cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1803cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1804cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1805cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1806cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1807cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1808cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 1809cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1810cabdff1aSopenharmony_ci filt3); 1811cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1812cabdff1aSopenharmony_ci filt3); 1813cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1814cabdff1aSopenharmony_ci filt3); 1815cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 
1816cabdff1aSopenharmony_ci filt2, filt3); 1817cabdff1aSopenharmony_ci 1818cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1819cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1820cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 1821cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1822cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 1823cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 1824cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1825cabdff1aSopenharmony_ci filt3); 1826cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1827cabdff1aSopenharmony_ci filt3); 1828cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1829cabdff1aSopenharmony_ci filt3); 1830cabdff1aSopenharmony_ci 1831cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 1832cabdff1aSopenharmony_ci src7 = LD_SB(src0_ptr_tmp); 1833cabdff1aSopenharmony_ci src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 1834cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride; 1835cabdff1aSopenharmony_ci 1836cabdff1aSopenharmony_ci in0 = LD_SH(src1_ptr_tmp); 1837cabdff1aSopenharmony_ci src1_ptr_tmp += src2_stride; 1838cabdff1aSopenharmony_ci 1839cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 1840cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1841cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1842cabdff1aSopenharmony_ci filt2, filt3); 1843cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 1844cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 1845cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 1846cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1847cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1848cabdff1aSopenharmony_ci filt_h0, 
filt_h1, filt_h2, filt_h3); 1849cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 1850cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2, filt_h3); 1851cabdff1aSopenharmony_ci dst0_r >>= 6; 1852cabdff1aSopenharmony_ci dst0_l >>= 6; 1853cabdff1aSopenharmony_ci 1854cabdff1aSopenharmony_ci tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 1855cabdff1aSopenharmony_ci ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp); 1856cabdff1aSopenharmony_ci tmp = __msa_srari_h(tmp, 7); 1857cabdff1aSopenharmony_ci CLIP_SH_0_255(tmp); 1858cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); 1859cabdff1aSopenharmony_ci ST_D1(out, 0, dst_tmp); 1860cabdff1aSopenharmony_ci dst_tmp += dst_stride; 1861cabdff1aSopenharmony_ci 1862cabdff1aSopenharmony_ci dst0 = dst1; 1863cabdff1aSopenharmony_ci dst1 = dst2; 1864cabdff1aSopenharmony_ci dst2 = dst3; 1865cabdff1aSopenharmony_ci dst3 = dst4; 1866cabdff1aSopenharmony_ci dst4 = dst5; 1867cabdff1aSopenharmony_ci dst5 = dst6; 1868cabdff1aSopenharmony_ci dst6 = dst7; 1869cabdff1aSopenharmony_ci } 1870cabdff1aSopenharmony_ci 1871cabdff1aSopenharmony_ci src0_ptr += 8; 1872cabdff1aSopenharmony_ci dst += 8; 1873cabdff1aSopenharmony_ci src1_ptr += 8; 1874cabdff1aSopenharmony_ci } 1875cabdff1aSopenharmony_ci} 1876cabdff1aSopenharmony_ci 1877cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr, 1878cabdff1aSopenharmony_ci int32_t src_stride, 1879cabdff1aSopenharmony_ci int16_t *src1_ptr, 1880cabdff1aSopenharmony_ci int32_t src2_stride, 1881cabdff1aSopenharmony_ci uint8_t *dst, 1882cabdff1aSopenharmony_ci int32_t dst_stride, 1883cabdff1aSopenharmony_ci const int8_t *filter_x, 1884cabdff1aSopenharmony_ci const int8_t *filter_y, 1885cabdff1aSopenharmony_ci int32_t height) 1886cabdff1aSopenharmony_ci{ 1887cabdff1aSopenharmony_ci hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 1888cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 
1889cabdff1aSopenharmony_ci height, 8); 1890cabdff1aSopenharmony_ci} 1891cabdff1aSopenharmony_ci 1892cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr, 1893cabdff1aSopenharmony_ci int32_t src_stride, 1894cabdff1aSopenharmony_ci int16_t *src1_ptr, 1895cabdff1aSopenharmony_ci int32_t src2_stride, 1896cabdff1aSopenharmony_ci uint8_t *dst, 1897cabdff1aSopenharmony_ci int32_t dst_stride, 1898cabdff1aSopenharmony_ci const int8_t *filter_x, 1899cabdff1aSopenharmony_ci const int8_t *filter_y, 1900cabdff1aSopenharmony_ci int32_t height) 1901cabdff1aSopenharmony_ci{ 1902cabdff1aSopenharmony_ci uint32_t loop_cnt; 1903cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp, *dst_tmp; 1904cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 1905cabdff1aSopenharmony_ci uint64_t tp0, tp1; 1906cabdff1aSopenharmony_ci v16u8 out; 1907cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1908cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1909cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1910cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1911cabdff1aSopenharmony_ci v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec; 1912cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 1913cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1914cabdff1aSopenharmony_ci v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1915cabdff1aSopenharmony_ci v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109; 1916cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 1917cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 1918cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3; 1919cabdff1aSopenharmony_ci 1920cabdff1aSopenharmony_ci src0_ptr -= ((3 * src_stride) + 3); 1921cabdff1aSopenharmony_ci 
1922cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 1923cabdff1aSopenharmony_ci const_vec <<= 6; 1924cabdff1aSopenharmony_ci 1925cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 1926cabdff1aSopenharmony_ci SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1927cabdff1aSopenharmony_ci 1928cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 1929cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 1930cabdff1aSopenharmony_ci 1931cabdff1aSopenharmony_ci SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1932cabdff1aSopenharmony_ci 1933cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 1934cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1935cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1936cabdff1aSopenharmony_ci mask3 = mask0 + 6; 1937cabdff1aSopenharmony_ci 1938cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 1939cabdff1aSopenharmony_ci dst_tmp = dst; 1940cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 1941cabdff1aSopenharmony_ci 1942cabdff1aSopenharmony_ci LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, 1943cabdff1aSopenharmony_ci src6); 1944cabdff1aSopenharmony_ci src0_ptr_tmp += (7 * src_stride); 1945cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1946cabdff1aSopenharmony_ci 1947cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 1948cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 1949cabdff1aSopenharmony_ci vec3); 1950cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, 1951cabdff1aSopenharmony_ci vec7); 1952cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1953cabdff1aSopenharmony_ci vec11); 1954cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 1955cabdff1aSopenharmony_ci vec15); 1956cabdff1aSopenharmony_ci dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 
1957cabdff1aSopenharmony_ci filt3); 1958cabdff1aSopenharmony_ci dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1959cabdff1aSopenharmony_ci filt3); 1960cabdff1aSopenharmony_ci dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1961cabdff1aSopenharmony_ci filt3); 1962cabdff1aSopenharmony_ci dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, 1963cabdff1aSopenharmony_ci filt2, filt3); 1964cabdff1aSopenharmony_ci VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 1965cabdff1aSopenharmony_ci vec3); 1966cabdff1aSopenharmony_ci VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, 1967cabdff1aSopenharmony_ci vec7); 1968cabdff1aSopenharmony_ci VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1969cabdff1aSopenharmony_ci vec11); 1970cabdff1aSopenharmony_ci dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 1971cabdff1aSopenharmony_ci filt3); 1972cabdff1aSopenharmony_ci dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 1973cabdff1aSopenharmony_ci filt3); 1974cabdff1aSopenharmony_ci dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 1975cabdff1aSopenharmony_ci filt3); 1976cabdff1aSopenharmony_ci 1977cabdff1aSopenharmony_ci for (loop_cnt = 16; loop_cnt--;) { 1978cabdff1aSopenharmony_ci src7 = LD_SB(src0_ptr_tmp); 1979cabdff1aSopenharmony_ci src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 1980cabdff1aSopenharmony_ci src0_ptr_tmp += src_stride; 1981cabdff1aSopenharmony_ci 1982cabdff1aSopenharmony_ci in0 = LD_SH(src1_ptr_tmp); 1983cabdff1aSopenharmony_ci src1_ptr_tmp += src2_stride; 1984cabdff1aSopenharmony_ci 1985cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 1986cabdff1aSopenharmony_ci vec3); 1987cabdff1aSopenharmony_ci dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, 1988cabdff1aSopenharmony_ci filt2, filt3); 1989cabdff1aSopenharmony_ci 
ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 1990cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 1991cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 1992cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1993cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1994cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1995cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0, 1996cabdff1aSopenharmony_ci filt_h1, filt_h2, filt_h3); 1997cabdff1aSopenharmony_ci dst0_r >>= 6; 1998cabdff1aSopenharmony_ci dst0_l >>= 6; 1999cabdff1aSopenharmony_ci 2000cabdff1aSopenharmony_ci tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 2001cabdff1aSopenharmony_ci ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp); 2002cabdff1aSopenharmony_ci tmp = __msa_srari_h(tmp, 7); 2003cabdff1aSopenharmony_ci CLIP_SH_0_255(tmp); 2004cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); 2005cabdff1aSopenharmony_ci ST_D1(out, 0, dst_tmp); 2006cabdff1aSopenharmony_ci dst_tmp += dst_stride; 2007cabdff1aSopenharmony_ci 2008cabdff1aSopenharmony_ci dst0 = dst1; 2009cabdff1aSopenharmony_ci dst1 = dst2; 2010cabdff1aSopenharmony_ci dst2 = dst3; 2011cabdff1aSopenharmony_ci dst3 = dst4; 2012cabdff1aSopenharmony_ci dst4 = dst5; 2013cabdff1aSopenharmony_ci dst5 = dst6; 2014cabdff1aSopenharmony_ci dst6 = dst7; 2015cabdff1aSopenharmony_ci } 2016cabdff1aSopenharmony_ci 2017cabdff1aSopenharmony_ci src0_ptr += 8; 2018cabdff1aSopenharmony_ci dst += 8; 2019cabdff1aSopenharmony_ci src1_ptr += 8; 2020cabdff1aSopenharmony_ci 2021cabdff1aSopenharmony_ci mask4 = LD_SB(ff_hevc_mask_arr + 16); 2022cabdff1aSopenharmony_ci mask5 = mask4 + 2; 2023cabdff1aSopenharmony_ci mask6 = mask4 + 4; 2024cabdff1aSopenharmony_ci mask7 = mask4 + 6; 2025cabdff1aSopenharmony_ci 2026cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 
2027cabdff1aSopenharmony_ci src0_ptr += (7 * src_stride); 2028cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 2029cabdff1aSopenharmony_ci 2030cabdff1aSopenharmony_ci /* row 0 row 1 row 2 row 3 */ 2031cabdff1aSopenharmony_ci VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 2032cabdff1aSopenharmony_ci VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 2033cabdff1aSopenharmony_ci VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, 2034cabdff1aSopenharmony_ci vec8, vec9, vec10, vec11); 2035cabdff1aSopenharmony_ci VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, 2036cabdff1aSopenharmony_ci vec12, vec13, vec14, vec15); 2037cabdff1aSopenharmony_ci dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2038cabdff1aSopenharmony_ci filt3); 2039cabdff1aSopenharmony_ci dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2040cabdff1aSopenharmony_ci filt3); 2041cabdff1aSopenharmony_ci dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, 2042cabdff1aSopenharmony_ci filt3); 2043cabdff1aSopenharmony_ci dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, 2044cabdff1aSopenharmony_ci filt3); 2045cabdff1aSopenharmony_ci 2046cabdff1aSopenharmony_ci ILVRL_H2_SH(dst41, dst30, dst10, dst43); 2047cabdff1aSopenharmony_ci ILVRL_H2_SH(dst52, dst41, dst21, dst54); 2048cabdff1aSopenharmony_ci ILVRL_H2_SH(dst63, dst52, dst32, dst65); 2049cabdff1aSopenharmony_ci 2050cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 2051cabdff1aSopenharmony_ci 2052cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2053cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); 2054cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2055cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 2056cabdff1aSopenharmony_ci 2057cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 
2058cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 2059cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 2060cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 2061cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 2062cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 2063cabdff1aSopenharmony_ci 2064cabdff1aSopenharmony_ci VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 2065cabdff1aSopenharmony_ci vec3); 2066cabdff1aSopenharmony_ci VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 2067cabdff1aSopenharmony_ci vec7); 2068cabdff1aSopenharmony_ci dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, 2069cabdff1aSopenharmony_ci filt3); 2070cabdff1aSopenharmony_ci dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, 2071cabdff1aSopenharmony_ci filt3); 2072cabdff1aSopenharmony_ci 2073cabdff1aSopenharmony_ci dst76 = __msa_ilvr_h(dst97, dst66); 2074cabdff1aSopenharmony_ci ILVRL_H2_SH(dst108, dst97, dst87, dst109); 2075cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 2076cabdff1aSopenharmony_ci dst98 = __msa_ilvr_h(dst66, dst108); 2077cabdff1aSopenharmony_ci 2078cabdff1aSopenharmony_ci tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1, 2079cabdff1aSopenharmony_ci filt_h2, filt_h3); 2080cabdff1aSopenharmony_ci tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1, 2081cabdff1aSopenharmony_ci filt_h2, filt_h3); 2082cabdff1aSopenharmony_ci tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1, 2083cabdff1aSopenharmony_ci filt_h2, filt_h3); 2084cabdff1aSopenharmony_ci tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1, 2085cabdff1aSopenharmony_ci filt_h2, filt_h3); 2086cabdff1aSopenharmony_ci SRA_4V(tmp0, tmp1, tmp2, tmp3, 6); 2087cabdff1aSopenharmony_ci PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1); 2088cabdff1aSopenharmony_ci ADDS_SH2_SH(out0, in0, out1, in1, out0, out1); 
2089cabdff1aSopenharmony_ci ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1); 2090cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 7); 2091cabdff1aSopenharmony_ci CLIP_SH2_0_255(out0, out1); 2092cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0); 2093cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 2094cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2095cabdff1aSopenharmony_ci 2096cabdff1aSopenharmony_ci dst10 = dst54; 2097cabdff1aSopenharmony_ci dst32 = dst76; 2098cabdff1aSopenharmony_ci dst54 = dst98; 2099cabdff1aSopenharmony_ci dst21 = dst65; 2100cabdff1aSopenharmony_ci dst43 = dst87; 2101cabdff1aSopenharmony_ci dst65 = dst109; 2102cabdff1aSopenharmony_ci dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 2103cabdff1aSopenharmony_ci } 2104cabdff1aSopenharmony_ci} 2105cabdff1aSopenharmony_ci 2106cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr, 2107cabdff1aSopenharmony_ci int32_t src_stride, 2108cabdff1aSopenharmony_ci int16_t *src1_ptr, 2109cabdff1aSopenharmony_ci int32_t src2_stride, 2110cabdff1aSopenharmony_ci uint8_t *dst, 2111cabdff1aSopenharmony_ci int32_t dst_stride, 2112cabdff1aSopenharmony_ci const int8_t *filter_x, 2113cabdff1aSopenharmony_ci const int8_t *filter_y, 2114cabdff1aSopenharmony_ci int32_t height) 2115cabdff1aSopenharmony_ci{ 2116cabdff1aSopenharmony_ci hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2117cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2118cabdff1aSopenharmony_ci height, 16); 2119cabdff1aSopenharmony_ci} 2120cabdff1aSopenharmony_ci 2121cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr, 2122cabdff1aSopenharmony_ci int32_t src_stride, 2123cabdff1aSopenharmony_ci int16_t *src1_ptr, 2124cabdff1aSopenharmony_ci int32_t src2_stride, 2125cabdff1aSopenharmony_ci uint8_t *dst, 2126cabdff1aSopenharmony_ci int32_t dst_stride, 2127cabdff1aSopenharmony_ci const int8_t *filter_x, 
2128cabdff1aSopenharmony_ci const int8_t *filter_y, 2129cabdff1aSopenharmony_ci int32_t height) 2130cabdff1aSopenharmony_ci{ 2131cabdff1aSopenharmony_ci hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2132cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2133cabdff1aSopenharmony_ci height, 24); 2134cabdff1aSopenharmony_ci} 2135cabdff1aSopenharmony_ci 2136cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr, 2137cabdff1aSopenharmony_ci int32_t src_stride, 2138cabdff1aSopenharmony_ci int16_t *src1_ptr, 2139cabdff1aSopenharmony_ci int32_t src2_stride, 2140cabdff1aSopenharmony_ci uint8_t *dst, 2141cabdff1aSopenharmony_ci int32_t dst_stride, 2142cabdff1aSopenharmony_ci const int8_t *filter_x, 2143cabdff1aSopenharmony_ci const int8_t *filter_y, 2144cabdff1aSopenharmony_ci int32_t height) 2145cabdff1aSopenharmony_ci{ 2146cabdff1aSopenharmony_ci hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2147cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2148cabdff1aSopenharmony_ci height, 32); 2149cabdff1aSopenharmony_ci} 2150cabdff1aSopenharmony_ci 2151cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr, 2152cabdff1aSopenharmony_ci int32_t src_stride, 2153cabdff1aSopenharmony_ci int16_t *src1_ptr, 2154cabdff1aSopenharmony_ci int32_t src2_stride, 2155cabdff1aSopenharmony_ci uint8_t *dst, 2156cabdff1aSopenharmony_ci int32_t dst_stride, 2157cabdff1aSopenharmony_ci const int8_t *filter_x, 2158cabdff1aSopenharmony_ci const int8_t *filter_y, 2159cabdff1aSopenharmony_ci int32_t height) 2160cabdff1aSopenharmony_ci{ 2161cabdff1aSopenharmony_ci hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2162cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2163cabdff1aSopenharmony_ci height, 48); 2164cabdff1aSopenharmony_ci} 2165cabdff1aSopenharmony_ci 2166cabdff1aSopenharmony_cistatic void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr, 
2167cabdff1aSopenharmony_ci int32_t src_stride, 2168cabdff1aSopenharmony_ci int16_t *src1_ptr, 2169cabdff1aSopenharmony_ci int32_t src2_stride, 2170cabdff1aSopenharmony_ci uint8_t *dst, 2171cabdff1aSopenharmony_ci int32_t dst_stride, 2172cabdff1aSopenharmony_ci const int8_t *filter_x, 2173cabdff1aSopenharmony_ci const int8_t *filter_y, 2174cabdff1aSopenharmony_ci int32_t height) 2175cabdff1aSopenharmony_ci{ 2176cabdff1aSopenharmony_ci hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2177cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2178cabdff1aSopenharmony_ci height, 64); 2179cabdff1aSopenharmony_ci} 2180cabdff1aSopenharmony_ci 2181cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr, 2182cabdff1aSopenharmony_ci int32_t src_stride, 2183cabdff1aSopenharmony_ci int16_t *src1_ptr, 2184cabdff1aSopenharmony_ci int32_t src2_stride, 2185cabdff1aSopenharmony_ci uint8_t *dst, 2186cabdff1aSopenharmony_ci int32_t dst_stride, 2187cabdff1aSopenharmony_ci const int8_t *filter, 2188cabdff1aSopenharmony_ci int32_t height) 2189cabdff1aSopenharmony_ci{ 2190cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2191cabdff1aSopenharmony_ci v16i8 src0, src1, dst0, vec0, vec1; 2192cabdff1aSopenharmony_ci v8i16 in0, in1; 2193cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2194cabdff1aSopenharmony_ci v16i8 mask1; 2195cabdff1aSopenharmony_ci v8i16 tmp0; 2196cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2197cabdff1aSopenharmony_ci 2198cabdff1aSopenharmony_ci src0_ptr -= 1; 2199cabdff1aSopenharmony_ci 2200cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2201cabdff1aSopenharmony_ci const_vec <<= 6; 2202cabdff1aSopenharmony_ci 2203cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2204cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2205cabdff1aSopenharmony_ci 2206cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2207cabdff1aSopenharmony_ci 2208cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, 
src0, src1); 2209cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 2210cabdff1aSopenharmony_ci in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); 2211cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 2212cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 2213cabdff1aSopenharmony_ci tmp0 = const_vec; 2214cabdff1aSopenharmony_ci DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0); 2215cabdff1aSopenharmony_ci 2216cabdff1aSopenharmony_ci tmp0 = __msa_adds_s_h(tmp0, in0); 2217cabdff1aSopenharmony_ci tmp0 = __msa_srari_h(tmp0, 7); 2218cabdff1aSopenharmony_ci CLIP_SH_0_255(tmp0); 2219cabdff1aSopenharmony_ci dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); 2220cabdff1aSopenharmony_ci 2221cabdff1aSopenharmony_ci ST_W2(dst0, 0, 1, dst, dst_stride); 2222cabdff1aSopenharmony_ci} 2223cabdff1aSopenharmony_ci 2224cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr, 2225cabdff1aSopenharmony_ci int32_t src_stride, 2226cabdff1aSopenharmony_ci int16_t *src1_ptr, 2227cabdff1aSopenharmony_ci int32_t src2_stride, 2228cabdff1aSopenharmony_ci uint8_t *dst, 2229cabdff1aSopenharmony_ci int32_t dst_stride, 2230cabdff1aSopenharmony_ci const int8_t *filter, 2231cabdff1aSopenharmony_ci int32_t height) 2232cabdff1aSopenharmony_ci{ 2233cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2234cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, dst0, vec0, vec1; 2235cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 2236cabdff1aSopenharmony_ci v16i8 vec2, vec3; 2237cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2238cabdff1aSopenharmony_ci v16i8 mask1; 2239cabdff1aSopenharmony_ci v8i16 tmp0, tmp1; 2240cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2241cabdff1aSopenharmony_ci 2242cabdff1aSopenharmony_ci src0_ptr -= 1; 2243cabdff1aSopenharmony_ci 2244cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2245cabdff1aSopenharmony_ci const_vec <<= 6; 2246cabdff1aSopenharmony_ci 2247cabdff1aSopenharmony_ci 
filter_vec = LD_SH(filter); 2248cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2249cabdff1aSopenharmony_ci 2250cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2251cabdff1aSopenharmony_ci 2252cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2253cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2254cabdff1aSopenharmony_ci 2255cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 2256cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2257cabdff1aSopenharmony_ci 2258cabdff1aSopenharmony_ci tmp0 = const_vec; 2259cabdff1aSopenharmony_ci tmp1 = const_vec; 2260cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 2261cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 2262cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1, 2263cabdff1aSopenharmony_ci tmp0, tmp1); 2264cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1); 2265cabdff1aSopenharmony_ci dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 2266cabdff1aSopenharmony_ci 2267cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); 2268cabdff1aSopenharmony_ci} 2269cabdff1aSopenharmony_ci 2270cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, 2271cabdff1aSopenharmony_ci int32_t src_stride, 2272cabdff1aSopenharmony_ci int16_t *src1_ptr, 2273cabdff1aSopenharmony_ci int32_t src2_stride, 2274cabdff1aSopenharmony_ci uint8_t *dst, 2275cabdff1aSopenharmony_ci int32_t dst_stride, 2276cabdff1aSopenharmony_ci const int8_t *filter, 2277cabdff1aSopenharmony_ci int32_t height) 2278cabdff1aSopenharmony_ci{ 2279cabdff1aSopenharmony_ci uint32_t loop_cnt; 2280cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2281cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2282cabdff1aSopenharmony_ci v16i8 dst0, dst1; 2283cabdff1aSopenharmony_ci v8i16 
in0, in1, in2, in3, in4, in5, in6, in7; 2284cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]); 2285cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1, vec2, vec3; 2286cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 2287cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2288cabdff1aSopenharmony_ci 2289cabdff1aSopenharmony_ci src0_ptr -= 1; 2290cabdff1aSopenharmony_ci 2291cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2292cabdff1aSopenharmony_ci const_vec <<= 6; 2293cabdff1aSopenharmony_ci 2294cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2295cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2296cabdff1aSopenharmony_ci 2297cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2298cabdff1aSopenharmony_ci 2299cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 2300cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, 2301cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6, src7); 2302cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 2303cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2304cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2305cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7); 2306cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2307cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, in0, in1); 2308cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in2, in3); 2309cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2310cabdff1aSopenharmony_ci 2311cabdff1aSopenharmony_ci tmp0 = const_vec; 2312cabdff1aSopenharmony_ci tmp1 = const_vec; 2313cabdff1aSopenharmony_ci tmp2 = const_vec; 2314cabdff1aSopenharmony_ci tmp3 = const_vec; 2315cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 2316cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3); 2317cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, 
filt0, filt0, tmp0, 2318cabdff1aSopenharmony_ci tmp1, tmp2, tmp3); 2319cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1); 2320cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3); 2321cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0, 2322cabdff1aSopenharmony_ci tmp1, tmp2, tmp3); 2323cabdff1aSopenharmony_ci 2324cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2325cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3); 2326cabdff1aSopenharmony_ci 2327cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); 2328cabdff1aSopenharmony_ci ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 2329cabdff1aSopenharmony_ci dst += (8 * dst_stride); 2330cabdff1aSopenharmony_ci } 2331cabdff1aSopenharmony_ci} 2332cabdff1aSopenharmony_ci 2333cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr, 2334cabdff1aSopenharmony_ci int32_t src_stride, 2335cabdff1aSopenharmony_ci int16_t *src1_ptr, 2336cabdff1aSopenharmony_ci int32_t src2_stride, 2337cabdff1aSopenharmony_ci uint8_t *dst, 2338cabdff1aSopenharmony_ci int32_t dst_stride, 2339cabdff1aSopenharmony_ci const int8_t *filter, 2340cabdff1aSopenharmony_ci int32_t height) 2341cabdff1aSopenharmony_ci{ 2342cabdff1aSopenharmony_ci if (2 == height) { 2343cabdff1aSopenharmony_ci hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2344cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 2345cabdff1aSopenharmony_ci } else if (4 == height) { 2346cabdff1aSopenharmony_ci hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2347cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 2348cabdff1aSopenharmony_ci } else if (8 == height || 16 == height) { 2349cabdff1aSopenharmony_ci hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride, 2350cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2351cabdff1aSopenharmony_ci dst, dst_stride, 
filter, height); 2352cabdff1aSopenharmony_ci } 2353cabdff1aSopenharmony_ci} 2354cabdff1aSopenharmony_ci 2355cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr, 2356cabdff1aSopenharmony_ci int32_t src_stride, 2357cabdff1aSopenharmony_ci int16_t *src1_ptr, 2358cabdff1aSopenharmony_ci int32_t src2_stride, 2359cabdff1aSopenharmony_ci uint8_t *dst, 2360cabdff1aSopenharmony_ci int32_t dst_stride, 2361cabdff1aSopenharmony_ci const int8_t *filter, 2362cabdff1aSopenharmony_ci int32_t height) 2363cabdff1aSopenharmony_ci{ 2364cabdff1aSopenharmony_ci uint32_t loop_cnt; 2365cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2366cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2367cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 2368cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2369cabdff1aSopenharmony_ci v16i8 mask1; 2370cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 2371cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2372cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2373cabdff1aSopenharmony_ci 2374cabdff1aSopenharmony_ci src0_ptr -= 1; 2375cabdff1aSopenharmony_ci 2376cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2377cabdff1aSopenharmony_ci const_vec <<= 6; 2378cabdff1aSopenharmony_ci 2379cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2380cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2381cabdff1aSopenharmony_ci 2382cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2383cabdff1aSopenharmony_ci 2384cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2385cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2386cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2387cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2388cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2389cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2390cabdff1aSopenharmony_ci 2391cabdff1aSopenharmony_ci dst0 = const_vec; 
2392cabdff1aSopenharmony_ci dst1 = const_vec; 2393cabdff1aSopenharmony_ci dst2 = const_vec; 2394cabdff1aSopenharmony_ci dst3 = const_vec; 2395cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 2396cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 2397cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2398cabdff1aSopenharmony_ci dst1, dst2, dst3); 2399cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 2400cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 2401cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2402cabdff1aSopenharmony_ci dst1, dst2, dst3); 2403cabdff1aSopenharmony_ci 2404cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2405cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2406cabdff1aSopenharmony_ci 2407cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2408cabdff1aSopenharmony_ci ST_W2(dst0, 0, 2, dst, dst_stride); 2409cabdff1aSopenharmony_ci ST_H2(dst0, 2, 6, dst + 4, dst_stride); 2410cabdff1aSopenharmony_ci ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride); 2411cabdff1aSopenharmony_ci ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); 2412cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2413cabdff1aSopenharmony_ci } 2414cabdff1aSopenharmony_ci} 2415cabdff1aSopenharmony_ci 2416cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr, 2417cabdff1aSopenharmony_ci int32_t src_stride, 2418cabdff1aSopenharmony_ci int16_t *src1_ptr, 2419cabdff1aSopenharmony_ci int32_t src2_stride, 2420cabdff1aSopenharmony_ci uint8_t *dst, 2421cabdff1aSopenharmony_ci int32_t dst_stride, 2422cabdff1aSopenharmony_ci const int8_t *filter, 2423cabdff1aSopenharmony_ci int32_t height) 2424cabdff1aSopenharmony_ci{ 2425cabdff1aSopenharmony_ci v8i16 filt0, filt1; 
2426cabdff1aSopenharmony_ci v16i8 src0, src1; 2427cabdff1aSopenharmony_ci v8i16 in0, in1; 2428cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2429cabdff1aSopenharmony_ci v16i8 mask1, vec0, vec1, vec2, vec3; 2430cabdff1aSopenharmony_ci v8i16 dst0, dst1; 2431cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2432cabdff1aSopenharmony_ci 2433cabdff1aSopenharmony_ci src0_ptr -= 1; 2434cabdff1aSopenharmony_ci 2435cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2436cabdff1aSopenharmony_ci const_vec <<= 6; 2437cabdff1aSopenharmony_ci 2438cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2439cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2440cabdff1aSopenharmony_ci 2441cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2442cabdff1aSopenharmony_ci 2443cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src0, src1); 2444cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 2445cabdff1aSopenharmony_ci XORI_B2_128_SB(src0, src1); 2446cabdff1aSopenharmony_ci 2447cabdff1aSopenharmony_ci dst0 = const_vec; 2448cabdff1aSopenharmony_ci dst1 = const_vec; 2449cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 2450cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3); 2451cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1, 2452cabdff1aSopenharmony_ci dst0, dst1); 2453cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); 2454cabdff1aSopenharmony_ci 2455cabdff1aSopenharmony_ci dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); 2456cabdff1aSopenharmony_ci ST_D2(dst0, 0, 1, dst, dst_stride); 2457cabdff1aSopenharmony_ci} 2458cabdff1aSopenharmony_ci 2459cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr, 2460cabdff1aSopenharmony_ci int32_t src_stride, 2461cabdff1aSopenharmony_ci int16_t *src1_ptr, 2462cabdff1aSopenharmony_ci int32_t src2_stride, 
2463cabdff1aSopenharmony_ci uint8_t *dst, 2464cabdff1aSopenharmony_ci int32_t dst_stride, 2465cabdff1aSopenharmony_ci const int8_t *filter, 2466cabdff1aSopenharmony_ci int32_t height) 2467cabdff1aSopenharmony_ci{ 2468cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2469cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 2470cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 2471cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2472cabdff1aSopenharmony_ci v16i8 mask1; 2473cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 2474cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2475cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2476cabdff1aSopenharmony_ci 2477cabdff1aSopenharmony_ci src0_ptr -= 1; 2478cabdff1aSopenharmony_ci 2479cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2480cabdff1aSopenharmony_ci const_vec <<= 6; 2481cabdff1aSopenharmony_ci 2482cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2483cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2484cabdff1aSopenharmony_ci 2485cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2486cabdff1aSopenharmony_ci 2487cabdff1aSopenharmony_ci LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5); 2488cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2489cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2490cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in4, in5); 2491cabdff1aSopenharmony_ci XORI_B6_128_SB(src0, src1, src2, src3, src4, src5); 2492cabdff1aSopenharmony_ci 2493cabdff1aSopenharmony_ci dst0 = const_vec; 2494cabdff1aSopenharmony_ci dst1 = const_vec; 2495cabdff1aSopenharmony_ci dst2 = const_vec; 2496cabdff1aSopenharmony_ci dst3 = const_vec; 2497cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 2498cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 2499cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, 
filt0, filt0, filt0, dst0, dst1, 2500cabdff1aSopenharmony_ci dst2, dst3); 2501cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 2502cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 2503cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1, 2504cabdff1aSopenharmony_ci dst2, dst3); 2505cabdff1aSopenharmony_ci dst4 = const_vec; 2506cabdff1aSopenharmony_ci dst5 = const_vec; 2507cabdff1aSopenharmony_ci 2508cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 2509cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3); 2510cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5, 2511cabdff1aSopenharmony_ci dst4, dst5); 2512cabdff1aSopenharmony_ci 2513cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2514cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2515cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); 2516cabdff1aSopenharmony_ci 2517cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2518cabdff1aSopenharmony_ci dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 2519cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 2520cabdff1aSopenharmony_ci ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride); 2521cabdff1aSopenharmony_ci} 2522cabdff1aSopenharmony_ci 2523cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, 2524cabdff1aSopenharmony_ci int32_t src_stride, 2525cabdff1aSopenharmony_ci int16_t *src1_ptr, 2526cabdff1aSopenharmony_ci int32_t src2_stride, 2527cabdff1aSopenharmony_ci uint8_t *dst, 2528cabdff1aSopenharmony_ci int32_t dst_stride, 2529cabdff1aSopenharmony_ci const int8_t *filter, 2530cabdff1aSopenharmony_ci int32_t height) 2531cabdff1aSopenharmony_ci{ 2532cabdff1aSopenharmony_ci uint32_t 
loop_cnt; 2533cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2534cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2535cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 2536cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2537cabdff1aSopenharmony_ci v16i8 mask1; 2538cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 2539cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2540cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2541cabdff1aSopenharmony_ci 2542cabdff1aSopenharmony_ci src0_ptr -= 1; 2543cabdff1aSopenharmony_ci 2544cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2545cabdff1aSopenharmony_ci const_vec <<= 6; 2546cabdff1aSopenharmony_ci 2547cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2548cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2549cabdff1aSopenharmony_ci 2550cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2551cabdff1aSopenharmony_ci 2552cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2553cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2554cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2555cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2556cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2557cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2558cabdff1aSopenharmony_ci 2559cabdff1aSopenharmony_ci dst0 = const_vec; 2560cabdff1aSopenharmony_ci dst1 = const_vec; 2561cabdff1aSopenharmony_ci dst2 = const_vec; 2562cabdff1aSopenharmony_ci dst3 = const_vec; 2563cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 2564cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 2565cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2566cabdff1aSopenharmony_ci dst1, dst2, dst3); 2567cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 2568cabdff1aSopenharmony_ci 
VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 2569cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2570cabdff1aSopenharmony_ci dst1, dst2, dst3); 2571cabdff1aSopenharmony_ci 2572cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2573cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2574cabdff1aSopenharmony_ci 2575cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2576cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 2577cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2578cabdff1aSopenharmony_ci } 2579cabdff1aSopenharmony_ci} 2580cabdff1aSopenharmony_ci 2581cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr, 2582cabdff1aSopenharmony_ci int32_t src_stride, 2583cabdff1aSopenharmony_ci int16_t *src1_ptr, 2584cabdff1aSopenharmony_ci int32_t src2_stride, 2585cabdff1aSopenharmony_ci uint8_t *dst, 2586cabdff1aSopenharmony_ci int32_t dst_stride, 2587cabdff1aSopenharmony_ci const int8_t *filter, 2588cabdff1aSopenharmony_ci int32_t height) 2589cabdff1aSopenharmony_ci{ 2590cabdff1aSopenharmony_ci if (2 == height) { 2591cabdff1aSopenharmony_ci hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2592cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 2593cabdff1aSopenharmony_ci } else if (6 == height) { 2594cabdff1aSopenharmony_ci hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 2595cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 2596cabdff1aSopenharmony_ci } else if (0 == (height % 4)) { 2597cabdff1aSopenharmony_ci hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride, 2598cabdff1aSopenharmony_ci src1_ptr, src2_stride, 2599cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 2600cabdff1aSopenharmony_ci } 2601cabdff1aSopenharmony_ci} 2602cabdff1aSopenharmony_ci 2603cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr, 
2604cabdff1aSopenharmony_ci int32_t src_stride, 2605cabdff1aSopenharmony_ci int16_t *src1_ptr, 2606cabdff1aSopenharmony_ci int32_t src2_stride, 2607cabdff1aSopenharmony_ci uint8_t *dst, 2608cabdff1aSopenharmony_ci int32_t dst_stride, 2609cabdff1aSopenharmony_ci const int8_t *filter, 2610cabdff1aSopenharmony_ci int32_t height) 2611cabdff1aSopenharmony_ci{ 2612cabdff1aSopenharmony_ci uint32_t loop_cnt; 2613cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2614cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 2615cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 2616cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2617cabdff1aSopenharmony_ci v16i8 mask2 = { 2618cabdff1aSopenharmony_ci 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 2619cabdff1aSopenharmony_ci }; 2620cabdff1aSopenharmony_ci v16i8 mask1, mask3; 2621cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 2622cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2623cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2624cabdff1aSopenharmony_ci 2625cabdff1aSopenharmony_ci src0_ptr -= 1; 2626cabdff1aSopenharmony_ci 2627cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2628cabdff1aSopenharmony_ci const_vec <<= 6; 2629cabdff1aSopenharmony_ci 2630cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2631cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2632cabdff1aSopenharmony_ci 2633cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2634cabdff1aSopenharmony_ci mask3 = mask2 + 2; 2635cabdff1aSopenharmony_ci 2636cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2637cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3); 2638cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2639cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 2640cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); 2641cabdff1aSopenharmony_ci src1_ptr += (4 * 
src2_stride); 2642cabdff1aSopenharmony_ci 2643cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 2644cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2645cabdff1aSopenharmony_ci 2646cabdff1aSopenharmony_ci dst0 = const_vec; 2647cabdff1aSopenharmony_ci dst1 = const_vec; 2648cabdff1aSopenharmony_ci dst2 = const_vec; 2649cabdff1aSopenharmony_ci dst3 = const_vec; 2650cabdff1aSopenharmony_ci dst4 = const_vec; 2651cabdff1aSopenharmony_ci dst5 = const_vec; 2652cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 2653cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 2654cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 2655cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2656cabdff1aSopenharmony_ci dst1, dst2, dst3); 2657cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5); 2658cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 2659cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 2660cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5); 2661cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2662cabdff1aSopenharmony_ci dst1, dst2, dst3); 2663cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5); 2664cabdff1aSopenharmony_ci 2665cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2666cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2667cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5); 2668cabdff1aSopenharmony_ci 2669cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2670cabdff1aSopenharmony_ci dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 2671cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, 
dst_stride); 2672cabdff1aSopenharmony_ci ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride); 2673cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2674cabdff1aSopenharmony_ci } 2675cabdff1aSopenharmony_ci} 2676cabdff1aSopenharmony_ci 2677cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr, 2678cabdff1aSopenharmony_ci int32_t src_stride, 2679cabdff1aSopenharmony_ci int16_t *src1_ptr, 2680cabdff1aSopenharmony_ci int32_t src2_stride, 2681cabdff1aSopenharmony_ci uint8_t *dst, 2682cabdff1aSopenharmony_ci int32_t dst_stride, 2683cabdff1aSopenharmony_ci const int8_t *filter, 2684cabdff1aSopenharmony_ci int32_t height) 2685cabdff1aSopenharmony_ci{ 2686cabdff1aSopenharmony_ci uint32_t loop_cnt; 2687cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3; 2688cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3; 2689cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2690cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2691cabdff1aSopenharmony_ci v16i8 mask1; 2692cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2693cabdff1aSopenharmony_ci 2694cabdff1aSopenharmony_ci src0_ptr -= 1; 2695cabdff1aSopenharmony_ci 2696cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2697cabdff1aSopenharmony_ci const_vec <<= 6; 2698cabdff1aSopenharmony_ci 2699cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2700cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2701cabdff1aSopenharmony_ci 2702cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2703cabdff1aSopenharmony_ci 2704cabdff1aSopenharmony_ci for (loop_cnt = (height >> 1); loop_cnt--;) { 2705cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src0, src2); 2706cabdff1aSopenharmony_ci LD_SB2(src0_ptr + 8, src_stride, src1, src3); 2707cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 2708cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in2); 2709cabdff1aSopenharmony_ci LD_SH2(src1_ptr + 8, src2_stride, in1, in3); 2710cabdff1aSopenharmony_ci 
src1_ptr += (2 * src2_stride); 2711cabdff1aSopenharmony_ci 2712cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2713cabdff1aSopenharmony_ci 2714cabdff1aSopenharmony_ci dst0 = const_vec; 2715cabdff1aSopenharmony_ci dst1 = const_vec; 2716cabdff1aSopenharmony_ci dst2 = const_vec; 2717cabdff1aSopenharmony_ci dst3 = const_vec; 2718cabdff1aSopenharmony_ci 2719cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 2720cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 2721cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2722cabdff1aSopenharmony_ci dst1, dst2, dst3); 2723cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 2724cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 2725cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2726cabdff1aSopenharmony_ci dst1, dst2, dst3); 2727cabdff1aSopenharmony_ci 2728cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2729cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2730cabdff1aSopenharmony_ci 2731cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2732cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, dst_stride); 2733cabdff1aSopenharmony_ci dst += (2 * dst_stride); 2734cabdff1aSopenharmony_ci } 2735cabdff1aSopenharmony_ci} 2736cabdff1aSopenharmony_ci 2737cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr, 2738cabdff1aSopenharmony_ci int32_t src_stride, 2739cabdff1aSopenharmony_ci int16_t *src1_ptr, 2740cabdff1aSopenharmony_ci int32_t src2_stride, 2741cabdff1aSopenharmony_ci uint8_t *dst, 2742cabdff1aSopenharmony_ci int32_t dst_stride, 2743cabdff1aSopenharmony_ci const int8_t *filter, 2744cabdff1aSopenharmony_ci int32_t height) 2745cabdff1aSopenharmony_ci{ 2746cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 
2747cabdff1aSopenharmony_ci uint8_t *dst_tmp; 2748cabdff1aSopenharmony_ci uint32_t loop_cnt; 2749cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2750cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 2751cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2752cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2753cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 2754cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 2755cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2756cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2757cabdff1aSopenharmony_ci 2758cabdff1aSopenharmony_ci src0_ptr -= 1; 2759cabdff1aSopenharmony_ci 2760cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2761cabdff1aSopenharmony_ci const_vec <<= 6; 2762cabdff1aSopenharmony_ci 2763cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2764cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2765cabdff1aSopenharmony_ci 2766cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2767cabdff1aSopenharmony_ci mask2 = mask0 + 8; 2768cabdff1aSopenharmony_ci mask3 = mask0 + 10; 2769cabdff1aSopenharmony_ci 2770cabdff1aSopenharmony_ci dst_tmp = dst + 16; 2771cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr + 16; 2772cabdff1aSopenharmony_ci 2773cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2774cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6); 2775cabdff1aSopenharmony_ci LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7); 2776cabdff1aSopenharmony_ci src0_ptr += (4 * src_stride); 2777cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6); 2778cabdff1aSopenharmony_ci LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7); 2779cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 2780cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2781cabdff1aSopenharmony_ci 2782cabdff1aSopenharmony_ci dst0 = const_vec; 
2783cabdff1aSopenharmony_ci dst1 = const_vec; 2784cabdff1aSopenharmony_ci dst2 = const_vec; 2785cabdff1aSopenharmony_ci dst3 = const_vec; 2786cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1); 2787cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3); 2788cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2789cabdff1aSopenharmony_ci dst1, dst2, dst3); 2790cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1); 2791cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3); 2792cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2793cabdff1aSopenharmony_ci dst1, dst2, dst3); 2794cabdff1aSopenharmony_ci 2795cabdff1aSopenharmony_ci dst4 = const_vec; 2796cabdff1aSopenharmony_ci dst5 = const_vec; 2797cabdff1aSopenharmony_ci dst6 = const_vec; 2798cabdff1aSopenharmony_ci dst7 = const_vec; 2799cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1); 2800cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3); 2801cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4, 2802cabdff1aSopenharmony_ci dst5, dst6, dst7); 2803cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1); 2804cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3); 2805cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4, 2806cabdff1aSopenharmony_ci dst5, dst6, dst7); 2807cabdff1aSopenharmony_ci 2808cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2809cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2810cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in4, in5, in6, in7, 2811cabdff1aSopenharmony_ci dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7); 
2812cabdff1aSopenharmony_ci 2813cabdff1aSopenharmony_ci PCKEV_B4_SH(dst1, dst0, dst3, dst2, 2814cabdff1aSopenharmony_ci dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3); 2815cabdff1aSopenharmony_ci ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 2816cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2817cabdff1aSopenharmony_ci 2818cabdff1aSopenharmony_ci LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); 2819cabdff1aSopenharmony_ci src1_ptr_tmp += (4 * src2_stride); 2820cabdff1aSopenharmony_ci 2821cabdff1aSopenharmony_ci dst0 = const_vec; 2822cabdff1aSopenharmony_ci dst1 = const_vec; 2823cabdff1aSopenharmony_ci dst2 = const_vec; 2824cabdff1aSopenharmony_ci dst3 = const_vec; 2825cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1); 2826cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3); 2827cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2828cabdff1aSopenharmony_ci dst1, dst2, dst3); 2829cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1); 2830cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3); 2831cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2832cabdff1aSopenharmony_ci dst1, dst2, dst3); 2833cabdff1aSopenharmony_ci 2834cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2835cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2836cabdff1aSopenharmony_ci 2837cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2838cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride); 2839cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 2840cabdff1aSopenharmony_ci } 2841cabdff1aSopenharmony_ci} 2842cabdff1aSopenharmony_ci 2843cabdff1aSopenharmony_cistatic void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr, 2844cabdff1aSopenharmony_ci int32_t src_stride, 2845cabdff1aSopenharmony_ci 
int16_t *src1_ptr, 2846cabdff1aSopenharmony_ci int32_t src2_stride, 2847cabdff1aSopenharmony_ci uint8_t *dst, 2848cabdff1aSopenharmony_ci int32_t dst_stride, 2849cabdff1aSopenharmony_ci const int8_t *filter, 2850cabdff1aSopenharmony_ci int32_t height) 2851cabdff1aSopenharmony_ci{ 2852cabdff1aSopenharmony_ci uint32_t loop_cnt; 2853cabdff1aSopenharmony_ci v16i8 src0, src1, src2; 2854cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 2855cabdff1aSopenharmony_ci v8i16 filt0, filt1; 2856cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]); 2857cabdff1aSopenharmony_ci v16i8 mask1, mask2, mask3; 2858cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3; 2859cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3; 2860cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 2861cabdff1aSopenharmony_ci 2862cabdff1aSopenharmony_ci src0_ptr -= 1; 2863cabdff1aSopenharmony_ci 2864cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 2865cabdff1aSopenharmony_ci const_vec <<= 6; 2866cabdff1aSopenharmony_ci 2867cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 2868cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2869cabdff1aSopenharmony_ci 2870cabdff1aSopenharmony_ci mask1 = mask0 + 2; 2871cabdff1aSopenharmony_ci mask2 = mask0 + 8; 2872cabdff1aSopenharmony_ci mask3 = mask0 + 10; 2873cabdff1aSopenharmony_ci 2874cabdff1aSopenharmony_ci for (loop_cnt = height; loop_cnt--;) { 2875cabdff1aSopenharmony_ci LD_SB2(src0_ptr, 16, src0, src1); 2876cabdff1aSopenharmony_ci src2 = LD_SB(src0_ptr + 24); 2877cabdff1aSopenharmony_ci src0_ptr += src_stride; 2878cabdff1aSopenharmony_ci LD_SH4(src1_ptr, 8, in0, in1, in2, in3); 2879cabdff1aSopenharmony_ci src1_ptr += src2_stride; 2880cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 2881cabdff1aSopenharmony_ci 2882cabdff1aSopenharmony_ci dst0 = const_vec; 2883cabdff1aSopenharmony_ci dst1 = const_vec; 2884cabdff1aSopenharmony_ci dst2 = const_vec; 2885cabdff1aSopenharmony_ci dst3 = const_vec; 
2886cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1); 2887cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3); 2888cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2889cabdff1aSopenharmony_ci dst1, dst2, dst3); 2890cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1); 2891cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3); 2892cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2893cabdff1aSopenharmony_ci dst1, dst2, dst3); 2894cabdff1aSopenharmony_ci 2895cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 2896cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3); 2897cabdff1aSopenharmony_ci 2898cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); 2899cabdff1aSopenharmony_ci ST_SH2(dst0, dst1, dst, 16); 2900cabdff1aSopenharmony_ci dst += dst_stride; 2901cabdff1aSopenharmony_ci } 2902cabdff1aSopenharmony_ci}

/*
 * Vertical 4-tap bi-prediction, 4 pixels wide, exactly 2 output rows.
 *
 * Five source rows are read, starting one row above src0_ptr. Vertically
 * adjacent bytes are interleaved so that one dot-product-accumulate applies
 * a coefficient pair; both output rows share a single vector. The result is
 * saturating-added to two rows of int16 data from src1_ptr, rounded with a
 * shift of 7, clipped to 0..255, packed to bytes and stored as one word per
 * row.
 *
 * src_stride / dst_stride are byte pitches, src2_stride is the src1_ptr
 * pitch in int16_t elements. height is not read.
 */
static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src21_r, src32_r, src43_r;
    v16i8 src2110, src4332;
    v8i16 in0, in1;
    v8i16 dst10;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* halfword 0 and halfword 1 of the filter, each replicated to all lanes */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 128 << 6 in every lane; presumably cancels the XOR-128 bias applied to
     * the source bytes below (assumes the taps sum to 64 - TODO confirm) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* the filter window starts one row above the block */
    src0_ptr -= src_stride;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += 3 * src_stride;
    LD_SB2(src0_ptr, src_stride, src3, src4);

    /* both int16 rows go into one vector */
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);

    /* rows (1,0) and (2,1) interleaved, packed side by side */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    /* rows (3,2) and (4,3) interleaved, packed side by side */
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    /* filter, add the second prediction, round and clip */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst10 = __msa_adds_s_h(dst10, in0);
    dst10 = __msa_srari_h(dst10, 7);
    CLIP_SH_0_255(dst10);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
    ST_W2(dst10, 0, 1, dst, dst_stride);
}

/*
 * Vertical 4-tap bi-prediction, 4 pixels wide, exactly 4 output rows.
 *
 * Same scheme as the 4x2 variant with seven source rows and four int16 rows;
 * two accumulators each hold a pair of output rows. Rounding and clipping are
 * delegated to HEVC_BI_RND_CLIP2 with a shift of 7 (NOTE(review): presumably
 * the same saturating add / rounding shift / 0..255 clip that the 4x2 variant
 * spells out - confirm against the macro definition). height is not read.
 */
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 in0, in1, in2, in3;
    v8i16 dst10, dst32;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* the filter window starts one row above the block */
    src0_ptr -= src_stride;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += 3 * src_stride;
    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);

    /* four int16 rows folded into two vectors */
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    XORI_B2_128_SB(src4332, src6554);

    dst10 = const_vec;
    dst32 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
}

/*
 * Vertical 4-tap bi-prediction, 4 pixels wide, 8 output rows per iteration.
 *
 * Runs height >> 3 iterations. The two interleaved row pairs at the bottom of
 * one iteration are rebuilt into src2110 (and the last row into src2) so they
 * serve as the top of the next iteration's filter window.
 */
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* the filter window starts one row above the block */
    src0_ptr -= src_stride;

    /* prime the window with the first three rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += 3 * src_stride;
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    loop_cnt = height >> 3;
    while (loop_cnt--) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += 6 * src_stride;

        /* eight int16 rows folded into four vectors */
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += 8 * src2_stride;
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);

        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);

        /* output rows 0..5 */
        dst10 = const_vec;
        dst32 = const_vec;
        dst54 = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        /* last two rows of this iteration; src2 / src2110 are overwritten
         * here so they carry over into the next iteration */
        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += 2 * src_stride;
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

        /* output rows 6..7 */
        dst76 = const_vec;
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += 8 * dst_stride;
    }
}

/*
 * 4-pixel-wide vertical 4-tap bi-prediction: picks the kernel by height.
 * Heights 2 and 4 have dedicated kernels; every other height goes to the
 * 8-rows-per-iteration kernel.
 */
static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    switch (height) {
    case 2:
        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
        break;
    case 4:
        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
        break;
    default:
        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
        break;
    }
}

/*
 * Vertical 4-tap bi-prediction, 6 pixels wide.
 *
 * Always produces 8 output rows in two unrolled passes of 4; height is not
 * read. All eleven source rows are loaded up front. Each output row is stored
 * as one word followed by one halfword at byte offset 4.
 */
static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r;
    v8i16 in0, in1, in2, in3;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* the filter window starts one row above the block */
    src0_ptr -= src_stride;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += 3 * src_stride;
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += 2 * src_stride;
    LD_SB2(src0_ptr, src_stride, src5, src6);
    src0_ptr += 2 * src_stride;
    LD_SB2(src0_ptr, src_stride, src7, src8);
    src0_ptr += 2 * src_stride;
    LD_SB2(src0_ptr, src_stride, src9, src10);
    src0_ptr += 2 * src_stride;

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += 4 * src2_stride;

    XORI_B3_128_SB(src0, src1, src2);
    XORI_B2_128_SB(src3, src4);
    XORI_B2_128_SB(src5, src6);
    XORI_B2_128_SB(src7, src8);
    XORI_B2_128_SB(src9, src10);

    /* ---- output rows 0..3 ---- */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    dst0_r = const_vec;
    dst1_r = const_vec;
    dst2_r = const_vec;
    dst3_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += 4 * dst_stride;

    /* ---- output rows 4..7 ---- */
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += 4 * src2_stride;

    /* src32_r / src43_r are reused for row pairs (7,6) and (8,7) */
    ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);

    dst0_r = const_vec;
    dst1_r = const_vec;
    DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    /* only now may src54_r / src65_r be overwritten with (9,8) and (10,9) */
    ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);

    dst2_r = const_vec;
    dst3_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += 4 * dst_stride;
}
3193cabdff1aSopenharmony_ci 3194cabdff1aSopenharmony_cistatic void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr, 3195cabdff1aSopenharmony_ci int32_t src_stride, 3196cabdff1aSopenharmony_ci int16_t *src1_ptr, 3197cabdff1aSopenharmony_ci int32_t src2_stride, 3198cabdff1aSopenharmony_ci uint8_t *dst, 3199cabdff1aSopenharmony_ci int32_t dst_stride, 3200cabdff1aSopenharmony_ci const int8_t *filter, 3201cabdff1aSopenharmony_ci int32_t height) 3202cabdff1aSopenharmony_ci{ 3203cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 3204cabdff1aSopenharmony_ci v8i16 in0, in1, dst0_r, dst1_r; 3205cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3206cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3207cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3208cabdff1aSopenharmony_ci 3209cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3210cabdff1aSopenharmony_ci 3211cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3212cabdff1aSopenharmony_ci const_vec <<= 6; 3213cabdff1aSopenharmony_ci 3214cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3215cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3216cabdff1aSopenharmony_ci 3217cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3218cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3219cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3220cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3221cabdff1aSopenharmony_ci 3222cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3223cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 3224cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3225cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3226cabdff1aSopenharmony_ci 3227cabdff1aSopenharmony_ci dst0_r = const_vec; 3228cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3229cabdff1aSopenharmony_ci dst1_r = const_vec; 3230cabdff1aSopenharmony_ci 
DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3231cabdff1aSopenharmony_ci 3232cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r); 3233cabdff1aSopenharmony_ci dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); 3234cabdff1aSopenharmony_ci 3235cabdff1aSopenharmony_ci ST_D2(dst0_r, 0, 1, dst, dst_stride); 3236cabdff1aSopenharmony_ci} 3237cabdff1aSopenharmony_ci 3238cabdff1aSopenharmony_cistatic void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr, 3239cabdff1aSopenharmony_ci int32_t src_stride, 3240cabdff1aSopenharmony_ci int16_t *src1_ptr, 3241cabdff1aSopenharmony_ci int32_t src2_stride, 3242cabdff1aSopenharmony_ci uint8_t *dst, 3243cabdff1aSopenharmony_ci int32_t dst_stride, 3244cabdff1aSopenharmony_ci const int8_t *filter, 3245cabdff1aSopenharmony_ci int32_t height) 3246cabdff1aSopenharmony_ci{ 3247cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3248cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 3249cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r; 3250cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src65_r, src87_r; 3251cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r; 3252cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3253cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3254cabdff1aSopenharmony_ci 3255cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3256cabdff1aSopenharmony_ci 3257cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3258cabdff1aSopenharmony_ci const_vec <<= 6; 3259cabdff1aSopenharmony_ci 3260cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3261cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3262cabdff1aSopenharmony_ci 3263cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3264cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3265cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3266cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, 
src10_r, src21_r); 3267cabdff1aSopenharmony_ci 3268cabdff1aSopenharmony_ci LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8); 3269cabdff1aSopenharmony_ci LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 3270cabdff1aSopenharmony_ci XORI_B6_128_SB(src3, src4, src5, src6, src7, src8); 3271cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 3272cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 3273cabdff1aSopenharmony_ci ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3274cabdff1aSopenharmony_ci 3275cabdff1aSopenharmony_ci dst0_r = const_vec; 3276cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3277cabdff1aSopenharmony_ci dst1_r = const_vec; 3278cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3279cabdff1aSopenharmony_ci dst2_r = const_vec; 3280cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r); 3281cabdff1aSopenharmony_ci dst3_r = const_vec; 3282cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r); 3283cabdff1aSopenharmony_ci dst4_r = const_vec; 3284cabdff1aSopenharmony_ci DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r); 3285cabdff1aSopenharmony_ci dst5_r = const_vec; 3286cabdff1aSopenharmony_ci DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r); 3287cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3288cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r, 7, 3289cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 3290cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r); 3291cabdff1aSopenharmony_ci 3292cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 3293cabdff1aSopenharmony_ci dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r); 3294cabdff1aSopenharmony_ci ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride); 3295cabdff1aSopenharmony_ci 
ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride); 3296cabdff1aSopenharmony_ci} 3297cabdff1aSopenharmony_ci 3298cabdff1aSopenharmony_cistatic void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, 3299cabdff1aSopenharmony_ci int32_t src_stride, 3300cabdff1aSopenharmony_ci int16_t *src1_ptr, 3301cabdff1aSopenharmony_ci int32_t src2_stride, 3302cabdff1aSopenharmony_ci uint8_t *dst, 3303cabdff1aSopenharmony_ci int32_t dst_stride, 3304cabdff1aSopenharmony_ci const int8_t *filter, 3305cabdff1aSopenharmony_ci int32_t height) 3306cabdff1aSopenharmony_ci{ 3307cabdff1aSopenharmony_ci int32_t loop_cnt; 3308cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 3309cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 3310cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3311cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3312cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3313cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3314cabdff1aSopenharmony_ci 3315cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3316cabdff1aSopenharmony_ci 3317cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3318cabdff1aSopenharmony_ci const_vec <<= 6; 3319cabdff1aSopenharmony_ci 3320cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3321cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3322cabdff1aSopenharmony_ci 3323cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3324cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3325cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3326cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3327cabdff1aSopenharmony_ci 3328cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3329cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3330cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3331cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3332cabdff1aSopenharmony_ci src1_ptr += (4 * 
src2_stride); 3333cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3334cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3335cabdff1aSopenharmony_ci 3336cabdff1aSopenharmony_ci dst0_r = const_vec; 3337cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3338cabdff1aSopenharmony_ci dst1_r = const_vec; 3339cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3340cabdff1aSopenharmony_ci 3341cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src5, src2); 3342cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3343cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 3344cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3345cabdff1aSopenharmony_ci 3346cabdff1aSopenharmony_ci dst2_r = const_vec; 3347cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); 3348cabdff1aSopenharmony_ci dst3_r = const_vec; 3349cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); 3350cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3351cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r, 7, 3352cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 3353cabdff1aSopenharmony_ci 3354cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 3355cabdff1aSopenharmony_ci ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride); 3356cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3357cabdff1aSopenharmony_ci } 3358cabdff1aSopenharmony_ci} 3359cabdff1aSopenharmony_ci 3360cabdff1aSopenharmony_cistatic void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr, 3361cabdff1aSopenharmony_ci int32_t src_stride, 3362cabdff1aSopenharmony_ci int16_t *src1_ptr, 3363cabdff1aSopenharmony_ci int32_t src2_stride, 3364cabdff1aSopenharmony_ci uint8_t *dst, 3365cabdff1aSopenharmony_ci int32_t dst_stride, 3366cabdff1aSopenharmony_ci const int8_t *filter, 3367cabdff1aSopenharmony_ci int32_t height) 
3368cabdff1aSopenharmony_ci{ 3369cabdff1aSopenharmony_ci if (2 == height) { 3370cabdff1aSopenharmony_ci hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3371cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 3372cabdff1aSopenharmony_ci } else if (6 == height) { 3373cabdff1aSopenharmony_ci hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 3374cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 3375cabdff1aSopenharmony_ci } else { 3376cabdff1aSopenharmony_ci hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride, 3377cabdff1aSopenharmony_ci src1_ptr, src2_stride, 3378cabdff1aSopenharmony_ci dst, dst_stride, filter, height); 3379cabdff1aSopenharmony_ci } 3380cabdff1aSopenharmony_ci} 3381cabdff1aSopenharmony_ci 3382cabdff1aSopenharmony_cistatic void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr, 3383cabdff1aSopenharmony_ci int32_t src_stride, 3384cabdff1aSopenharmony_ci int16_t *src1_ptr, 3385cabdff1aSopenharmony_ci int32_t src2_stride, 3386cabdff1aSopenharmony_ci uint8_t *dst, 3387cabdff1aSopenharmony_ci int32_t dst_stride, 3388cabdff1aSopenharmony_ci const int8_t *filter, 3389cabdff1aSopenharmony_ci int32_t height) 3390cabdff1aSopenharmony_ci{ 3391cabdff1aSopenharmony_ci int32_t loop_cnt; 3392cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 3393cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 3394cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r; 3395cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3396cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 3397cabdff1aSopenharmony_ci v16i8 src2110, src4332, src6554; 3398cabdff1aSopenharmony_ci v8i16 dst0_l, dst1_l, filt0, filt1; 3399cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3400cabdff1aSopenharmony_ci 3401cabdff1aSopenharmony_ci src0_ptr -= (1 * src_stride); 3402cabdff1aSopenharmony_ci 3403cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 
3404cabdff1aSopenharmony_ci const_vec <<= 6; 3405cabdff1aSopenharmony_ci 3406cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3407cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3408cabdff1aSopenharmony_ci 3409cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3410cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3411cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3412cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3413cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3414cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); 3415cabdff1aSopenharmony_ci 3416cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3417cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3418cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3419cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src5, src6); 3420cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3421cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 3422cabdff1aSopenharmony_ci LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7); 3423cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 3424cabdff1aSopenharmony_ci ILVR_D2_SH(in5, in4, in7, in6, in4, in5); 3425cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3426cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 3427cabdff1aSopenharmony_ci 3428cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3429cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3430cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); 3431cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 3432cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l); 3433cabdff1aSopenharmony_ci src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); 
3434cabdff1aSopenharmony_ci 3435cabdff1aSopenharmony_ci dst0_r = const_vec; 3436cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3437cabdff1aSopenharmony_ci dst1_r = const_vec; 3438cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3439cabdff1aSopenharmony_ci dst0_l = const_vec; 3440cabdff1aSopenharmony_ci DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l); 3441cabdff1aSopenharmony_ci dst2_r = const_vec; 3442cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r); 3443cabdff1aSopenharmony_ci dst3_r = const_vec; 3444cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r); 3445cabdff1aSopenharmony_ci dst1_l = const_vec; 3446cabdff1aSopenharmony_ci DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l); 3447cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3448cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r, 7, 3449cabdff1aSopenharmony_ci dst0_r, dst1_r, dst2_r, dst3_r); 3450cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l); 3451cabdff1aSopenharmony_ci 3452cabdff1aSopenharmony_ci PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); 3453cabdff1aSopenharmony_ci dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l); 3454cabdff1aSopenharmony_ci ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride); 3455cabdff1aSopenharmony_ci ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride); 3456cabdff1aSopenharmony_ci dst += (4 * dst_stride); 3457cabdff1aSopenharmony_ci 3458cabdff1aSopenharmony_ci src2 = src6; 3459cabdff1aSopenharmony_ci src10_r = src54_r; 3460cabdff1aSopenharmony_ci src21_r = src65_r; 3461cabdff1aSopenharmony_ci src2110 = src6554; 3462cabdff1aSopenharmony_ci } 3463cabdff1aSopenharmony_ci} 3464cabdff1aSopenharmony_ci 3465cabdff1aSopenharmony_cistatic void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr, 3466cabdff1aSopenharmony_ci int32_t src_stride, 
3467cabdff1aSopenharmony_ci int16_t *src1_ptr, 3468cabdff1aSopenharmony_ci int32_t src2_stride, 3469cabdff1aSopenharmony_ci uint8_t *dst, 3470cabdff1aSopenharmony_ci int32_t dst_stride, 3471cabdff1aSopenharmony_ci const int8_t *filter, 3472cabdff1aSopenharmony_ci int32_t height) 3473cabdff1aSopenharmony_ci{ 3474cabdff1aSopenharmony_ci int32_t loop_cnt; 3475cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 3476cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 3477cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r; 3478cabdff1aSopenharmony_ci v16i8 src10_l, src32_l, src21_l, src43_l; 3479cabdff1aSopenharmony_ci v8i16 dst0_r, dst1_r, dst0_l, dst1_l; 3480cabdff1aSopenharmony_ci v8i16 filt0, filt1; 3481cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 3482cabdff1aSopenharmony_ci 3483cabdff1aSopenharmony_ci src0_ptr -= src_stride; 3484cabdff1aSopenharmony_ci 3485cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 3486cabdff1aSopenharmony_ci const_vec <<= 6; 3487cabdff1aSopenharmony_ci 3488cabdff1aSopenharmony_ci filter_vec = LD_SH(filter); 3489cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3490cabdff1aSopenharmony_ci 3491cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 3492cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 3493cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 3494cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3495cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3496cabdff1aSopenharmony_ci 3497cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 3498cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src3, src4); 3499cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3500cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 3501cabdff1aSopenharmony_ci LD_SH2((src1_ptr + 8), src2_stride, in2, in3); 3502cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 
3503cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 3504cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3505cabdff1aSopenharmony_ci ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3506cabdff1aSopenharmony_ci 3507cabdff1aSopenharmony_ci dst0_r = const_vec; 3508cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3509cabdff1aSopenharmony_ci dst1_r = const_vec; 3510cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3511cabdff1aSopenharmony_ci dst0_l = const_vec; 3512cabdff1aSopenharmony_ci DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3513cabdff1aSopenharmony_ci dst1_l = const_vec; 3514cabdff1aSopenharmony_ci DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3515cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3516cabdff1aSopenharmony_ci dst0_r, dst1_r, dst0_l, dst1_l, 7, 3517cabdff1aSopenharmony_ci dst0_r, dst1_r, dst0_l, dst1_l); 3518cabdff1aSopenharmony_ci 3519cabdff1aSopenharmony_ci PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3520cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst1_r, dst, dst_stride); 3521cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3522cabdff1aSopenharmony_ci 3523cabdff1aSopenharmony_ci LD_SB2(src0_ptr, src_stride, src5, src2); 3524cabdff1aSopenharmony_ci src0_ptr += (2 * src_stride); 3525cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 3526cabdff1aSopenharmony_ci LD_SH2((src1_ptr + 8), src2_stride, in2, in3); 3527cabdff1aSopenharmony_ci src1_ptr += (2 * src2_stride); 3528cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src2); 3529cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3530cabdff1aSopenharmony_ci ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3531cabdff1aSopenharmony_ci 3532cabdff1aSopenharmony_ci dst0_r = const_vec; 3533cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 
3534cabdff1aSopenharmony_ci dst0_l = const_vec; 3535cabdff1aSopenharmony_ci DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3536cabdff1aSopenharmony_ci dst1_r = const_vec; 3537cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3538cabdff1aSopenharmony_ci dst1_l = const_vec; 3539cabdff1aSopenharmony_ci DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3540cabdff1aSopenharmony_ci HEVC_BI_RND_CLIP4(in0, in1, in2, in3, 3541cabdff1aSopenharmony_ci dst0_r, dst1_r, dst0_l, dst1_l, 7, 3542cabdff1aSopenharmony_ci dst0_r, dst1_r, dst0_l, dst1_l); 3543cabdff1aSopenharmony_ci 3544cabdff1aSopenharmony_ci PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3545cabdff1aSopenharmony_ci ST_SH2(dst0_r, dst1_r, dst, dst_stride); 3546cabdff1aSopenharmony_ci dst += (2 * dst_stride); 3547cabdff1aSopenharmony_ci } 3548cabdff1aSopenharmony_ci} 3549cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap bi-prediction filter, 24 columns wide.
 *
 * Every row is split into a 16-column group (right and left byte
 * interleaves) and an 8-column group at byte offset 16 (right interleave
 * only).  Three rows, starting one row above src0_ptr, prime the filter
 * window.  Each loop iteration then emits four rows as two 2-row halves,
 * so (height >> 2) iterations run and a height % 4 remainder is not
 * produced.
 *
 * src0_ptr / src_stride   8-bit source, stride in bytes
 * src1_ptr / src2_stride  16-bit second prediction, stride in int16 elements
 * dst / dst_stride        8-bit destination, stride in bytes
 * filter                  tap table; halfwords 0 and 1 are splatted, i.e.
 *                         byte pairs (0,1) and (2,3)
 * height                  number of rows
 *
 * The filtered rows are merged with the src1 samples by
 * HEVC_BI_RND_CLIP2/4 (defined earlier in this file) using 7 as the
 * rounding argument, then packed to bytes.
 */
static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t quads_left = height >> 2;
    v16i8 src0, src1, src2;                     /* columns 0..15 */
    v16i8 src6, src7, src8;                     /* columns 16..23 */
    v16i8 src10_r, src21_r, src10_l, src21_l;
    v16i8 src76_r, src87_r;
    v8i16 taps01, taps23;
    v8i16 coef_vec = LD_SH(filter);
    v8i16 offset_vec = __msa_ldi_h(128);

    /* 128 << 6: start value of every accumulator (presumably undoes the
     * -128 bias introduced by the XORI_*_128 sign flip - TODO confirm) */
    offset_vec <<= 6;
    SPLATI_H2_SH(coef_vec, 0, 1, taps01, taps23);

    /* the window begins one row above the first output row */
    src0_ptr -= src_stride;

    /* prime the window with three rows of both column groups */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    while (quads_left--) {
        v16i8 src3, src4, src5, src9, src10, src11;
        v16i8 src32_r, src43_r, src32_l, src43_l;
        v16i8 src98_r, src109_r;
        v8i16 in0, in1, in2, in3, in4, in5;
        v8i16 acc0_r, acc0_l, acc1_r, acc1_l, acc2_r, acc3_r;

        /* ---- first half: output rows 0 and 1 of this group ---- */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);

        XORI_B2_128_SB(src3, src4);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        acc0_r = offset_vec;
        acc0_l = offset_vec;
        acc1_r = offset_vec;
        acc1_l = offset_vec;
        acc2_r = offset_vec;
        acc3_r = offset_vec;
        /* columns 0..15 */
        DPADD_SB2_SH(src10_r, src32_r, taps01, taps23, acc0_r, acc0_r);
        DPADD_SB2_SH(src10_l, src32_l, taps01, taps23, acc0_l, acc0_l);
        DPADD_SB2_SH(src21_r, src43_r, taps01, taps23, acc1_r, acc1_r);
        DPADD_SB2_SH(src21_l, src43_l, taps01, taps23, acc1_l, acc1_l);
        /* columns 16..23 */
        DPADD_SB2_SH(src76_r, src98_r, taps01, taps23, acc2_r, acc2_r);
        DPADD_SB2_SH(src87_r, src109_r, taps01, taps23, acc3_r, acc3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          acc0_r, acc1_r, acc0_l, acc1_l, 7,
                          acc0_r, acc1_r, acc0_l, acc1_l);
        HEVC_BI_RND_CLIP2(in4, in5, acc2_r, acc3_r, 7, acc2_r, acc3_r);

        PCKEV_B2_SH(acc0_l, acc0_r, acc1_l, acc1_r, acc0_r, acc1_r);
        acc2_r = (v8i16) __msa_pckev_b((v16i8) acc3_r, (v16i8) acc2_r);
        ST_SH2(acc0_r, acc1_r, dst, dst_stride);
        ST_D2(acc2_r, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        /* ---- second half: output rows 2 and 3 of this group ----
         * The new rows land in the "10/21/76/87" registers, which both
         * feeds this half and leaves the window ready for the next
         * iteration. */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);

        XORI_B2_128_SB(src5, src2);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        acc0_r = offset_vec;
        acc0_l = offset_vec;
        acc1_r = offset_vec;
        acc1_l = offset_vec;
        acc2_r = offset_vec;
        acc3_r = offset_vec;
        /* columns 0..15 */
        DPADD_SB2_SH(src32_r, src10_r, taps01, taps23, acc0_r, acc0_r);
        DPADD_SB2_SH(src32_l, src10_l, taps01, taps23, acc0_l, acc0_l);
        DPADD_SB2_SH(src43_r, src21_r, taps01, taps23, acc1_r, acc1_r);
        DPADD_SB2_SH(src43_l, src21_l, taps01, taps23, acc1_l, acc1_l);
        /* columns 16..23 */
        DPADD_SB2_SH(src98_r, src76_r, taps01, taps23, acc2_r, acc2_r);
        DPADD_SB2_SH(src109_r, src87_r, taps01, taps23, acc3_r, acc3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          acc0_r, acc1_r, acc0_l, acc1_l, 7,
                          acc0_r, acc1_r, acc0_l, acc1_l);
        HEVC_BI_RND_CLIP2(in4, in5, acc2_r, acc3_r, 7, acc2_r, acc3_r);

        PCKEV_B2_SH(acc0_l, acc0_r, acc1_l, acc1_r, acc0_r, acc1_r);
        acc2_r = (v8i16) __msa_pckev_b((v16i8) acc3_r, (v16i8) acc2_r);
        ST_SH2(acc0_r, acc1_r, dst, dst_stride);
        ST_D2(acc2_r, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

/*
 * Vertical 4-tap bi-prediction filter, 32 columns wide.
 *
 * The row is handled as two independent 16-column groups: columns 0..15
 * are written through dst, columns 16..31 through a second pointer that
 * starts at dst + 16.  Three rows, starting one row above src0_ptr, prime
 * both windows.  Each iteration emits two rows per group, so
 * (height >> 1) iterations run, and then slides both windows down by
 * two rows.
 *
 * Parameters have the same meaning as in hevc_vt_bi_4t_24w_msa():
 * src_stride and dst_stride are in bytes, src2_stride in int16 elements.
 */
static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t pairs_left = height >> 1;
    uint8_t *dst_hi = dst + 16;
    v16i8 src0, src1, src2;                     /* columns 0..15 */
    v16i8 src6, src7, src8;                     /* columns 16..31 */
    v16i8 src10_r, src21_r, src10_l, src21_l;
    v16i8 src76_r, src87_r, src76_l, src87_l;
    v8i16 taps01, taps23;
    v8i16 coef_vec = LD_SH(filter);
    v8i16 offset_vec = __msa_ldi_h(128);

    /* 128 << 6: start value of every accumulator */
    offset_vec <<= 6;
    SPLATI_H2_SH(coef_vec, 0, 1, taps01, taps23);

    /* the window begins one row above the first output row */
    src0_ptr -= src_stride;

    /* prime both 16-column windows with three rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    while (pairs_left--) {
        v16i8 src3, src4, src9, src10;
        v16i8 src32_r, src43_r, src32_l, src43_l;
        v16i8 src98_r, src109_r, src98_l, src109_l;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
        v8i16 acc0_r, acc1_r, acc0_l, acc1_l;
        v8i16 acc2_r, acc3_r, acc2_l, acc3_l;

        /* ---- columns 0..15 ---- */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        /* second-prediction samples for all 32 columns of both rows */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);

        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        acc0_r = offset_vec;
        acc0_l = offset_vec;
        acc1_r = offset_vec;
        acc1_l = offset_vec;
        DPADD_SB2_SH(src10_r, src32_r, taps01, taps23, acc0_r, acc0_r);
        DPADD_SB2_SH(src10_l, src32_l, taps01, taps23, acc0_l, acc0_l);
        DPADD_SB2_SH(src21_r, src43_r, taps01, taps23, acc1_r, acc1_r);
        DPADD_SB2_SH(src21_l, src43_l, taps01, taps23, acc1_l, acc1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          acc0_r, acc1_r, acc0_l, acc1_l, 7,
                          acc0_r, acc1_r, acc0_l, acc1_l);

        PCKEV_B2_SH(acc0_l, acc0_r, acc1_l, acc1_r, acc0_r, acc1_r);
        ST_SH2(acc0_r, acc1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* slide the left window down by two rows */
        src2 = src4;
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;

        /* ---- columns 16..31 ---- */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        acc2_r = offset_vec;
        acc2_l = offset_vec;
        acc3_r = offset_vec;
        acc3_l = offset_vec;
        DPADD_SB2_SH(src76_r, src98_r, taps01, taps23, acc2_r, acc2_r);
        DPADD_SB2_SH(src76_l, src98_l, taps01, taps23, acc2_l, acc2_l);
        DPADD_SB2_SH(src87_r, src109_r, taps01, taps23, acc3_r, acc3_r);
        DPADD_SB2_SH(src87_l, src109_l, taps01, taps23, acc3_l, acc3_l);

        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          acc2_r, acc3_r, acc2_l, acc3_l, 7,
                          acc2_r, acc3_r, acc2_l, acc3_l);

        PCKEV_B2_SH(acc2_l, acc2_r, acc3_l, acc3_r, acc2_r, acc3_r);
        ST_SH2(acc2_r, acc3_r, dst_hi, dst_stride);
        dst_hi += (2 * dst_stride);

        /* slide the right window down by two rows */
        src8 = src10;
        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
    }
}

/*
 * Separable (horizontal then vertical) 4-tap bi-prediction filter for a
 * 4x2 block.
 *
 * Five source rows are read starting one row above and one column left of
 * src0_ptr.  The shuffle mask is taken from the second row of
 * ff_hevc_mask_arr, so each horizontal pass works on a pair of rows at
 * once (src0/src2, src1/src3, src2/src4); which half of the result holds
 * which row is assumed from the dstXY naming - TODO confirm.
 *
 * The vertical results are shifted right by 6, packed to 16 bit, added
 * (saturating) to the two src1 rows plus the 128 << 6 offset, rounded
 * with a shift of 7, clipped by CLIP_SH_0_255 and stored by ST_W2.
 *
 * src_stride / dst_stride are in bytes, src2_stride in int16 elements.
 */
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    uint64_t pred0, pred1;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1 = mask0 + 2;
    v8i16 coef_x = LD_SH(filter_x);
    v8i16 coef_y = LD_SH(filter_y);
    v8i16 offset_vec = __msa_ldi_h(128);
    v8i16 in0 = { 0 };
    v8i16 htaps01, htaps23;
    v8i16 vtaps01, vtaps23;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v8i16 res;
    v4i32 dst0, dst1;
    v16u8 packed;

    offset_vec <<= 6;

    /* horizontal taps stay 8 bit; vertical taps are widened to 16 bit
     * because they are applied to 16-bit intermediates */
    SPLATI_H2_SH(coef_x, 0, 1, htaps01, htaps23);
    UNPCK_R_SB_SH(coef_y, coef_y);
    SPLATI_W2_SH(coef_y, 0, vtaps01, vtaps23);

    src0_ptr -= (src_stride + 1);
    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    /* two rows of the second prediction, pre-biased by the offset */
    LD2(src1_ptr, src2_stride, pred0, pred1);
    INSERT_D2_SH(pred0, pred1, in0);
    in0 = __msa_adds_s_h(in0, offset_vec);

    /* horizontal pass */
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, htaps01, htaps23);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, htaps01, htaps23);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, htaps01, htaps23);

    /* vertical pass */
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, vtaps01, vtaps23);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, vtaps01, vtaps23);
    dst0 >>= 6;
    dst1 >>= 6;

    /* combine with the second prediction, round, clip and store */
    res = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    res = __msa_adds_s_h(res, in0);
    res = __msa_srari_h(res, 7);
    CLIP_SH_0_255(res);
    packed = (v16u8) __msa_pckev_b((v16i8) res, (v16i8) res);
    ST_W2(packed, 0, 1, dst, dst_stride);
}

/*
 * Separable 4-tap bi-prediction filter for a 4x4 block.
 *
 * Same scheme as hevc_hv_bi_4t_4x2_msa(), but seven source rows are read
 * and paired three rows apart (src0/src3 ... src3/src6), four rows of
 * src1 are consumed, and ST_W4 writes the result.
 *
 * src_stride / dst_stride are in bytes, src2_stride in int16 elements.
 */
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    uint64_t pred0, pred1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1 = mask0 + 2;
    v8i16 coef_x = LD_SH(filter_x);
    v8i16 coef_y = LD_SH(filter_y);
    v8i16 offset_vec = __msa_ldi_h(128);
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 htaps01, htaps23;
    v8i16 vtaps01, vtaps23;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v8i16 res0, res1;
    v4i32 dst0, dst1, dst2, dst3;
    v16u8 packed;

    offset_vec <<= 6;

    SPLATI_H2_SH(coef_x, 0, 1, htaps01, htaps23);
    UNPCK_R_SB_SH(coef_y, coef_y);
    SPLATI_W2_SH(coef_y, 0, vtaps01, vtaps23);

    src0_ptr -= (src_stride + 1);
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* four rows of the second prediction, two per vector, pre-biased */
    LD2(src1_ptr, src2_stride, pred0, pred1);
    src1_ptr += 2 * src2_stride;
    INSERT_D2_SH(pred0, pred1, in0);
    LD2(src1_ptr, src2_stride, pred0, pred1);
    INSERT_D2_SH(pred0, pred1, in1);
    ADDS_SH2_SH(in0, offset_vec, in1, offset_vec, in0, in1);

    /* horizontal pass, two rows per vector */
    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, htaps01, htaps23);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, htaps01, htaps23);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, htaps01, htaps23);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, htaps01, htaps23);

    /* vertical pass */
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, vtaps01, vtaps23);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, vtaps01, vtaps23);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, vtaps01, vtaps23);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, vtaps01, vtaps23);
    SRA_4V(dst0, dst1, dst2, dst3, 6);

    /* combine with the second prediction, round, clip and store */
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, res0, res1);
    ADDS_SH2_SH(res0, in0, res1, in1, res0, res1);
    SRARI_H2_SH(res0, res1, 7);
    CLIP_SH2_0_255(res0, res1);
    packed = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(packed, 0, 1, 2, 3, dst, dst_stride);
}

/*
 * Separable 4-tap bi-prediction filter for 4-column blocks whose height
 * is a multiple of 8.
 *
 * Three rows prime the vertical window.  Each iteration then reads eight
 * more source rows and eight rows of src1, writes eight rows through
 * ST_W8, and carries the last three horizontally filtered rows (dst10_r,
 * dst21_r, dst22) into the next iteration.  (height >> 3) iterations
 * run, so a height % 8 remainder is not produced.
 *
 * src_stride / dst_stride are in bytes, src2_stride in int16 elements.
 */
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height)
{
    uint32_t octets_left = height >> 3;
    v16i8 src0, src1, src2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1 = mask0 + 2;
    v8i16 coef_x = LD_SH(filter_x);
    v8i16 coef_y = LD_SH(filter_y);
    v8i16 offset_vec = __msa_ldi_h(128);
    v8i16 htaps01, htaps23;
    v8i16 vtaps01, vtaps23;
    v8i16 dst10, dst21, dst22;
    v8i16 dst10_r, dst21_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };

    offset_vec <<= 6;

    SPLATI_H2_SH(coef_x, 0, 1, htaps01, htaps23);
    UNPCK_R_SB_SH(coef_y, coef_y);
    SPLATI_W2_SH(coef_y, 0, vtaps01, vtaps23);

    /* prime the vertical window with three horizontally filtered rows */
    src0_ptr -= (src_stride + 1);
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, htaps01, htaps23);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, htaps01, htaps23);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    while (octets_left--) {
        uint64_t pred0, pred1;
        v16i8 src3, src4, src5, src6, src7, src8, src9, src10;
        v8i16 dst73, dst84, dst95, dst106;
        v8i16 dst32_r, dst43_r, dst54_r, dst65_r;
        v8i16 dst76_r, dst87_r, dst98_r, dst109_r;
        v8i16 res0, res1, res2, res3;
        v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
        v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
        v16u8 out0, out1;

        /* horizontal pass over eight new rows, paired four rows apart */
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, htaps01, htaps23);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, htaps01, htaps23);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, htaps01, htaps23);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, htaps01, htaps23);

        /* build the consecutive row pairs used by the vertical pass */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* eight rows of the second prediction, two per vector */
        LD2(src1_ptr, src2_stride, pred0, pred1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(pred0, pred1, in0);
        LD2(src1_ptr, src2_stride, pred0, pred1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(pred0, pred1, in1);
        LD2(src1_ptr, src2_stride, pred0, pred1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(pred0, pred1, in2);
        LD2(src1_ptr, src2_stride, pred0, pred1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(pred0, pred1, in3);
        ADDS_SH4_SH(in0, offset_vec, in1, offset_vec, in2, offset_vec, in3,
                    offset_vec, in0, in1, in2, in3);

        /* vertical pass */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, vtaps01, vtaps23);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, vtaps01, vtaps23);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, vtaps01, vtaps23);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, vtaps01, vtaps23);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, vtaps01, vtaps23);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, vtaps01, vtaps23);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, vtaps01, vtaps23);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, vtaps01, vtaps23);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);

        /* combine with the second prediction, round, clip and store */
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r, res0, res1, res2, res3);
        ADDS_SH4_SH(in0, res0, in1, res1, in2, res2, in3, res3, res0, res1,
                    res2, res3);
        SRARI_H4_SH(res0, res1, res2, res3, 7);
        CLIP_SH4_0_255(res0, res1, res2, res3);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* the last three filtered rows seed the next iteration */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

/*
 * Height dispatcher for the 4-column separable bi-prediction filter.
 *
 * Heights 2 and 4 have dedicated kernels and any multiple of 8 goes to
 * the looping kernel.  Every other height falls through without writing
 * anything.
 */
static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    if (height == 2) {
        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
        return;
    }

    if (height == 4) {
        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
        return;
    }

    if ((height % 8) == 0) {
        hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y, height);
    }
}

4070cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, 4071cabdff1aSopenharmony_ci int32_t src_stride, 4072cabdff1aSopenharmony_ci
int16_t *src1_ptr, 4073cabdff1aSopenharmony_ci int32_t src2_stride, 4074cabdff1aSopenharmony_ci uint8_t *dst, 4075cabdff1aSopenharmony_ci int32_t dst_stride, 4076cabdff1aSopenharmony_ci const int8_t *filter_x, 4077cabdff1aSopenharmony_ci const int8_t *filter_y, 4078cabdff1aSopenharmony_ci int32_t height) 4079cabdff1aSopenharmony_ci{ 4080cabdff1aSopenharmony_ci uint32_t tpw0, tpw1, tpw2, tpw3; 4081cabdff1aSopenharmony_ci uint64_t tp0, tp1; 4082cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 4083cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4084cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4085cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4086cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 4087cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4088cabdff1aSopenharmony_ci v16i8 mask1; 4089cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 4090cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9; 4091cabdff1aSopenharmony_ci v8i16 dsth10, tmp4, tmp5; 4092cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4093cabdff1aSopenharmony_ci v4i32 dst4_r, dst5_r, dst6_r, dst7_r; 4094cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 4095cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 4096cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 4097cabdff1aSopenharmony_ci v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r; 4098cabdff1aSopenharmony_ci v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l; 4099cabdff1aSopenharmony_ci v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l; 4100cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 4101cabdff1aSopenharmony_ci v8i16 in4 = { 0 }, in5 = { 0 }; 4102cabdff1aSopenharmony_ci 4103cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4104cabdff1aSopenharmony_ci 
4105cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4106cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4107cabdff1aSopenharmony_ci 4108cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4109cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4110cabdff1aSopenharmony_ci 4111cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4112cabdff1aSopenharmony_ci 4113cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4114cabdff1aSopenharmony_ci 4115cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 4116cabdff1aSopenharmony_ci const_vec <<= 6; 4117cabdff1aSopenharmony_ci 4118cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4119cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 4120cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4121cabdff1aSopenharmony_ci 4122cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4123cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4124cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4125cabdff1aSopenharmony_ci 4126cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4127cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4128cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4129cabdff1aSopenharmony_ci 4130cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 4131cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 4132cabdff1aSopenharmony_ci 4133cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, 4134cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 4135cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4136cabdff1aSopenharmony_ci 4137cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4138cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, 
mask0, mask1, vec2, vec3); 4139cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4140cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4141cabdff1aSopenharmony_ci 4142cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4143cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4144cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4145cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4146cabdff1aSopenharmony_ci 4147cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 4148cabdff1aSopenharmony_ci VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3); 4149cabdff1aSopenharmony_ci VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5); 4150cabdff1aSopenharmony_ci VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7); 4151cabdff1aSopenharmony_ci 4152cabdff1aSopenharmony_ci dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4153cabdff1aSopenharmony_ci dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4154cabdff1aSopenharmony_ci dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4155cabdff1aSopenharmony_ci dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4156cabdff1aSopenharmony_ci 4157cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 4158cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 4159cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 4160cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 4161cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 4162cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 4163cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l); 4164cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l); 4165cabdff1aSopenharmony_ci PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, 
dst1021_l, dst3243_l); 4166cabdff1aSopenharmony_ci PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l); 4167cabdff1aSopenharmony_ci dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l); 4168cabdff1aSopenharmony_ci 4169cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4170cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4171cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4172cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4173cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4174cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4175cabdff1aSopenharmony_ci dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4176cabdff1aSopenharmony_ci dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4177cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1); 4178cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1); 4179cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1); 4180cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1); 4181cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 4182cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 4183cabdff1aSopenharmony_ci SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); 4184cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); 4185cabdff1aSopenharmony_ci PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3); 4186cabdff1aSopenharmony_ci PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5); 4187cabdff1aSopenharmony_ci 4188cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4189cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 4190cabdff1aSopenharmony_ci LD2(src1_ptr + 2 
* src2_stride, src2_stride, tp0, tp1); 4191cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 4192cabdff1aSopenharmony_ci 4193cabdff1aSopenharmony_ci LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1); 4194cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in2); 4195cabdff1aSopenharmony_ci LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1); 4196cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in3); 4197cabdff1aSopenharmony_ci 4198cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec, 4199cabdff1aSopenharmony_ci in0, in1, in2, in3); 4200cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2, 4201cabdff1aSopenharmony_ci tmp3); 4202cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4203cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4204cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4205cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4206cabdff1aSopenharmony_ci 4207cabdff1aSopenharmony_ci LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3); 4208cabdff1aSopenharmony_ci src1_ptr += (4 * src2_stride); 4209cabdff1aSopenharmony_ci INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4); 4210cabdff1aSopenharmony_ci LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3); 4211cabdff1aSopenharmony_ci INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5); 4212cabdff1aSopenharmony_ci ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5); 4213cabdff1aSopenharmony_ci ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); 4214cabdff1aSopenharmony_ci SRARI_H2_SH(tmp4, tmp5, 7); 4215cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp4, tmp5); 4216cabdff1aSopenharmony_ci out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 4217cabdff1aSopenharmony_ci ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); 4218cabdff1aSopenharmony_ci} 4219cabdff1aSopenharmony_ci 4220cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_8x2_msa(uint8_t 
*src0_ptr, 4221cabdff1aSopenharmony_ci int32_t src_stride, 4222cabdff1aSopenharmony_ci int16_t *src1_ptr, 4223cabdff1aSopenharmony_ci int32_t src2_stride, 4224cabdff1aSopenharmony_ci uint8_t *dst, 4225cabdff1aSopenharmony_ci int32_t dst_stride, 4226cabdff1aSopenharmony_ci const int8_t *filter_x, 4227cabdff1aSopenharmony_ci const int8_t *filter_y) 4228cabdff1aSopenharmony_ci{ 4229cabdff1aSopenharmony_ci v16u8 out; 4230cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 4231cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4232cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 4233cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4234cabdff1aSopenharmony_ci v16i8 mask1; 4235cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 4236cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 4237cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4; 4238cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 4239cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 4240cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 4241cabdff1aSopenharmony_ci v8i16 tmp0, tmp1; 4242cabdff1aSopenharmony_ci v8i16 in0, in1; 4243cabdff1aSopenharmony_ci 4244cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4245cabdff1aSopenharmony_ci 4246cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4247cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4248cabdff1aSopenharmony_ci 4249cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4250cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4251cabdff1aSopenharmony_ci 4252cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4253cabdff1aSopenharmony_ci 4254cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4255cabdff1aSopenharmony_ci 4256cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 4257cabdff1aSopenharmony_ci const_vec <<= 6; 4258cabdff1aSopenharmony_ci 4259cabdff1aSopenharmony_ci LD_SB5(src0_ptr, src_stride, src0, 
src1, src2, src3, src4); 4260cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4261cabdff1aSopenharmony_ci 4262cabdff1aSopenharmony_ci LD_SH2(src1_ptr, src2_stride, in0, in1); 4263cabdff1aSopenharmony_ci ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1); 4264cabdff1aSopenharmony_ci 4265cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4266cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4267cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4268cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 4269cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 4270cabdff1aSopenharmony_ci 4271cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4272cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4273cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4274cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4275cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 4276cabdff1aSopenharmony_ci 4277cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4278cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4279cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4280cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4281cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4282cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4283cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4284cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4285cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4286cabdff1aSopenharmony_ci PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, 
tmp0, tmp1); 4287cabdff1aSopenharmony_ci ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1); 4288cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 7); 4289cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp0, tmp1); 4290cabdff1aSopenharmony_ci out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 4291cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, dst_stride); 4292cabdff1aSopenharmony_ci} 4293cabdff1aSopenharmony_ci 4294cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr, 4295cabdff1aSopenharmony_ci int32_t src_stride, 4296cabdff1aSopenharmony_ci int16_t *src1_ptr, 4297cabdff1aSopenharmony_ci int32_t src2_stride, 4298cabdff1aSopenharmony_ci uint8_t *dst, 4299cabdff1aSopenharmony_ci int32_t dst_stride, 4300cabdff1aSopenharmony_ci const int8_t *filter_x, 4301cabdff1aSopenharmony_ci const int8_t *filter_y, 4302cabdff1aSopenharmony_ci int32_t width8mult) 4303cabdff1aSopenharmony_ci{ 4304cabdff1aSopenharmony_ci uint32_t cnt; 4305cabdff1aSopenharmony_ci v16u8 out0, out1; 4306cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1; 4307cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4308cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec; 4309cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3; 4310cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 4311cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4312cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4313cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4314cabdff1aSopenharmony_ci 4315cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4316cabdff1aSopenharmony_ci 4317cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4318cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4319cabdff1aSopenharmony_ci 4320cabdff1aSopenharmony_ci filter_vec = 
LD_SH(filter_y); 4321cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4322cabdff1aSopenharmony_ci 4323cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4324cabdff1aSopenharmony_ci 4325cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 4326cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4327cabdff1aSopenharmony_ci 4328cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 4329cabdff1aSopenharmony_ci const_vec <<= 6; 4330cabdff1aSopenharmony_ci 4331cabdff1aSopenharmony_ci for (cnt = width8mult; cnt--;) { 4332cabdff1aSopenharmony_ci LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); 4333cabdff1aSopenharmony_ci src0_ptr += 8; 4334cabdff1aSopenharmony_ci XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 4335cabdff1aSopenharmony_ci 4336cabdff1aSopenharmony_ci LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); 4337cabdff1aSopenharmony_ci src1_ptr += 8; 4338cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4339cabdff1aSopenharmony_ci const_vec, in0, in1, in2, in3); 4340cabdff1aSopenharmony_ci 4341cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4342cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4343cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4344cabdff1aSopenharmony_ci 4345cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4346cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4347cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4348cabdff1aSopenharmony_ci 4349cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4350cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4351cabdff1aSopenharmony_ci 4352cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4353cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, 
mask1, vec2, vec3); 4354cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4355cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4356cabdff1aSopenharmony_ci 4357cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4358cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4359cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4360cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4361cabdff1aSopenharmony_ci 4362cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4363cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4364cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4365cabdff1aSopenharmony_ci ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4366cabdff1aSopenharmony_ci 4367cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4368cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4369cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4370cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4371cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4372cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4373cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4374cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4375cabdff1aSopenharmony_ci 4376cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4377cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4378cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4379cabdff1aSopenharmony_ci dst3_r, tmp0, tmp1, tmp2, tmp3); 4380cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, 
in3, tmp3, 4381cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4382cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4383cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4384cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4385cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 4386cabdff1aSopenharmony_ci dst += 8; 4387cabdff1aSopenharmony_ci } 4388cabdff1aSopenharmony_ci} 4389cabdff1aSopenharmony_ci 4390cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr, 4391cabdff1aSopenharmony_ci int32_t src_stride, 4392cabdff1aSopenharmony_ci int16_t *src1_ptr, 4393cabdff1aSopenharmony_ci int32_t src2_stride, 4394cabdff1aSopenharmony_ci uint8_t *dst, 4395cabdff1aSopenharmony_ci int32_t dst_stride, 4396cabdff1aSopenharmony_ci const int8_t *filter_x, 4397cabdff1aSopenharmony_ci const int8_t *filter_y) 4398cabdff1aSopenharmony_ci{ 4399cabdff1aSopenharmony_ci v16u8 out0, out1, out2; 4400cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4401cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5; 4402cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4403cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 4404cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4405cabdff1aSopenharmony_ci v16i8 mask1; 4406cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 4407cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 4408cabdff1aSopenharmony_ci v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 4409cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 4410cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 4411cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4412cabdff1aSopenharmony_ci v4i32 dst4_r, dst4_l, dst5_r, dst5_l; 4413cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst10_l, dst32_l; 4414cabdff1aSopenharmony_ci 
v8i16 dst21_r, dst43_r, dst21_l, dst43_l; 4415cabdff1aSopenharmony_ci v8i16 dst54_r, dst54_l, dst65_r, dst65_l; 4416cabdff1aSopenharmony_ci v8i16 dst76_r, dst76_l, dst87_r, dst87_l; 4417cabdff1aSopenharmony_ci 4418cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4419cabdff1aSopenharmony_ci 4420cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4421cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4422cabdff1aSopenharmony_ci 4423cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4424cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4425cabdff1aSopenharmony_ci 4426cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4427cabdff1aSopenharmony_ci 4428cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4429cabdff1aSopenharmony_ci 4430cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 4431cabdff1aSopenharmony_ci const_vec <<= 6; 4432cabdff1aSopenharmony_ci 4433cabdff1aSopenharmony_ci LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4); 4434cabdff1aSopenharmony_ci src0_ptr += (5 * src_stride); 4435cabdff1aSopenharmony_ci LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8); 4436cabdff1aSopenharmony_ci 4437cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4438cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 4439cabdff1aSopenharmony_ci 4440cabdff1aSopenharmony_ci LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5); 4441cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec, 4442cabdff1aSopenharmony_ci in0, in1, in2, in3); 4443cabdff1aSopenharmony_ci ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5); 4444cabdff1aSopenharmony_ci 4445cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4446cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4447cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4448cabdff1aSopenharmony_ci 
VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 4449cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 4450cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11); 4451cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13); 4452cabdff1aSopenharmony_ci VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15); 4453cabdff1aSopenharmony_ci VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17); 4454cabdff1aSopenharmony_ci 4455cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4456cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4457cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4458cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4459cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1); 4460cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1); 4461cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1); 4462cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1); 4463cabdff1aSopenharmony_ci dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1); 4464cabdff1aSopenharmony_ci 4465cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4466cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4467cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4468cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4469cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4470cabdff1aSopenharmony_ci ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4471cabdff1aSopenharmony_ci ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 4472cabdff1aSopenharmony_ci ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 4473cabdff1aSopenharmony_ci 4474cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 
4475cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4476cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4477cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4478cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4479cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4480cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4481cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4482cabdff1aSopenharmony_ci dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4483cabdff1aSopenharmony_ci dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); 4484cabdff1aSopenharmony_ci dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4485cabdff1aSopenharmony_ci dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); 4486cabdff1aSopenharmony_ci 4487cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4488cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4489cabdff1aSopenharmony_ci SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6); 4490cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r, 4491cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4492cabdff1aSopenharmony_ci PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5); 4493cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4494cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4495cabdff1aSopenharmony_ci ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); 4496cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4497cabdff1aSopenharmony_ci SRARI_H2_SH(tmp4, tmp5, 7); 4498cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4499cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp4, tmp5); 4500cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 
4501cabdff1aSopenharmony_ci out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); 4502cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 4503cabdff1aSopenharmony_ci ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); 4504cabdff1aSopenharmony_ci} 4505cabdff1aSopenharmony_ci 4506cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr, 4507cabdff1aSopenharmony_ci int32_t src_stride, 4508cabdff1aSopenharmony_ci int16_t *src1_ptr, 4509cabdff1aSopenharmony_ci int32_t src2_stride, 4510cabdff1aSopenharmony_ci uint8_t *dst, 4511cabdff1aSopenharmony_ci int32_t dst_stride, 4512cabdff1aSopenharmony_ci const int8_t *filter_x, 4513cabdff1aSopenharmony_ci const int8_t *filter_y, 4514cabdff1aSopenharmony_ci int32_t height, 4515cabdff1aSopenharmony_ci int32_t width) 4516cabdff1aSopenharmony_ci{ 4517cabdff1aSopenharmony_ci uint32_t loop_cnt, cnt; 4518cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp; 4519cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 4520cabdff1aSopenharmony_ci uint8_t *dst_tmp; 4521cabdff1aSopenharmony_ci v16u8 out0, out1; 4522cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 4523cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3; 4524cabdff1aSopenharmony_ci v8i16 filt0, filt1; 4525cabdff1aSopenharmony_ci v8i16 filt_h0, filt_h1; 4526cabdff1aSopenharmony_ci v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 4527cabdff1aSopenharmony_ci v16i8 mask1; 4528cabdff1aSopenharmony_ci v8i16 filter_vec, const_vec; 4529cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4530cabdff1aSopenharmony_ci v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 4531cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4532cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 4533cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 4534cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 4535cabdff1aSopenharmony_ci v8i16 dst54_r, dst54_l, dst65_r, 
dst65_l, dst6; 4536cabdff1aSopenharmony_ci 4537cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4538cabdff1aSopenharmony_ci 4539cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4540cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4541cabdff1aSopenharmony_ci 4542cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4543cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4544cabdff1aSopenharmony_ci 4545cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4546cabdff1aSopenharmony_ci 4547cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4548cabdff1aSopenharmony_ci 4549cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 4550cabdff1aSopenharmony_ci const_vec <<= 6; 4551cabdff1aSopenharmony_ci 4552cabdff1aSopenharmony_ci for (cnt = width >> 3; cnt--;) { 4553cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 4554cabdff1aSopenharmony_ci dst_tmp = dst; 4555cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 4556cabdff1aSopenharmony_ci 4557cabdff1aSopenharmony_ci LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); 4558cabdff1aSopenharmony_ci src0_ptr_tmp += (3 * src_stride); 4559cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4560cabdff1aSopenharmony_ci 4561cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4562cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4563cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4564cabdff1aSopenharmony_ci 4565cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4566cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4567cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4568cabdff1aSopenharmony_ci 4569cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4570cabdff1aSopenharmony_ci ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4571cabdff1aSopenharmony_ci 4572cabdff1aSopenharmony_ci for (loop_cnt 
= height >> 2; loop_cnt--;) { 4573cabdff1aSopenharmony_ci LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); 4574cabdff1aSopenharmony_ci src0_ptr_tmp += (4 * src_stride); 4575cabdff1aSopenharmony_ci LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); 4576cabdff1aSopenharmony_ci src1_ptr_tmp += (4 * src2_stride); 4577cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 4578cabdff1aSopenharmony_ci 4579cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4580cabdff1aSopenharmony_ci const_vec, in0, in1, in2, in3); 4581cabdff1aSopenharmony_ci 4582cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4583cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4584cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4585cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4586cabdff1aSopenharmony_ci 4587cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4588cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4589cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4590cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4591cabdff1aSopenharmony_ci 4592cabdff1aSopenharmony_ci ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4593cabdff1aSopenharmony_ci ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4594cabdff1aSopenharmony_ci ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4595cabdff1aSopenharmony_ci ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4596cabdff1aSopenharmony_ci 4597cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4598cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4599cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4600cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, 
filt_h1); 4601cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4602cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4603cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4604cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4605cabdff1aSopenharmony_ci 4606cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4607cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4608cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4609cabdff1aSopenharmony_ci dst3_r, tmp0, tmp1, tmp2, tmp3); 4610cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4611cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4612cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4613cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4614cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4615cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 4616cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 4617cabdff1aSopenharmony_ci 4618cabdff1aSopenharmony_ci dst10_r = dst54_r; 4619cabdff1aSopenharmony_ci dst10_l = dst54_l; 4620cabdff1aSopenharmony_ci dst21_r = dst65_r; 4621cabdff1aSopenharmony_ci dst21_l = dst65_l; 4622cabdff1aSopenharmony_ci dst2 = dst6; 4623cabdff1aSopenharmony_ci } 4624cabdff1aSopenharmony_ci 4625cabdff1aSopenharmony_ci src0_ptr += 8; 4626cabdff1aSopenharmony_ci dst += 8; 4627cabdff1aSopenharmony_ci src1_ptr += 8; 4628cabdff1aSopenharmony_ci } 4629cabdff1aSopenharmony_ci} 4630cabdff1aSopenharmony_ci 4631cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, 4632cabdff1aSopenharmony_ci int32_t src_stride, 4633cabdff1aSopenharmony_ci int16_t *src1_ptr, 4634cabdff1aSopenharmony_ci int32_t src2_stride, 4635cabdff1aSopenharmony_ci uint8_t *dst, 4636cabdff1aSopenharmony_ci 
int32_t dst_stride, 4637cabdff1aSopenharmony_ci const int8_t *filter_x, 4638cabdff1aSopenharmony_ci const int8_t *filter_y, 4639cabdff1aSopenharmony_ci int32_t height) 4640cabdff1aSopenharmony_ci{ 4641cabdff1aSopenharmony_ci if (2 == height) { 4642cabdff1aSopenharmony_ci hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4643cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y); 4644cabdff1aSopenharmony_ci } else if (4 == height) { 4645cabdff1aSopenharmony_ci hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4646cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 1); 4647cabdff1aSopenharmony_ci } else if (6 == height) { 4648cabdff1aSopenharmony_ci hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4649cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y); 4650cabdff1aSopenharmony_ci } else { 4651cabdff1aSopenharmony_ci hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, 4652cabdff1aSopenharmony_ci src1_ptr, src2_stride, 4653cabdff1aSopenharmony_ci dst, dst_stride, 4654cabdff1aSopenharmony_ci filter_x, filter_y, height, 8); 4655cabdff1aSopenharmony_ci } 4656cabdff1aSopenharmony_ci} 4657cabdff1aSopenharmony_ci 4658cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, 4659cabdff1aSopenharmony_ci int32_t src_stride, 4660cabdff1aSopenharmony_ci int16_t *src1_ptr, 4661cabdff1aSopenharmony_ci int32_t src2_stride, 4662cabdff1aSopenharmony_ci uint8_t *dst, 4663cabdff1aSopenharmony_ci int32_t dst_stride, 4664cabdff1aSopenharmony_ci const int8_t *filter_x, 4665cabdff1aSopenharmony_ci const int8_t *filter_y, 4666cabdff1aSopenharmony_ci int32_t height) 4667cabdff1aSopenharmony_ci{ 4668cabdff1aSopenharmony_ci uint32_t loop_cnt; 4669cabdff1aSopenharmony_ci uint64_t tp0, tp1; 4670cabdff1aSopenharmony_ci uint8_t *src0_ptr_tmp, *dst_tmp; 4671cabdff1aSopenharmony_ci int16_t *src1_ptr_tmp; 4672cabdff1aSopenharmony_ci v16u8 out0, out1; 4673cabdff1aSopenharmony_ci v16i8 src0, src1, src2, 
src3, src4, src5, src6, src7, src8, src9, src10; 4674cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4675cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3; 4676cabdff1aSopenharmony_ci v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3; 4677cabdff1aSopenharmony_ci v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec; 4678cabdff1aSopenharmony_ci v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 4679cabdff1aSopenharmony_ci v8i16 dst76_r, dst98_r, dst87_r, dst109_r; 4680cabdff1aSopenharmony_ci v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 }; 4681cabdff1aSopenharmony_ci v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4682cabdff1aSopenharmony_ci v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4683cabdff1aSopenharmony_ci v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4684cabdff1aSopenharmony_ci v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 4685cabdff1aSopenharmony_ci 4686cabdff1aSopenharmony_ci src0_ptr -= (src_stride + 1); 4687cabdff1aSopenharmony_ci 4688cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_x); 4689cabdff1aSopenharmony_ci SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4690cabdff1aSopenharmony_ci 4691cabdff1aSopenharmony_ci filter_vec = LD_SH(filter_y); 4692cabdff1aSopenharmony_ci UNPCK_R_SB_SH(filter_vec, filter_vec); 4693cabdff1aSopenharmony_ci 4694cabdff1aSopenharmony_ci SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4695cabdff1aSopenharmony_ci 4696cabdff1aSopenharmony_ci mask0 = LD_SB(ff_hevc_mask_arr); 4697cabdff1aSopenharmony_ci mask1 = mask0 + 2; 4698cabdff1aSopenharmony_ci 4699cabdff1aSopenharmony_ci const_vec = __msa_ldi_h(128); 4700cabdff1aSopenharmony_ci const_vec <<= 6; 4701cabdff1aSopenharmony_ci 4702cabdff1aSopenharmony_ci src0_ptr_tmp = src0_ptr; 4703cabdff1aSopenharmony_ci dst_tmp = dst; 4704cabdff1aSopenharmony_ci src1_ptr_tmp = src1_ptr; 4705cabdff1aSopenharmony_ci 4706cabdff1aSopenharmony_ci 
LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2); 4707cabdff1aSopenharmony_ci src0_ptr_tmp += (3 * src_stride); 4708cabdff1aSopenharmony_ci 4709cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4710cabdff1aSopenharmony_ci 4711cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4712cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4713cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4714cabdff1aSopenharmony_ci 4715cabdff1aSopenharmony_ci dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4716cabdff1aSopenharmony_ci dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4717cabdff1aSopenharmony_ci dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4718cabdff1aSopenharmony_ci 4719cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 4720cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 4721cabdff1aSopenharmony_ci 4722cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 4723cabdff1aSopenharmony_ci LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6); 4724cabdff1aSopenharmony_ci src0_ptr_tmp += (4 * src_stride); 4725cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 4726cabdff1aSopenharmony_ci 4727cabdff1aSopenharmony_ci LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3); 4728cabdff1aSopenharmony_ci src1_ptr_tmp += (4 * src2_stride); 4729cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, 4730cabdff1aSopenharmony_ci const_vec, in0, in1, in2, in3); 4731cabdff1aSopenharmony_ci 4732cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4733cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4734cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4735cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4736cabdff1aSopenharmony_ci 
4737cabdff1aSopenharmony_ci dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4738cabdff1aSopenharmony_ci dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4739cabdff1aSopenharmony_ci dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4740cabdff1aSopenharmony_ci dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4741cabdff1aSopenharmony_ci 4742cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 4743cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 4744cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 4745cabdff1aSopenharmony_ci ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 4746cabdff1aSopenharmony_ci 4747cabdff1aSopenharmony_ci dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4748cabdff1aSopenharmony_ci dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4749cabdff1aSopenharmony_ci dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4750cabdff1aSopenharmony_ci dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4751cabdff1aSopenharmony_ci dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4752cabdff1aSopenharmony_ci dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4753cabdff1aSopenharmony_ci dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4754cabdff1aSopenharmony_ci dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4755cabdff1aSopenharmony_ci 4756cabdff1aSopenharmony_ci SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4757cabdff1aSopenharmony_ci SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4758cabdff1aSopenharmony_ci PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4759cabdff1aSopenharmony_ci dst3_r, tmp0, tmp1, tmp2, tmp3); 4760cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4761cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4762cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4763cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 
4764cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4765cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); 4766cabdff1aSopenharmony_ci dst_tmp += (4 * dst_stride); 4767cabdff1aSopenharmony_ci 4768cabdff1aSopenharmony_ci dst10_r = dst54_r; 4769cabdff1aSopenharmony_ci dst10_l = dst54_l; 4770cabdff1aSopenharmony_ci dst21_r = dst65_r; 4771cabdff1aSopenharmony_ci dst21_l = dst65_l; 4772cabdff1aSopenharmony_ci dsth2 = dsth6; 4773cabdff1aSopenharmony_ci } 4774cabdff1aSopenharmony_ci 4775cabdff1aSopenharmony_ci src0_ptr += 8; 4776cabdff1aSopenharmony_ci dst += 8; 4777cabdff1aSopenharmony_ci src1_ptr += 8; 4778cabdff1aSopenharmony_ci 4779cabdff1aSopenharmony_ci mask2 = LD_SB(ff_hevc_mask_arr + 16); 4780cabdff1aSopenharmony_ci mask3 = mask2 + 2; 4781cabdff1aSopenharmony_ci 4782cabdff1aSopenharmony_ci LD_SB3(src0_ptr, src_stride, src0, src1, src2); 4783cabdff1aSopenharmony_ci src0_ptr += (3 * src_stride); 4784cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 4785cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 4786cabdff1aSopenharmony_ci VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3); 4787cabdff1aSopenharmony_ci 4788cabdff1aSopenharmony_ci dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4789cabdff1aSopenharmony_ci dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4790cabdff1aSopenharmony_ci 4791cabdff1aSopenharmony_ci ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 4792cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 4793cabdff1aSopenharmony_ci 4794cabdff1aSopenharmony_ci for (loop_cnt = 2; loop_cnt--;) { 4795cabdff1aSopenharmony_ci LD_SB8(src0_ptr, src_stride, 4796cabdff1aSopenharmony_ci src3, src4, src5, src6, src7, src8, src9, src10); 4797cabdff1aSopenharmony_ci src0_ptr += (8 * src_stride); 4798cabdff1aSopenharmony_ci XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4799cabdff1aSopenharmony_ci VSHF_B2_SB(src3, src7, 
src3, src7, mask2, mask3, vec0, vec1); 4800cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3); 4801cabdff1aSopenharmony_ci VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5); 4802cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7); 4803cabdff1aSopenharmony_ci 4804cabdff1aSopenharmony_ci dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1); 4805cabdff1aSopenharmony_ci dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1); 4806cabdff1aSopenharmony_ci dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1); 4807cabdff1aSopenharmony_ci dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1); 4808cabdff1aSopenharmony_ci 4809cabdff1aSopenharmony_ci dst32_r = __msa_ilvr_h(dst73, dst22); 4810cabdff1aSopenharmony_ci ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 4811cabdff1aSopenharmony_ci ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4812cabdff1aSopenharmony_ci ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4813cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 4814cabdff1aSopenharmony_ci dst76_r = __msa_ilvr_h(dst22, dst106); 4815cabdff1aSopenharmony_ci 4816cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4817cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4818cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in0); 4819cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4820cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4821cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in1); 4822cabdff1aSopenharmony_ci 4823cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4824cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4825cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in2); 4826cabdff1aSopenharmony_ci LD2(src1_ptr, src2_stride, tp0, tp1); 4827cabdff1aSopenharmony_ci src1_ptr += 2 * src2_stride; 4828cabdff1aSopenharmony_ci INSERT_D2_SH(tp0, tp1, in3); 4829cabdff1aSopenharmony_ci 4830cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, const_vec, in1, const_vec, 
in2, const_vec, in3, 4831cabdff1aSopenharmony_ci const_vec, in0, in1, in2, in3); 4832cabdff1aSopenharmony_ci 4833cabdff1aSopenharmony_ci dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4834cabdff1aSopenharmony_ci dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4835cabdff1aSopenharmony_ci dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4836cabdff1aSopenharmony_ci dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4837cabdff1aSopenharmony_ci dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4838cabdff1aSopenharmony_ci dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4839cabdff1aSopenharmony_ci dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4840cabdff1aSopenharmony_ci dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4841cabdff1aSopenharmony_ci 4842cabdff1aSopenharmony_ci SRA_4V(dst0, dst1, dst2, dst3, 6); 4843cabdff1aSopenharmony_ci SRA_4V(dst4, dst5, dst6, dst7, 6); 4844cabdff1aSopenharmony_ci PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 4845cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4846cabdff1aSopenharmony_ci ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, 4847cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 4848cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 4849cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 4850cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 4851cabdff1aSopenharmony_ci ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 4852cabdff1aSopenharmony_ci dst += (8 * dst_stride); 4853cabdff1aSopenharmony_ci 4854cabdff1aSopenharmony_ci dst10_r = dst98_r; 4855cabdff1aSopenharmony_ci dst21_r = dst109_r; 4856cabdff1aSopenharmony_ci dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4857cabdff1aSopenharmony_ci } 4858cabdff1aSopenharmony_ci} 4859cabdff1aSopenharmony_ci 4860cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, 4861cabdff1aSopenharmony_ci int32_t 
src_stride, 4862cabdff1aSopenharmony_ci int16_t *src1_ptr, 4863cabdff1aSopenharmony_ci int32_t src2_stride, 4864cabdff1aSopenharmony_ci uint8_t *dst, 4865cabdff1aSopenharmony_ci int32_t dst_stride, 4866cabdff1aSopenharmony_ci const int8_t *filter_x, 4867cabdff1aSopenharmony_ci const int8_t *filter_y, 4868cabdff1aSopenharmony_ci int32_t height) 4869cabdff1aSopenharmony_ci{ 4870cabdff1aSopenharmony_ci if (4 == height) { 4871cabdff1aSopenharmony_ci hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4872cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 2); 4873cabdff1aSopenharmony_ci } else { 4874cabdff1aSopenharmony_ci hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, 4875cabdff1aSopenharmony_ci src2_stride, dst, dst_stride, filter_x, 4876cabdff1aSopenharmony_ci filter_y, height, 16); 4877cabdff1aSopenharmony_ci } 4878cabdff1aSopenharmony_ci} 4879cabdff1aSopenharmony_ci 4880cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, 4881cabdff1aSopenharmony_ci int32_t src_stride, 4882cabdff1aSopenharmony_ci int16_t *src1_ptr, 4883cabdff1aSopenharmony_ci int32_t src2_stride, 4884cabdff1aSopenharmony_ci uint8_t *dst, 4885cabdff1aSopenharmony_ci int32_t dst_stride, 4886cabdff1aSopenharmony_ci const int8_t *filter_x, 4887cabdff1aSopenharmony_ci const int8_t *filter_y, 4888cabdff1aSopenharmony_ci int32_t height) 4889cabdff1aSopenharmony_ci{ 4890cabdff1aSopenharmony_ci hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4891cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 4892cabdff1aSopenharmony_ci height, 24); 4893cabdff1aSopenharmony_ci} 4894cabdff1aSopenharmony_ci 4895cabdff1aSopenharmony_cistatic void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, 4896cabdff1aSopenharmony_ci int32_t src_stride, 4897cabdff1aSopenharmony_ci int16_t *src1_ptr, 4898cabdff1aSopenharmony_ci int32_t src2_stride, 4899cabdff1aSopenharmony_ci uint8_t *dst, 4900cabdff1aSopenharmony_ci int32_t dst_stride, 
4901cabdff1aSopenharmony_ci const int8_t *filter_x, 4902cabdff1aSopenharmony_ci const int8_t *filter_y, 4903cabdff1aSopenharmony_ci int32_t height) 4904cabdff1aSopenharmony_ci{ 4905cabdff1aSopenharmony_ci hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride, 4906cabdff1aSopenharmony_ci dst, dst_stride, filter_x, filter_y, 4907cabdff1aSopenharmony_ci height, 32); 4908cabdff1aSopenharmony_ci} 4909cabdff1aSopenharmony_ci 4910cabdff1aSopenharmony_ci#define BI_MC_COPY(WIDTH) \ 4911cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ 4912cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 4913cabdff1aSopenharmony_ci uint8_t *src, \ 4914cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4915cabdff1aSopenharmony_ci int16_t *src_16bit, \ 4916cabdff1aSopenharmony_ci int height, \ 4917cabdff1aSopenharmony_ci intptr_t mx, \ 4918cabdff1aSopenharmony_ci intptr_t my, \ 4919cabdff1aSopenharmony_ci int width) \ 4920cabdff1aSopenharmony_ci{ \ 4921cabdff1aSopenharmony_ci hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \ 4922cabdff1aSopenharmony_ci dst, dst_stride, height); \ 4923cabdff1aSopenharmony_ci} 4924cabdff1aSopenharmony_ci 4925cabdff1aSopenharmony_ciBI_MC_COPY(4); 4926cabdff1aSopenharmony_ciBI_MC_COPY(6); 4927cabdff1aSopenharmony_ciBI_MC_COPY(8); 4928cabdff1aSopenharmony_ciBI_MC_COPY(12); 4929cabdff1aSopenharmony_ciBI_MC_COPY(16); 4930cabdff1aSopenharmony_ciBI_MC_COPY(24); 4931cabdff1aSopenharmony_ciBI_MC_COPY(32); 4932cabdff1aSopenharmony_ciBI_MC_COPY(48); 4933cabdff1aSopenharmony_ciBI_MC_COPY(64); 4934cabdff1aSopenharmony_ci 4935cabdff1aSopenharmony_ci#undef BI_MC_COPY 4936cabdff1aSopenharmony_ci 4937cabdff1aSopenharmony_ci#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 4938cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ 4939cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 4940cabdff1aSopenharmony_ci uint8_t *src, \ 4941cabdff1aSopenharmony_ci ptrdiff_t 
src_stride, \ 4942cabdff1aSopenharmony_ci int16_t *src_16bit, \ 4943cabdff1aSopenharmony_ci int height, \ 4944cabdff1aSopenharmony_ci intptr_t mx, \ 4945cabdff1aSopenharmony_ci intptr_t my, \ 4946cabdff1aSopenharmony_ci int width) \ 4947cabdff1aSopenharmony_ci{ \ 4948cabdff1aSopenharmony_ci const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 4949cabdff1aSopenharmony_ci \ 4950cabdff1aSopenharmony_ci hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 4951cabdff1aSopenharmony_ci MAX_PB_SIZE, dst, dst_stride, \ 4952cabdff1aSopenharmony_ci filter, height); \ 4953cabdff1aSopenharmony_ci} 4954cabdff1aSopenharmony_ci 4955cabdff1aSopenharmony_ciBI_MC(qpel, h, 4, 8, hz, mx); 4956cabdff1aSopenharmony_ciBI_MC(qpel, h, 8, 8, hz, mx); 4957cabdff1aSopenharmony_ciBI_MC(qpel, h, 12, 8, hz, mx); 4958cabdff1aSopenharmony_ciBI_MC(qpel, h, 16, 8, hz, mx); 4959cabdff1aSopenharmony_ciBI_MC(qpel, h, 24, 8, hz, mx); 4960cabdff1aSopenharmony_ciBI_MC(qpel, h, 32, 8, hz, mx); 4961cabdff1aSopenharmony_ciBI_MC(qpel, h, 48, 8, hz, mx); 4962cabdff1aSopenharmony_ciBI_MC(qpel, h, 64, 8, hz, mx); 4963cabdff1aSopenharmony_ci 4964cabdff1aSopenharmony_ciBI_MC(qpel, v, 4, 8, vt, my); 4965cabdff1aSopenharmony_ciBI_MC(qpel, v, 8, 8, vt, my); 4966cabdff1aSopenharmony_ciBI_MC(qpel, v, 12, 8, vt, my); 4967cabdff1aSopenharmony_ciBI_MC(qpel, v, 16, 8, vt, my); 4968cabdff1aSopenharmony_ciBI_MC(qpel, v, 24, 8, vt, my); 4969cabdff1aSopenharmony_ciBI_MC(qpel, v, 32, 8, vt, my); 4970cabdff1aSopenharmony_ciBI_MC(qpel, v, 48, 8, vt, my); 4971cabdff1aSopenharmony_ciBI_MC(qpel, v, 64, 8, vt, my); 4972cabdff1aSopenharmony_ci 4973cabdff1aSopenharmony_ciBI_MC(epel, h, 4, 4, hz, mx); 4974cabdff1aSopenharmony_ciBI_MC(epel, h, 8, 4, hz, mx); 4975cabdff1aSopenharmony_ciBI_MC(epel, h, 6, 4, hz, mx); 4976cabdff1aSopenharmony_ciBI_MC(epel, h, 12, 4, hz, mx); 4977cabdff1aSopenharmony_ciBI_MC(epel, h, 16, 4, hz, mx); 4978cabdff1aSopenharmony_ciBI_MC(epel, h, 24, 4, hz, mx); 
4979cabdff1aSopenharmony_ciBI_MC(epel, h, 32, 4, hz, mx); 4980cabdff1aSopenharmony_ci 4981cabdff1aSopenharmony_ciBI_MC(epel, v, 4, 4, vt, my); 4982cabdff1aSopenharmony_ciBI_MC(epel, v, 8, 4, vt, my); 4983cabdff1aSopenharmony_ciBI_MC(epel, v, 6, 4, vt, my); 4984cabdff1aSopenharmony_ciBI_MC(epel, v, 12, 4, vt, my); 4985cabdff1aSopenharmony_ciBI_MC(epel, v, 16, 4, vt, my); 4986cabdff1aSopenharmony_ciBI_MC(epel, v, 24, 4, vt, my); 4987cabdff1aSopenharmony_ciBI_MC(epel, v, 32, 4, vt, my); 4988cabdff1aSopenharmony_ci 4989cabdff1aSopenharmony_ci#undef BI_MC 4990cabdff1aSopenharmony_ci 4991cabdff1aSopenharmony_ci#define BI_MC_HV(PEL, WIDTH, TAP) \ 4992cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \ 4993cabdff1aSopenharmony_ci ptrdiff_t dst_stride, \ 4994cabdff1aSopenharmony_ci uint8_t *src, \ 4995cabdff1aSopenharmony_ci ptrdiff_t src_stride, \ 4996cabdff1aSopenharmony_ci int16_t *src_16bit, \ 4997cabdff1aSopenharmony_ci int height, \ 4998cabdff1aSopenharmony_ci intptr_t mx, \ 4999cabdff1aSopenharmony_ci intptr_t my, \ 5000cabdff1aSopenharmony_ci int width) \ 5001cabdff1aSopenharmony_ci{ \ 5002cabdff1aSopenharmony_ci const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 5003cabdff1aSopenharmony_ci const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 5004cabdff1aSopenharmony_ci \ 5005cabdff1aSopenharmony_ci hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 5006cabdff1aSopenharmony_ci MAX_PB_SIZE, dst, dst_stride, \ 5007cabdff1aSopenharmony_ci filter_x, filter_y, height); \ 5008cabdff1aSopenharmony_ci} 5009cabdff1aSopenharmony_ci 5010cabdff1aSopenharmony_ciBI_MC_HV(qpel, 4, 8); 5011cabdff1aSopenharmony_ciBI_MC_HV(qpel, 8, 8); 5012cabdff1aSopenharmony_ciBI_MC_HV(qpel, 12, 8); 5013cabdff1aSopenharmony_ciBI_MC_HV(qpel, 16, 8); 5014cabdff1aSopenharmony_ciBI_MC_HV(qpel, 24, 8); 5015cabdff1aSopenharmony_ciBI_MC_HV(qpel, 32, 8); 5016cabdff1aSopenharmony_ciBI_MC_HV(qpel, 48, 8); 
5017cabdff1aSopenharmony_ciBI_MC_HV(qpel, 64, 8); 5018cabdff1aSopenharmony_ci 5019cabdff1aSopenharmony_ciBI_MC_HV(epel, 4, 4); 5020cabdff1aSopenharmony_ciBI_MC_HV(epel, 8, 4); 5021cabdff1aSopenharmony_ciBI_MC_HV(epel, 6, 4); 5022cabdff1aSopenharmony_ciBI_MC_HV(epel, 12, 4); 5023cabdff1aSopenharmony_ciBI_MC_HV(epel, 16, 4); 5024cabdff1aSopenharmony_ciBI_MC_HV(epel, 24, 4); 5025cabdff1aSopenharmony_ciBI_MC_HV(epel, 32, 4); 5026cabdff1aSopenharmony_ci 5027cabdff1aSopenharmony_ci#undef BI_MC_HV 5028