1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "h264dsp_mips.h" 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_cistatic void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride, 25cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 26cabdff1aSopenharmony_ci int32_t offset_in) 27cabdff1aSopenharmony_ci{ 28cabdff1aSopenharmony_ci uint32_t tp0, tp1, offset_val; 29cabdff1aSopenharmony_ci v16u8 zero = { 0 }; 30cabdff1aSopenharmony_ci v16u8 src0 = { 0 }; 31cabdff1aSopenharmony_ci v8i16 src0_r, tmp0, wgt, denom, offset; 32cabdff1aSopenharmony_ci 33cabdff1aSopenharmony_ci offset_val = (unsigned) 
offset_in << log2_denom; 34cabdff1aSopenharmony_ci 35cabdff1aSopenharmony_ci wgt = __msa_fill_h(src_weight); 36cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_val); 37cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom); 38cabdff1aSopenharmony_ci 39cabdff1aSopenharmony_ci LW2(data, stride, tp0, tp1); 40cabdff1aSopenharmony_ci INSERT_W2_UB(tp0, tp1, src0); 41cabdff1aSopenharmony_ci src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0); 42cabdff1aSopenharmony_ci tmp0 = wgt * src0_r; 43cabdff1aSopenharmony_ci tmp0 = __msa_adds_s_h(tmp0, offset); 44cabdff1aSopenharmony_ci tmp0 = __msa_maxi_s_h(tmp0, 0); 45cabdff1aSopenharmony_ci tmp0 = __msa_srlr_h(tmp0, denom); 46cabdff1aSopenharmony_ci tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7); 47cabdff1aSopenharmony_ci src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); 48cabdff1aSopenharmony_ci ST_W2(src0, 0, 1, data, stride); 49cabdff1aSopenharmony_ci} 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_cistatic void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride, 52cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 53cabdff1aSopenharmony_ci int32_t offset_in) 54cabdff1aSopenharmony_ci{ 55cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3, offset_val; 56cabdff1aSopenharmony_ci v16u8 src0 = { 0 }; 57cabdff1aSopenharmony_ci v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset; 58cabdff1aSopenharmony_ci 59cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 60cabdff1aSopenharmony_ci 61cabdff1aSopenharmony_ci wgt = __msa_fill_h(src_weight); 62cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_val); 63cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom); 64cabdff1aSopenharmony_ci 65cabdff1aSopenharmony_ci LW4(data, stride, tp0, tp1, tp2, tp3); 66cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 67cabdff1aSopenharmony_ci UNPCK_UB_SH(src0, src0_r, src1_r); 68cabdff1aSopenharmony_ci MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1); 69cabdff1aSopenharmony_ci 
ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1); 70cabdff1aSopenharmony_ci MAXI_SH2_SH(tmp0, tmp1, 0); 71cabdff1aSopenharmony_ci tmp0 = __msa_srlr_h(tmp0, denom); 72cabdff1aSopenharmony_ci tmp1 = __msa_srlr_h(tmp1, denom); 73cabdff1aSopenharmony_ci SAT_UH2_SH(tmp0, tmp1, 7); 74cabdff1aSopenharmony_ci src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 75cabdff1aSopenharmony_ci ST_W4(src0, 0, 1, 2, 3, data, stride); 76cabdff1aSopenharmony_ci} 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_cistatic void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride, 79cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 80cabdff1aSopenharmony_ci int32_t offset_in) 81cabdff1aSopenharmony_ci{ 82cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3, offset_val; 83cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, src1 = { 0 }; 84cabdff1aSopenharmony_ci v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; 85cabdff1aSopenharmony_ci v8i16 wgt, denom, offset; 86cabdff1aSopenharmony_ci 87cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 88cabdff1aSopenharmony_ci 89cabdff1aSopenharmony_ci wgt = __msa_fill_h(src_weight); 90cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_val); 91cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom); 92cabdff1aSopenharmony_ci 93cabdff1aSopenharmony_ci LW4(data, stride, tp0, tp1, tp2, tp3); 94cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 95cabdff1aSopenharmony_ci LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); 96cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src1); 97cabdff1aSopenharmony_ci UNPCK_UB_SH(src0, src0_r, src1_r); 98cabdff1aSopenharmony_ci UNPCK_UB_SH(src1, src2_r, src3_r); 99cabdff1aSopenharmony_ci MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, 100cabdff1aSopenharmony_ci tmp3); 101cabdff1aSopenharmony_ci ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, 102cabdff1aSopenharmony_ci tmp1, tmp2, tmp3); 
103cabdff1aSopenharmony_ci MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0); 104cabdff1aSopenharmony_ci SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom); 105cabdff1aSopenharmony_ci SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 106cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 107cabdff1aSopenharmony_ci ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride); 108cabdff1aSopenharmony_ci} 109cabdff1aSopenharmony_ci 110cabdff1aSopenharmony_cistatic void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride, 111cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 112cabdff1aSopenharmony_ci int32_t offset_in) 113cabdff1aSopenharmony_ci{ 114cabdff1aSopenharmony_ci uint32_t offset_val; 115cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 116cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, src1 = { 0 }; 117cabdff1aSopenharmony_ci v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; 118cabdff1aSopenharmony_ci v8i16 wgt, denom, offset; 119cabdff1aSopenharmony_ci 120cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 121cabdff1aSopenharmony_ci 122cabdff1aSopenharmony_ci wgt = __msa_fill_h(src_weight); 123cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_val); 124cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom); 125cabdff1aSopenharmony_ci 126cabdff1aSopenharmony_ci LD4(data, stride, tp0, tp1, tp2, tp3); 127cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 128cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 129cabdff1aSopenharmony_ci UNPCK_UB_SH(src0, src0_r, src1_r); 130cabdff1aSopenharmony_ci UNPCK_UB_SH(src1, src2_r, src3_r); 131cabdff1aSopenharmony_ci MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, 132cabdff1aSopenharmony_ci tmp3); 133cabdff1aSopenharmony_ci ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, 134cabdff1aSopenharmony_ci tmp1, tmp2, tmp3); 135cabdff1aSopenharmony_ci MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0); 136cabdff1aSopenharmony_ci SRLR_H4_SH(tmp0, tmp1, 
tmp2, tmp3, denom); 137cabdff1aSopenharmony_ci SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 138cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 139cabdff1aSopenharmony_ci ST_D4(src0, src1, 0, 1, 0, 1, data, stride); 140cabdff1aSopenharmony_ci} 141cabdff1aSopenharmony_ci 142cabdff1aSopenharmony_cistatic void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, 143cabdff1aSopenharmony_ci int32_t src_weight, int32_t offset_in) 144cabdff1aSopenharmony_ci{ 145cabdff1aSopenharmony_ci uint32_t offset_val; 146cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 147cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 148cabdff1aSopenharmony_ci v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 149cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 150cabdff1aSopenharmony_ci v8i16 wgt, denom, offset; 151cabdff1aSopenharmony_ci 152cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 153cabdff1aSopenharmony_ci 154cabdff1aSopenharmony_ci wgt = __msa_fill_h(src_weight); 155cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_val); 156cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom); 157cabdff1aSopenharmony_ci 158cabdff1aSopenharmony_ci LD4(data, stride, tp0, tp1, tp2, tp3); 159cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 160cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 161cabdff1aSopenharmony_ci LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); 162cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src2); 163cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src3); 164cabdff1aSopenharmony_ci UNPCK_UB_SH(src0, src0_r, src1_r); 165cabdff1aSopenharmony_ci UNPCK_UB_SH(src1, src2_r, src3_r); 166cabdff1aSopenharmony_ci UNPCK_UB_SH(src2, src4_r, src5_r); 167cabdff1aSopenharmony_ci UNPCK_UB_SH(src3, src6_r, src7_r); 168cabdff1aSopenharmony_ci MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, 
169cabdff1aSopenharmony_ci tmp3); 170cabdff1aSopenharmony_ci MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6, 171cabdff1aSopenharmony_ci tmp7); 172cabdff1aSopenharmony_ci ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, 173cabdff1aSopenharmony_ci tmp1, tmp2, tmp3); 174cabdff1aSopenharmony_ci ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4, 175cabdff1aSopenharmony_ci tmp5, tmp6, tmp7); 176cabdff1aSopenharmony_ci MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); 177cabdff1aSopenharmony_ci SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); 178cabdff1aSopenharmony_ci SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); 179cabdff1aSopenharmony_ci PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, 180cabdff1aSopenharmony_ci src2, src3); 181cabdff1aSopenharmony_ci ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride); 182cabdff1aSopenharmony_ci} 183cabdff1aSopenharmony_ci 184cabdff1aSopenharmony_cistatic void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride, 185cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 186cabdff1aSopenharmony_ci int32_t offset_in) 187cabdff1aSopenharmony_ci{ 188cabdff1aSopenharmony_ci uint32_t offset_val, cnt; 189cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 190cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 191cabdff1aSopenharmony_ci v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 192cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 193cabdff1aSopenharmony_ci v8i16 wgt, denom, offset; 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 196cabdff1aSopenharmony_ci 197cabdff1aSopenharmony_ci wgt = __msa_fill_h(src_weight); 198cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_val); 199cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom); 
200cabdff1aSopenharmony_ci 201cabdff1aSopenharmony_ci for (cnt = 2; cnt--;) { 202cabdff1aSopenharmony_ci LD4(data, stride, tp0, tp1, tp2, tp3); 203cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 204cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 205cabdff1aSopenharmony_ci LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); 206cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src2); 207cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src3); 208cabdff1aSopenharmony_ci UNPCK_UB_SH(src0, src0_r, src1_r); 209cabdff1aSopenharmony_ci UNPCK_UB_SH(src1, src2_r, src3_r); 210cabdff1aSopenharmony_ci UNPCK_UB_SH(src2, src4_r, src5_r); 211cabdff1aSopenharmony_ci UNPCK_UB_SH(src3, src6_r, src7_r); 212cabdff1aSopenharmony_ci MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, 213cabdff1aSopenharmony_ci tmp2, tmp3); 214cabdff1aSopenharmony_ci MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, 215cabdff1aSopenharmony_ci tmp6, tmp7); 216cabdff1aSopenharmony_ci ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, 217cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 218cabdff1aSopenharmony_ci ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, 219cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 220cabdff1aSopenharmony_ci MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); 221cabdff1aSopenharmony_ci SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); 222cabdff1aSopenharmony_ci SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); 223cabdff1aSopenharmony_ci PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, 224cabdff1aSopenharmony_ci src2, src3); 225cabdff1aSopenharmony_ci ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride); 226cabdff1aSopenharmony_ci data += 8 * stride; 227cabdff1aSopenharmony_ci } 228cabdff1aSopenharmony_ci} 229cabdff1aSopenharmony_ci 230cabdff1aSopenharmony_cistatic void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t 
stride, 231cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 232cabdff1aSopenharmony_ci int32_t dst_weight, int32_t offset_in) 233cabdff1aSopenharmony_ci{ 234cabdff1aSopenharmony_ci uint32_t tp0, tp1; 235cabdff1aSopenharmony_ci v16i8 src_wgt, dst_wgt, wgt, vec0; 236cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, dst0 = { 0 }; 237cabdff1aSopenharmony_ci v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255); 238cabdff1aSopenharmony_ci 239cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 240cabdff1aSopenharmony_ci offset_in += (128 * (src_weight + dst_weight)); 241cabdff1aSopenharmony_ci 242cabdff1aSopenharmony_ci src_wgt = __msa_fill_b(src_weight); 243cabdff1aSopenharmony_ci dst_wgt = __msa_fill_b(dst_weight); 244cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_in); 245cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom + 1); 246cabdff1aSopenharmony_ci 247cabdff1aSopenharmony_ci wgt = __msa_ilvev_b(dst_wgt, src_wgt); 248cabdff1aSopenharmony_ci 249cabdff1aSopenharmony_ci LW2(src, stride, tp0, tp1); 250cabdff1aSopenharmony_ci INSERT_W2_UB(tp0, tp1, src0); 251cabdff1aSopenharmony_ci LW2(dst, stride, tp0, tp1); 252cabdff1aSopenharmony_ci INSERT_W2_UB(tp0, tp1, dst0); 253cabdff1aSopenharmony_ci XORI_B2_128_UB(src0, dst0); 254cabdff1aSopenharmony_ci vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0); 255cabdff1aSopenharmony_ci tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 256cabdff1aSopenharmony_ci tmp0 >>= denom; 257cabdff1aSopenharmony_ci tmp0 = __msa_maxi_s_h(tmp0, 0); 258cabdff1aSopenharmony_ci tmp0 = __msa_min_s_h(max255, tmp0); 259cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); 260cabdff1aSopenharmony_ci ST_W2(dst0, 0, 1, dst, stride); 261cabdff1aSopenharmony_ci} 262cabdff1aSopenharmony_ci 263cabdff1aSopenharmony_cistatic void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 264cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 265cabdff1aSopenharmony_ci 
int32_t dst_weight, int32_t offset_in) 266cabdff1aSopenharmony_ci{ 267cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 268cabdff1aSopenharmony_ci v16i8 src_wgt, dst_wgt, wgt, vec0, vec1; 269cabdff1aSopenharmony_ci v16u8 src0, dst0; 270cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, denom, offset; 271cabdff1aSopenharmony_ci 272cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 273cabdff1aSopenharmony_ci offset_in += (128 * (src_weight + dst_weight)); 274cabdff1aSopenharmony_ci 275cabdff1aSopenharmony_ci src_wgt = __msa_fill_b(src_weight); 276cabdff1aSopenharmony_ci dst_wgt = __msa_fill_b(dst_weight); 277cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_in); 278cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom + 1); 279cabdff1aSopenharmony_ci 280cabdff1aSopenharmony_ci wgt = __msa_ilvev_b(dst_wgt, src_wgt); 281cabdff1aSopenharmony_ci 282cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 283cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 284cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 285cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 286cabdff1aSopenharmony_ci XORI_B2_128_UB(src0, dst0); 287cabdff1aSopenharmony_ci ILVRL_B2_SB(dst0, src0, vec0, vec1); 288cabdff1aSopenharmony_ci tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 289cabdff1aSopenharmony_ci tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 290cabdff1aSopenharmony_ci tmp0 >>= denom; 291cabdff1aSopenharmony_ci tmp1 >>= denom; 292cabdff1aSopenharmony_ci CLIP_SH2_0_255(tmp0, tmp1); 293cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 294cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, stride); 295cabdff1aSopenharmony_ci} 296cabdff1aSopenharmony_ci 297cabdff1aSopenharmony_cistatic void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 298cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 299cabdff1aSopenharmony_ci int32_t dst_weight, int32_t offset_in) 
300cabdff1aSopenharmony_ci{ 301cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 302cabdff1aSopenharmony_ci v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3; 303cabdff1aSopenharmony_ci v16u8 src0, src1, dst0, dst1; 304cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset; 305cabdff1aSopenharmony_ci 306cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 307cabdff1aSopenharmony_ci offset_in += (128 * (src_weight + dst_weight)); 308cabdff1aSopenharmony_ci 309cabdff1aSopenharmony_ci src_wgt = __msa_fill_b(src_weight); 310cabdff1aSopenharmony_ci dst_wgt = __msa_fill_b(dst_weight); 311cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_in); 312cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom + 1); 313cabdff1aSopenharmony_ci wgt = __msa_ilvev_b(dst_wgt, src_wgt); 314cabdff1aSopenharmony_ci 315cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 316cabdff1aSopenharmony_ci src += 4 * stride; 317cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 318cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 319cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, src1); 320cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 321cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 322cabdff1aSopenharmony_ci LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 323cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 324cabdff1aSopenharmony_ci XORI_B4_128_UB(src0, src1, dst0, dst1); 325cabdff1aSopenharmony_ci ILVRL_B2_SB(dst0, src0, vec0, vec1); 326cabdff1aSopenharmony_ci ILVRL_B2_SB(dst1, src1, vec2, vec3); 327cabdff1aSopenharmony_ci tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 328cabdff1aSopenharmony_ci tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 329cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 330cabdff1aSopenharmony_ci tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 331cabdff1aSopenharmony_ci SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 
332cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 333cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); 334cabdff1aSopenharmony_ci ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 335cabdff1aSopenharmony_ci} 336cabdff1aSopenharmony_ci 337cabdff1aSopenharmony_cistatic void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 338cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 339cabdff1aSopenharmony_ci int32_t dst_weight, int32_t offset_in) 340cabdff1aSopenharmony_ci{ 341cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 342cabdff1aSopenharmony_ci v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3; 343cabdff1aSopenharmony_ci v16u8 src0, src1, dst0, dst1; 344cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset; 345cabdff1aSopenharmony_ci 346cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 347cabdff1aSopenharmony_ci offset_in += (128 * (src_weight + dst_weight)); 348cabdff1aSopenharmony_ci 349cabdff1aSopenharmony_ci src_wgt = __msa_fill_b(src_weight); 350cabdff1aSopenharmony_ci dst_wgt = __msa_fill_b(dst_weight); 351cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_in); 352cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom + 1); 353cabdff1aSopenharmony_ci 354cabdff1aSopenharmony_ci wgt = __msa_ilvev_b(dst_wgt, src_wgt); 355cabdff1aSopenharmony_ci 356cabdff1aSopenharmony_ci LD4(src, stride, tp0, tp1, tp2, tp3); 357cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 358cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 359cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 360cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 361cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 362cabdff1aSopenharmony_ci XORI_B4_128_UB(src0, src1, dst0, dst1); 363cabdff1aSopenharmony_ci ILVRL_B2_SB(dst0, src0, vec0, vec1); 364cabdff1aSopenharmony_ci ILVRL_B2_SB(dst1, src1, vec2, vec3); 365cabdff1aSopenharmony_ci tmp0 = 
__msa_dpadd_s_h(offset, wgt, vec0); 366cabdff1aSopenharmony_ci tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 367cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 368cabdff1aSopenharmony_ci tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 369cabdff1aSopenharmony_ci SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 370cabdff1aSopenharmony_ci CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 371cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); 372cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 373cabdff1aSopenharmony_ci} 374cabdff1aSopenharmony_ci 375cabdff1aSopenharmony_cistatic void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 376cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 377cabdff1aSopenharmony_ci int32_t dst_weight, int32_t offset_in) 378cabdff1aSopenharmony_ci{ 379cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 380cabdff1aSopenharmony_ci v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 381cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 382cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset; 383cabdff1aSopenharmony_ci 384cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 385cabdff1aSopenharmony_ci offset_in += (128 * (src_weight + dst_weight)); 386cabdff1aSopenharmony_ci 387cabdff1aSopenharmony_ci src_wgt = __msa_fill_b(src_weight); 388cabdff1aSopenharmony_ci dst_wgt = __msa_fill_b(dst_weight); 389cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_in); 390cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom + 1); 391cabdff1aSopenharmony_ci wgt = __msa_ilvev_b(dst_wgt, src_wgt); 392cabdff1aSopenharmony_ci 393cabdff1aSopenharmony_ci LD4(src, stride, tp0, tp1, tp2, tp3); 394cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 395cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 396cabdff1aSopenharmony_ci LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3); 
397cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src2); 398cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src3); 399cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 400cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 401cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 402cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 403cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 404cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 405cabdff1aSopenharmony_ci XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3); 406cabdff1aSopenharmony_ci ILVRL_B2_SB(dst0, src0, vec0, vec1); 407cabdff1aSopenharmony_ci ILVRL_B2_SB(dst1, src1, vec2, vec3); 408cabdff1aSopenharmony_ci ILVRL_B2_SB(dst2, src2, vec4, vec5); 409cabdff1aSopenharmony_ci ILVRL_B2_SB(dst3, src3, vec6, vec7); 410cabdff1aSopenharmony_ci tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 411cabdff1aSopenharmony_ci tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 412cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 413cabdff1aSopenharmony_ci tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 414cabdff1aSopenharmony_ci tmp4 = __msa_dpadd_s_h(offset, wgt, vec4); 415cabdff1aSopenharmony_ci tmp5 = __msa_dpadd_s_h(offset, wgt, vec5); 416cabdff1aSopenharmony_ci tmp6 = __msa_dpadd_s_h(offset, wgt, vec6); 417cabdff1aSopenharmony_ci tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); 418cabdff1aSopenharmony_ci SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 419cabdff1aSopenharmony_ci SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); 420cabdff1aSopenharmony_ci CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 421cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); 422cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); 423cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 424cabdff1aSopenharmony_ci} 425cabdff1aSopenharmony_ci 426cabdff1aSopenharmony_cistatic void avc_biwgt_8x16_msa(uint8_t *src, uint8_t 
*dst, ptrdiff_t stride, 427cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 428cabdff1aSopenharmony_ci int32_t dst_weight, int32_t offset_in) 429cabdff1aSopenharmony_ci{ 430cabdff1aSopenharmony_ci uint8_t cnt; 431cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 432cabdff1aSopenharmony_ci v16i8 src_wgt, dst_wgt, wgt; 433cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 434cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3; 435cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 436cabdff1aSopenharmony_ci v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; 437cabdff1aSopenharmony_ci v8i16 denom, offset; 438cabdff1aSopenharmony_ci 439cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 440cabdff1aSopenharmony_ci offset_in += (128 * (src_weight + dst_weight)); 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci src_wgt = __msa_fill_b(src_weight); 443cabdff1aSopenharmony_ci dst_wgt = __msa_fill_b(dst_weight); 444cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_in); 445cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom + 1); 446cabdff1aSopenharmony_ci wgt = __msa_ilvev_b(dst_wgt, src_wgt); 447cabdff1aSopenharmony_ci 448cabdff1aSopenharmony_ci for (cnt = 2; cnt--;) { 449cabdff1aSopenharmony_ci LD4(src, stride, tp0, tp1, tp2, tp3); 450cabdff1aSopenharmony_ci src += 4 * stride; 451cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 452cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 453cabdff1aSopenharmony_ci LD4(src, stride, tp0, tp1, tp2, tp3); 454cabdff1aSopenharmony_ci src += 4 * stride; 455cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src2); 456cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src3); 457cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 458cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 459cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 460cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 
461cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 462cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 463cabdff1aSopenharmony_ci XORI_B4_128_UB(src0, src1, src2, src3); 464cabdff1aSopenharmony_ci XORI_B4_128_UB(dst0, dst1, dst2, dst3); 465cabdff1aSopenharmony_ci ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, 466cabdff1aSopenharmony_ci vec0, vec2, vec4, vec6); 467cabdff1aSopenharmony_ci ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, 468cabdff1aSopenharmony_ci vec1, vec3, vec5, vec7); 469cabdff1aSopenharmony_ci 470cabdff1aSopenharmony_ci temp0 = __msa_dpadd_s_h(offset, wgt, vec0); 471cabdff1aSopenharmony_ci temp1 = __msa_dpadd_s_h(offset, wgt, vec1); 472cabdff1aSopenharmony_ci temp2 = __msa_dpadd_s_h(offset, wgt, vec2); 473cabdff1aSopenharmony_ci temp3 = __msa_dpadd_s_h(offset, wgt, vec3); 474cabdff1aSopenharmony_ci temp4 = __msa_dpadd_s_h(offset, wgt, vec4); 475cabdff1aSopenharmony_ci temp5 = __msa_dpadd_s_h(offset, wgt, vec5); 476cabdff1aSopenharmony_ci temp6 = __msa_dpadd_s_h(offset, wgt, vec6); 477cabdff1aSopenharmony_ci temp7 = __msa_dpadd_s_h(offset, wgt, vec7); 478cabdff1aSopenharmony_ci 479cabdff1aSopenharmony_ci SRA_4V(temp0, temp1, temp2, temp3, denom); 480cabdff1aSopenharmony_ci SRA_4V(temp4, temp5, temp6, temp7, denom); 481cabdff1aSopenharmony_ci CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); 482cabdff1aSopenharmony_ci PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, 483cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 484cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 485cabdff1aSopenharmony_ci dst += 8 * stride; 486cabdff1aSopenharmony_ci } 487cabdff1aSopenharmony_ci} 488cabdff1aSopenharmony_ci 489cabdff1aSopenharmony_ci#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \ 490cabdff1aSopenharmony_ci q3_or_p3_org_in, p1_or_q1_org_in, \ 491cabdff1aSopenharmony_ci p2_or_q2_org_in, q1_or_p1_org_in, \ 
492cabdff1aSopenharmony_ci p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \ 493cabdff1aSopenharmony_ci{ \ 494cabdff1aSopenharmony_ci v8i16 threshold; \ 495cabdff1aSopenharmony_ci v8i16 const3 = __msa_ldi_h(3); \ 496cabdff1aSopenharmony_ci \ 497cabdff1aSopenharmony_ci threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \ 498cabdff1aSopenharmony_ci threshold += (p1_or_q1_org_in); \ 499cabdff1aSopenharmony_ci \ 500cabdff1aSopenharmony_ci (p0_or_q0_out) = threshold << 1; \ 501cabdff1aSopenharmony_ci (p0_or_q0_out) += (p2_or_q2_org_in); \ 502cabdff1aSopenharmony_ci (p0_or_q0_out) += (q1_or_p1_org_in); \ 503cabdff1aSopenharmony_ci (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \ 504cabdff1aSopenharmony_ci \ 505cabdff1aSopenharmony_ci (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \ 506cabdff1aSopenharmony_ci (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \ 507cabdff1aSopenharmony_ci \ 508cabdff1aSopenharmony_ci (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \ 509cabdff1aSopenharmony_ci (p2_or_q2_out) += (p3_or_q3_org_in); \ 510cabdff1aSopenharmony_ci (p2_or_q2_out) += (p3_or_q3_org_in); \ 511cabdff1aSopenharmony_ci (p2_or_q2_out) += threshold; \ 512cabdff1aSopenharmony_ci (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \ 513cabdff1aSopenharmony_ci} 514cabdff1aSopenharmony_ci 515cabdff1aSopenharmony_ci/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */ 516cabdff1aSopenharmony_ci#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \ 517cabdff1aSopenharmony_ci p1_or_q1_org_in, p0_or_q0_out) \ 518cabdff1aSopenharmony_ci{ \ 519cabdff1aSopenharmony_ci (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \ 520cabdff1aSopenharmony_ci (p0_or_q0_out) += (p1_or_q1_org_in); \ 521cabdff1aSopenharmony_ci (p0_or_q0_out) += (p1_or_q1_org_in); \ 522cabdff1aSopenharmony_ci (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \ 523cabdff1aSopenharmony_ci} 524cabdff1aSopenharmony_ci 525cabdff1aSopenharmony_ci#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, 
q0_or_p0_org_in, \ 526cabdff1aSopenharmony_ci p1_or_q1_org_in, p2_or_q2_org_in, \ 527cabdff1aSopenharmony_ci negate_tc_in, tc_in, p1_or_q1_out) \ 528cabdff1aSopenharmony_ci{ \ 529cabdff1aSopenharmony_ci v8i16 clip3, temp; \ 530cabdff1aSopenharmony_ci \ 531cabdff1aSopenharmony_ci clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \ 532cabdff1aSopenharmony_ci (v8u16) q0_or_p0_org_in); \ 533cabdff1aSopenharmony_ci temp = p1_or_q1_org_in << 1; \ 534cabdff1aSopenharmony_ci clip3 = clip3 - temp; \ 535cabdff1aSopenharmony_ci clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \ 536cabdff1aSopenharmony_ci CLIP_SH(clip3, negate_tc_in, tc_in); \ 537cabdff1aSopenharmony_ci p1_or_q1_out = p1_or_q1_org_in + clip3; \ 538cabdff1aSopenharmony_ci} 539cabdff1aSopenharmony_ci 540cabdff1aSopenharmony_ci#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \ 541cabdff1aSopenharmony_ci p1_or_q1_org_in, q1_or_p1_org_in, \ 542cabdff1aSopenharmony_ci negate_threshold_in, threshold_in, \ 543cabdff1aSopenharmony_ci p0_or_q0_out, q0_or_p0_out) \ 544cabdff1aSopenharmony_ci{ \ 545cabdff1aSopenharmony_ci v8i16 q0_sub_p0, p1_sub_q1, delta; \ 546cabdff1aSopenharmony_ci \ 547cabdff1aSopenharmony_ci q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \ 548cabdff1aSopenharmony_ci p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \ 549cabdff1aSopenharmony_ci q0_sub_p0 <<= 2; \ 550cabdff1aSopenharmony_ci p1_sub_q1 += 4; \ 551cabdff1aSopenharmony_ci delta = q0_sub_p0 + p1_sub_q1; \ 552cabdff1aSopenharmony_ci delta >>= 3; \ 553cabdff1aSopenharmony_ci \ 554cabdff1aSopenharmony_ci CLIP_SH(delta, negate_threshold_in, threshold_in); \ 555cabdff1aSopenharmony_ci \ 556cabdff1aSopenharmony_ci p0_or_q0_out = p0_or_q0_org_in + delta; \ 557cabdff1aSopenharmony_ci q0_or_p0_out = q0_or_p0_org_in - delta; \ 558cabdff1aSopenharmony_ci \ 559cabdff1aSopenharmony_ci CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \ 560cabdff1aSopenharmony_ci} 561cabdff1aSopenharmony_ci 562cabdff1aSopenharmony_ci#define AVC_LPF_H_CHROMA_422(src, 
stride, tc_val, alpha, beta, res) \ 563cabdff1aSopenharmony_ci{ \ 564cabdff1aSopenharmony_ci uint32_t load0, load1, load2, load3; \ 565cabdff1aSopenharmony_ci v16u8 src0 = { 0 }; \ 566cabdff1aSopenharmony_ci v16u8 src1 = { 0 }; \ 567cabdff1aSopenharmony_ci v16u8 src2 = { 0 }; \ 568cabdff1aSopenharmony_ci v16u8 src3 = { 0 }; \ 569cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \ 570cabdff1aSopenharmony_ci v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \ 571cabdff1aSopenharmony_ci v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \ 572cabdff1aSopenharmony_ci v8i16 res0_r, res1_r; \ 573cabdff1aSopenharmony_ci v16i8 zeros = { 0 }; \ 574cabdff1aSopenharmony_ci v16u8 res0, res1; \ 575cabdff1aSopenharmony_ci \ 576cabdff1aSopenharmony_ci LW4((src - 2), stride, load0, load1, load2, load3); \ 577cabdff1aSopenharmony_ci src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \ 578cabdff1aSopenharmony_ci src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \ 579cabdff1aSopenharmony_ci src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \ 580cabdff1aSopenharmony_ci src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \ 581cabdff1aSopenharmony_ci \ 582cabdff1aSopenharmony_ci TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \ 583cabdff1aSopenharmony_ci \ 584cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b(src2, src1); \ 585cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b(src1, src0); \ 586cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b(src2, src3); \ 587cabdff1aSopenharmony_ci \ 588cabdff1aSopenharmony_ci tc = __msa_fill_h(tc_val); \ 589cabdff1aSopenharmony_ci \ 590cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); \ 591cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); \ 592cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; \ 593cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); \ 594cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 
\ 595cabdff1aSopenharmony_ci \ 596cabdff1aSopenharmony_ci ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \ 597cabdff1aSopenharmony_ci HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \ 598cabdff1aSopenharmony_ci \ 599cabdff1aSopenharmony_ci q0_sub_p0 <<= 2; \ 600cabdff1aSopenharmony_ci delta = q0_sub_p0 + p1_sub_q1; \ 601cabdff1aSopenharmony_ci delta = __msa_srari_h(delta, 3); \ 602cabdff1aSopenharmony_ci \ 603cabdff1aSopenharmony_ci CLIP_SH(delta, -tc, tc); \ 604cabdff1aSopenharmony_ci \ 605cabdff1aSopenharmony_ci ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ 606cabdff1aSopenharmony_ci \ 607cabdff1aSopenharmony_ci res0_r += delta; \ 608cabdff1aSopenharmony_ci res1_r -= delta; \ 609cabdff1aSopenharmony_ci \ 610cabdff1aSopenharmony_ci CLIP_SH2_0_255(res0_r, res1_r); \ 611cabdff1aSopenharmony_ci PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \ 612cabdff1aSopenharmony_ci \ 613cabdff1aSopenharmony_ci res0 = __msa_bmnz_v(src1, res0, is_less_than); \ 614cabdff1aSopenharmony_ci res1 = __msa_bmnz_v(src2, res1, is_less_than); \ 615cabdff1aSopenharmony_ci \ 616cabdff1aSopenharmony_ci res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \ 617cabdff1aSopenharmony_ci} 618cabdff1aSopenharmony_ci 619cabdff1aSopenharmony_ci#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \ 620cabdff1aSopenharmony_ci{ \ 621cabdff1aSopenharmony_ci v16i8 zero_m = { 0 }; \ 622cabdff1aSopenharmony_ci \ 623cabdff1aSopenharmony_ci out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \ 624cabdff1aSopenharmony_ci out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \ 625cabdff1aSopenharmony_ci SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \ 626cabdff1aSopenharmony_ci} 627cabdff1aSopenharmony_ci 628cabdff1aSopenharmony_ci#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ 629cabdff1aSopenharmony_ci{ \ 630cabdff1aSopenharmony_ci uint32_t load0, load1; \ 631cabdff1aSopenharmony_ci v16u8 src0 = { 0 }; \ 
632cabdff1aSopenharmony_ci v16u8 src1 = { 0 }; \ 633cabdff1aSopenharmony_ci v16u8 src2 = { 0 }; \ 634cabdff1aSopenharmony_ci v16u8 src3 = { 0 }; \ 635cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \ 636cabdff1aSopenharmony_ci v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \ 637cabdff1aSopenharmony_ci v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \ 638cabdff1aSopenharmony_ci v16i8 zeros = { 0 }; \ 639cabdff1aSopenharmony_ci v16u8 res0, res1; \ 640cabdff1aSopenharmony_ci \ 641cabdff1aSopenharmony_ci load0 = LW(src - 2); \ 642cabdff1aSopenharmony_ci load1 = LW(src - 2 + stride); \ 643cabdff1aSopenharmony_ci \ 644cabdff1aSopenharmony_ci src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \ 645cabdff1aSopenharmony_ci src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \ 646cabdff1aSopenharmony_ci \ 647cabdff1aSopenharmony_ci TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \ 648cabdff1aSopenharmony_ci \ 649cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b(src2, src1); \ 650cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b(src1, src0); \ 651cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b(src2, src3); \ 652cabdff1aSopenharmony_ci \ 653cabdff1aSopenharmony_ci tc = __msa_fill_h(tc_val); \ 654cabdff1aSopenharmony_ci \ 655cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); \ 656cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); \ 657cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; \ 658cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); \ 659cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; \ 660cabdff1aSopenharmony_ci \ 661cabdff1aSopenharmony_ci ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \ 662cabdff1aSopenharmony_ci HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \ 663cabdff1aSopenharmony_ci \ 664cabdff1aSopenharmony_ci q0_sub_p0 <<= 2; \ 665cabdff1aSopenharmony_ci delta = q0_sub_p0 + p1_sub_q1; \ 
666cabdff1aSopenharmony_ci delta = __msa_srari_h(delta, 3); \ 667cabdff1aSopenharmony_ci CLIP_SH(delta, -tc, tc); \ 668cabdff1aSopenharmony_ci \ 669cabdff1aSopenharmony_ci ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ 670cabdff1aSopenharmony_ci \ 671cabdff1aSopenharmony_ci res0_r += delta; \ 672cabdff1aSopenharmony_ci res1_r -= delta; \ 673cabdff1aSopenharmony_ci \ 674cabdff1aSopenharmony_ci CLIP_SH2_0_255(res0_r, res1_r); \ 675cabdff1aSopenharmony_ci PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \ 676cabdff1aSopenharmony_ci \ 677cabdff1aSopenharmony_ci res0 = __msa_bmnz_v(src1, res0, is_less_than); \ 678cabdff1aSopenharmony_ci res1 = __msa_bmnz_v(src2, res1, is_less_than); \ 679cabdff1aSopenharmony_ci \ 680cabdff1aSopenharmony_ci res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \ 681cabdff1aSopenharmony_ci} 682cabdff1aSopenharmony_ci 683cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data, 684cabdff1aSopenharmony_ci uint8_t alpha_in, 685cabdff1aSopenharmony_ci uint8_t beta_in, 686cabdff1aSopenharmony_ci ptrdiff_t img_width) 687cabdff1aSopenharmony_ci{ 688cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 689cabdff1aSopenharmony_ci v16u8 is_less_than, is_less_than_beta, is_less_than_alpha; 690cabdff1aSopenharmony_ci v16u8 p1_org, p0_org, q0_org, q1_org; 691cabdff1aSopenharmony_ci 692cabdff1aSopenharmony_ci LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org); 693cabdff1aSopenharmony_ci 694cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); 695cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); 696cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); 697cabdff1aSopenharmony_ci 698cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha_in); 699cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta_in); 700cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than_alpha; 
701cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta_in); 702cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 703cabdff1aSopenharmony_ci 704cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than)) { 705cabdff1aSopenharmony_ci v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta; 706cabdff1aSopenharmony_ci v8i16 p0_r = { 0 }; 707cabdff1aSopenharmony_ci v8i16 q0_r = { 0 }; 708cabdff1aSopenharmony_ci v8i16 p0_l = { 0 }; 709cabdff1aSopenharmony_ci v8i16 q0_l = { 0 }; 710cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 711cabdff1aSopenharmony_ci v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 712cabdff1aSopenharmony_ci v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; 713cabdff1aSopenharmony_ci v16u8 q2_org = LD_UB(data + (2 * img_width)); 714cabdff1aSopenharmony_ci v16u8 p2_org = LD_UB(data - (3 * img_width)); 715cabdff1aSopenharmony_ci v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2); 716cabdff1aSopenharmony_ci 717cabdff1aSopenharmony_ci UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); 718cabdff1aSopenharmony_ci UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); 719cabdff1aSopenharmony_ci UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); 720cabdff1aSopenharmony_ci 721cabdff1aSopenharmony_ci tmp_flag = (p0_asub_q0 < tmp_flag); 722cabdff1aSopenharmony_ci 723cabdff1aSopenharmony_ci p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); 724cabdff1aSopenharmony_ci is_less_than_beta = (p2_asub_p0 < beta_in); 725cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & tmp_flag; 726cabdff1aSopenharmony_ci negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 727cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 728cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 729cabdff1aSopenharmony_ci 730cabdff1aSopenharmony_ci q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org); 731cabdff1aSopenharmony_ci q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org); 732cabdff1aSopenharmony_ci 
733cabdff1aSopenharmony_ci /* combine and store */ 734cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than_beta)) { 735cabdff1aSopenharmony_ci v8i16 p3_org_l, p3_org_r; 736cabdff1aSopenharmony_ci v16u8 p3_org = LD_UB(data - (img_width << 2)); 737cabdff1aSopenharmony_ci v16u8 p2, p1; 738cabdff1aSopenharmony_ci v8i16 p2_r = { 0 }; 739cabdff1aSopenharmony_ci v8i16 p2_l = { 0 }; 740cabdff1aSopenharmony_ci v8i16 p1_r = { 0 }; 741cabdff1aSopenharmony_ci v8i16 p1_l = { 0 }; 742cabdff1aSopenharmony_ci 743cabdff1aSopenharmony_ci ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r); 744cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r, 745cabdff1aSopenharmony_ci p2_r, q1_org_r, p0_r, p1_r, p2_r); 746cabdff1aSopenharmony_ci 747cabdff1aSopenharmony_ci ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l); 748cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l, 749cabdff1aSopenharmony_ci p2_l, q1_org_l, p0_l, p1_l, p2_l); 750cabdff1aSopenharmony_ci 751cabdff1aSopenharmony_ci PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2); 752cabdff1aSopenharmony_ci 753cabdff1aSopenharmony_ci p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta); 754cabdff1aSopenharmony_ci p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); 755cabdff1aSopenharmony_ci p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta); 756cabdff1aSopenharmony_ci 757cabdff1aSopenharmony_ci ST_UB(p1_org, data - (2 * img_width)); 758cabdff1aSopenharmony_ci ST_UB(p2_org, data - (3 * img_width)); 759cabdff1aSopenharmony_ci } 760cabdff1aSopenharmony_ci 761cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r); 762cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l); 763cabdff1aSopenharmony_ci 764cabdff1aSopenharmony_ci /* combine */ 765cabdff1aSopenharmony_ci p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r); 766cabdff1aSopenharmony_ci p0_org = __msa_bmnz_v(p0_org, p0, 
negate_is_less_than_beta); 767cabdff1aSopenharmony_ci 768cabdff1aSopenharmony_ci ST_UB(p0_org, data - img_width); 769cabdff1aSopenharmony_ci 770cabdff1aSopenharmony_ci /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */ 771cabdff1aSopenharmony_ci q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); 772cabdff1aSopenharmony_ci is_less_than_beta = (q2_asub_q0 < beta_in); 773cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & tmp_flag; 774cabdff1aSopenharmony_ci negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 775cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 776cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 777cabdff1aSopenharmony_ci 778cabdff1aSopenharmony_ci /* combine and store */ 779cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than_beta)) { 780cabdff1aSopenharmony_ci v8i16 q3_org_r, q3_org_l; 781cabdff1aSopenharmony_ci v16u8 q3_org = LD_UB(data + (3 * img_width)); 782cabdff1aSopenharmony_ci v16u8 q1, q2; 783cabdff1aSopenharmony_ci v8i16 q2_r = { 0 }; 784cabdff1aSopenharmony_ci v8i16 q2_l = { 0 }; 785cabdff1aSopenharmony_ci v8i16 q1_r = { 0 }; 786cabdff1aSopenharmony_ci v8i16 q1_l = { 0 }; 787cabdff1aSopenharmony_ci 788cabdff1aSopenharmony_ci ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r); 789cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r, 790cabdff1aSopenharmony_ci q2_r, p1_org_r, q0_r, q1_r, q2_r); 791cabdff1aSopenharmony_ci 792cabdff1aSopenharmony_ci ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l); 793cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l, 794cabdff1aSopenharmony_ci q2_l, p1_org_l, q0_l, q1_l, q2_l); 795cabdff1aSopenharmony_ci 796cabdff1aSopenharmony_ci PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2); 797cabdff1aSopenharmony_ci q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta); 798cabdff1aSopenharmony_ci q1_org = 
__msa_bmnz_v(q1_org, q1, is_less_than_beta); 799cabdff1aSopenharmony_ci q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta); 800cabdff1aSopenharmony_ci 801cabdff1aSopenharmony_ci ST_UB(q1_org, data + img_width); 802cabdff1aSopenharmony_ci ST_UB(q2_org, data + 2 * img_width); 803cabdff1aSopenharmony_ci } 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r); 806cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l); 807cabdff1aSopenharmony_ci 808cabdff1aSopenharmony_ci /* combine */ 809cabdff1aSopenharmony_ci q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r); 810cabdff1aSopenharmony_ci q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta); 811cabdff1aSopenharmony_ci 812cabdff1aSopenharmony_ci ST_UB(q0_org, data); 813cabdff1aSopenharmony_ci } 814cabdff1aSopenharmony_ci} 815cabdff1aSopenharmony_ci 816cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data, 817cabdff1aSopenharmony_ci uint8_t alpha_in, 818cabdff1aSopenharmony_ci uint8_t beta_in, 819cabdff1aSopenharmony_ci ptrdiff_t img_width) 820cabdff1aSopenharmony_ci{ 821cabdff1aSopenharmony_ci uint8_t *src = data - 4; 822cabdff1aSopenharmony_ci v16u8 alpha, beta, p0_asub_q0; 823cabdff1aSopenharmony_ci v16u8 is_less_than_alpha, is_less_than, is_less_than_beta; 824cabdff1aSopenharmony_ci v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; 825cabdff1aSopenharmony_ci v16u8 p1_asub_p0, q1_asub_q0; 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_ci 828cabdff1aSopenharmony_ci { 829cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 830cabdff1aSopenharmony_ci v16u8 row8, row9, row10, row11, row12, row13, row14, row15; 831cabdff1aSopenharmony_ci 832cabdff1aSopenharmony_ci LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7); 833cabdff1aSopenharmony_ci LD_UB8(src + (8 * img_width), img_width, 834cabdff1aSopenharmony_ci row8, row9, row10, row11, 
row12, row13, row14, row15); 835cabdff1aSopenharmony_ci 836cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, 837cabdff1aSopenharmony_ci row4, row5, row6, row7, 838cabdff1aSopenharmony_ci row8, row9, row10, row11, 839cabdff1aSopenharmony_ci row12, row13, row14, row15, 840cabdff1aSopenharmony_ci p3_org, p2_org, p1_org, p0_org, 841cabdff1aSopenharmony_ci q0_org, q1_org, q2_org, q3_org); 842cabdff1aSopenharmony_ci } 843cabdff1aSopenharmony_ci 844cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); 845cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); 846cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); 847cabdff1aSopenharmony_ci 848cabdff1aSopenharmony_ci alpha = (v16u8) __msa_fill_b(alpha_in); 849cabdff1aSopenharmony_ci beta = (v16u8) __msa_fill_b(beta_in); 850cabdff1aSopenharmony_ci 851cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); 852cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); 853cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than_alpha; 854cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); 855cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 856cabdff1aSopenharmony_ci 857cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than)) { 858cabdff1aSopenharmony_ci v8i16 p0_r = { 0 }; 859cabdff1aSopenharmony_ci v8i16 q0_r = { 0 }; 860cabdff1aSopenharmony_ci v8i16 p0_l = { 0 }; 861cabdff1aSopenharmony_ci v8i16 q0_l = { 0 }; 862cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 863cabdff1aSopenharmony_ci v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0; 864cabdff1aSopenharmony_ci v16u8 negate_is_less_than_beta; 865cabdff1aSopenharmony_ci v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 866cabdff1aSopenharmony_ci v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; 867cabdff1aSopenharmony_ci 868cabdff1aSopenharmony_ci UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); 869cabdff1aSopenharmony_ci UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); 
870cabdff1aSopenharmony_ci UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); 871cabdff1aSopenharmony_ci UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l); 872cabdff1aSopenharmony_ci 873cabdff1aSopenharmony_ci tmp_flag = alpha >> 2; 874cabdff1aSopenharmony_ci tmp_flag = tmp_flag + 2; 875cabdff1aSopenharmony_ci tmp_flag = (p0_asub_q0 < tmp_flag); 876cabdff1aSopenharmony_ci 877cabdff1aSopenharmony_ci p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); 878cabdff1aSopenharmony_ci is_less_than_beta = (p2_asub_p0 < beta); 879cabdff1aSopenharmony_ci is_less_than_beta = tmp_flag & is_less_than_beta; 880cabdff1aSopenharmony_ci negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 881cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 882cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 883cabdff1aSopenharmony_ci 884cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than_beta)) { 885cabdff1aSopenharmony_ci v16u8 p2, p1; 886cabdff1aSopenharmony_ci v8i16 p3_org_r, p3_org_l; 887cabdff1aSopenharmony_ci v8i16 p2_l = { 0 }; 888cabdff1aSopenharmony_ci v8i16 p2_r = { 0 }; 889cabdff1aSopenharmony_ci v8i16 p1_l = { 0 }; 890cabdff1aSopenharmony_ci v8i16 p1_r = { 0 }; 891cabdff1aSopenharmony_ci 892cabdff1aSopenharmony_ci ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r); 893cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r, 894cabdff1aSopenharmony_ci p2_r, q1_org_r, p0_r, p1_r, p2_r); 895cabdff1aSopenharmony_ci 896cabdff1aSopenharmony_ci ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l); 897cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l, 898cabdff1aSopenharmony_ci p2_l, q1_org_l, p0_l, p1_l, p2_l); 899cabdff1aSopenharmony_ci 900cabdff1aSopenharmony_ci PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2); 901cabdff1aSopenharmony_ci p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta); 902cabdff1aSopenharmony_ci p1_org = 
__msa_bmnz_v(p1_org, p1, is_less_than_beta); 903cabdff1aSopenharmony_ci p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta); 904cabdff1aSopenharmony_ci } 905cabdff1aSopenharmony_ci 906cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r); 907cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l); 908cabdff1aSopenharmony_ci 909cabdff1aSopenharmony_ci p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r); 910cabdff1aSopenharmony_ci p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta); 911cabdff1aSopenharmony_ci 912cabdff1aSopenharmony_ci q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); 913cabdff1aSopenharmony_ci is_less_than_beta = (q2_asub_q0 < beta); 914cabdff1aSopenharmony_ci 915cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & tmp_flag; 916cabdff1aSopenharmony_ci negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 917cabdff1aSopenharmony_ci 918cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 919cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 920cabdff1aSopenharmony_ci 921cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than_beta)) { 922cabdff1aSopenharmony_ci v16u8 q1, q2; 923cabdff1aSopenharmony_ci v8i16 q3_org_r, q3_org_l; 924cabdff1aSopenharmony_ci v8i16 q1_l = { 0 }; 925cabdff1aSopenharmony_ci v8i16 q1_r = { 0 }; 926cabdff1aSopenharmony_ci v8i16 q2_l = { 0 }; 927cabdff1aSopenharmony_ci v8i16 q2_r = { 0 }; 928cabdff1aSopenharmony_ci 929cabdff1aSopenharmony_ci ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r); 930cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r, 931cabdff1aSopenharmony_ci q2_r, p1_org_r, q0_r, q1_r, q2_r); 932cabdff1aSopenharmony_ci 933cabdff1aSopenharmony_ci ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l); 934cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l, 935cabdff1aSopenharmony_ci q2_l, p1_org_l, 
q0_l, q1_l, q2_l); 936cabdff1aSopenharmony_ci 937cabdff1aSopenharmony_ci PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2); 938cabdff1aSopenharmony_ci q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta); 939cabdff1aSopenharmony_ci q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); 940cabdff1aSopenharmony_ci q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta); 941cabdff1aSopenharmony_ci } 942cabdff1aSopenharmony_ci 943cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r); 944cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l); 945cabdff1aSopenharmony_ci 946cabdff1aSopenharmony_ci q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r); 947cabdff1aSopenharmony_ci q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta); 948cabdff1aSopenharmony_ci 949cabdff1aSopenharmony_ci { 950cabdff1aSopenharmony_ci v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 951cabdff1aSopenharmony_ci 952cabdff1aSopenharmony_ci ILVRL_B2_SH(p1_org, p2_org, tp0, tp2); 953cabdff1aSopenharmony_ci ILVRL_B2_SH(q0_org, p0_org, tp1, tp3); 954cabdff1aSopenharmony_ci ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5); 955cabdff1aSopenharmony_ci 956cabdff1aSopenharmony_ci ILVRL_H2_SH(tp1, tp0, tmp3, tmp4); 957cabdff1aSopenharmony_ci ILVRL_H2_SH(tp3, tp2, tmp6, tmp7); 958cabdff1aSopenharmony_ci 959cabdff1aSopenharmony_ci src = data - 3; 960cabdff1aSopenharmony_ci ST_W4(tmp3, 0, 1, 2, 3, src, img_width); 961cabdff1aSopenharmony_ci ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width); 962cabdff1aSopenharmony_ci src += 4 * img_width; 963cabdff1aSopenharmony_ci ST_W4(tmp4, 0, 1, 2, 3, src, img_width); 964cabdff1aSopenharmony_ci ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width); 965cabdff1aSopenharmony_ci src += 4 * img_width; 966cabdff1aSopenharmony_ci 967cabdff1aSopenharmony_ci ST_W4(tmp6, 0, 1, 2, 3, src, img_width); 968cabdff1aSopenharmony_ci ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width); 969cabdff1aSopenharmony_ci src += 4 * img_width; 
970cabdff1aSopenharmony_ci ST_W4(tmp7, 0, 1, 2, 3, src, img_width); 971cabdff1aSopenharmony_ci ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width); 972cabdff1aSopenharmony_ci } 973cabdff1aSopenharmony_ci } 974cabdff1aSopenharmony_ci} 975cabdff1aSopenharmony_ci 976cabdff1aSopenharmony_cistatic void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, 977cabdff1aSopenharmony_ci ptrdiff_t stride, 978cabdff1aSopenharmony_ci int32_t alpha_in, 979cabdff1aSopenharmony_ci int32_t beta_in) 980cabdff1aSopenharmony_ci{ 981cabdff1aSopenharmony_ci uint64_t load0, load1; 982cabdff1aSopenharmony_ci uint32_t out0, out2; 983cabdff1aSopenharmony_ci uint16_t out1, out3; 984cabdff1aSopenharmony_ci v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 985cabdff1aSopenharmony_ci v8u16 dst0_r, dst1_r, dst4_r, dst5_r; 986cabdff1aSopenharmony_ci v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r; 987cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y; 988cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 989cabdff1aSopenharmony_ci v16u8 alpha, beta; 990cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0; 991cabdff1aSopenharmony_ci v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; 992cabdff1aSopenharmony_ci v16u8 is_less_than_beta1, is_less_than_beta2; 993cabdff1aSopenharmony_ci v16i8 src0 = { 0 }; 994cabdff1aSopenharmony_ci v16i8 src1 = { 0 }; 995cabdff1aSopenharmony_ci v16i8 src2 = { 0 }; 996cabdff1aSopenharmony_ci v16i8 src3 = { 0 }; 997cabdff1aSopenharmony_ci v16i8 src4 = { 0 }; 998cabdff1aSopenharmony_ci v16i8 src5 = { 0 }; 999cabdff1aSopenharmony_ci v16i8 src6 = { 0 }; 1000cabdff1aSopenharmony_ci v16i8 src7 = { 0 }; 1001cabdff1aSopenharmony_ci v16i8 zeros = { 0 }; 1002cabdff1aSopenharmony_ci 1003cabdff1aSopenharmony_ci load0 = LD(src - 4); 1004cabdff1aSopenharmony_ci load1 = LD(src + stride - 4); 1005cabdff1aSopenharmony_ci src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0); 
1006cabdff1aSopenharmony_ci src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1); 1007cabdff1aSopenharmony_ci 1008cabdff1aSopenharmony_ci load0 = LD(src + (2 * stride) - 4); 1009cabdff1aSopenharmony_ci load1 = LD(src + (3 * stride) - 4); 1010cabdff1aSopenharmony_ci src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0); 1011cabdff1aSopenharmony_ci src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1); 1012cabdff1aSopenharmony_ci 1013cabdff1aSopenharmony_ci load0 = LD(src + (4 * stride) - 4); 1014cabdff1aSopenharmony_ci load1 = LD(src + (5 * stride) - 4); 1015cabdff1aSopenharmony_ci src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0); 1016cabdff1aSopenharmony_ci src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1); 1017cabdff1aSopenharmony_ci 1018cabdff1aSopenharmony_ci load0 = LD(src + (6 * stride) - 4); 1019cabdff1aSopenharmony_ci load1 = LD(src + (7 * stride) - 4); 1020cabdff1aSopenharmony_ci src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0); 1021cabdff1aSopenharmony_ci src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1); 1022cabdff1aSopenharmony_ci 1023cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 1024cabdff1aSopenharmony_ci src0, src1, src2, src3); 1025cabdff1aSopenharmony_ci 1026cabdff1aSopenharmony_ci ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2); 1027cabdff1aSopenharmony_ci ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3); 1028cabdff1aSopenharmony_ci 1029cabdff1aSopenharmony_ci ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3); 1030cabdff1aSopenharmony_ci ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5); 1031cabdff1aSopenharmony_ci SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5, 1032cabdff1aSopenharmony_ci 8, src0, src2, src4, src7); 1033cabdff1aSopenharmony_ci 1034cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); 1035cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); 1036cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) 
src3); 1037cabdff1aSopenharmony_ci 1038cabdff1aSopenharmony_ci alpha = (v16u8) __msa_fill_b(alpha_in); 1039cabdff1aSopenharmony_ci beta = (v16u8) __msa_fill_b(beta_in); 1040cabdff1aSopenharmony_ci 1041cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); 1042cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); 1043cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 1044cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); 1045cabdff1aSopenharmony_ci is_less_than = is_less_than & is_less_than_beta; 1046cabdff1aSopenharmony_ci 1047cabdff1aSopenharmony_ci alpha >>= 2; 1048cabdff1aSopenharmony_ci alpha += 2; 1049cabdff1aSopenharmony_ci 1050cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); 1051cabdff1aSopenharmony_ci 1052cabdff1aSopenharmony_ci p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2); 1053cabdff1aSopenharmony_ci is_less_than_beta1 = (p2_asub_p0 < beta); 1054cabdff1aSopenharmony_ci q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3); 1055cabdff1aSopenharmony_ci is_less_than_beta2 = (q2_asub_q0 < beta); 1056cabdff1aSopenharmony_ci 1057cabdff1aSopenharmony_ci ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1058cabdff1aSopenharmony_ci src0_r, src1_r, src2_r, src3_r); 1059cabdff1aSopenharmony_ci ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1060cabdff1aSopenharmony_ci src4_r, src5_r, src6_r, src7_r); 1061cabdff1aSopenharmony_ci 1062cabdff1aSopenharmony_ci dst2_x_r = src1_r + src2_r + src3_r; 1063cabdff1aSopenharmony_ci dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r; 1064cabdff1aSopenharmony_ci dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3); 1065cabdff1aSopenharmony_ci dst1_r = src0_r + src1_r + src2_r + src3_r; 1066cabdff1aSopenharmony_ci dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2); 1067cabdff1aSopenharmony_ci 1068cabdff1aSopenharmony_ci dst0_r = (2 * src6_r) + (3 * src0_r); 1069cabdff1aSopenharmony_ci dst0_r += src1_r + src2_r + 
src3_r; 1070cabdff1aSopenharmony_ci dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3); 1071cabdff1aSopenharmony_ci dst2_y_r = (2 * src1_r) + src2_r + src4_r; 1072cabdff1aSopenharmony_ci dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2); 1073cabdff1aSopenharmony_ci 1074cabdff1aSopenharmony_ci PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y); 1075cabdff1aSopenharmony_ci dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1); 1076cabdff1aSopenharmony_ci 1077cabdff1aSopenharmony_ci dst3_x_r = src2_r + src3_r + src4_r; 1078cabdff1aSopenharmony_ci dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r; 1079cabdff1aSopenharmony_ci dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3); 1080cabdff1aSopenharmony_ci dst4_r = src2_r + src3_r + src4_r + src5_r; 1081cabdff1aSopenharmony_ci dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2); 1082cabdff1aSopenharmony_ci 1083cabdff1aSopenharmony_ci dst5_r = (2 * src7_r) + (3 * src5_r); 1084cabdff1aSopenharmony_ci dst5_r += src4_r + src3_r + src2_r; 1085cabdff1aSopenharmony_ci dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3); 1086cabdff1aSopenharmony_ci dst3_y_r = (2 * src4_r) + src3_r + src1_r; 1087cabdff1aSopenharmony_ci dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2); 1088cabdff1aSopenharmony_ci 1089cabdff1aSopenharmony_ci PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y); 1090cabdff1aSopenharmony_ci dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2); 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ci dst2_y_r = (2 * src1_r) + src2_r + src4_r; 1093cabdff1aSopenharmony_ci dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2); 1094cabdff1aSopenharmony_ci dst3_y_r = (2 * src4_r) + src3_r + src1_r; 1095cabdff1aSopenharmony_ci dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2); 1096cabdff1aSopenharmony_ci 1097cabdff1aSopenharmony_ci PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y); 1098cabdff1aSopenharmony_ci 1099cabdff1aSopenharmony_ci dst2_x = 
__msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha); 1100cabdff1aSopenharmony_ci dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha); 1101cabdff1aSopenharmony_ci dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than); 1102cabdff1aSopenharmony_ci dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than); 1103cabdff1aSopenharmony_ci 1104cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than; 1105cabdff1aSopenharmony_ci dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r); 1106cabdff1aSopenharmony_ci is_less_than_beta1 = is_less_than_beta1 & is_less_than; 1107cabdff1aSopenharmony_ci dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1); 1108cabdff1aSopenharmony_ci 1109cabdff1aSopenharmony_ci dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); 1110cabdff1aSopenharmony_ci dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1); 1111cabdff1aSopenharmony_ci dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r); 1112cabdff1aSopenharmony_ci is_less_than_beta2 = is_less_than_beta2 & is_less_than; 1113cabdff1aSopenharmony_ci dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2); 1114cabdff1aSopenharmony_ci dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r); 1115cabdff1aSopenharmony_ci dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2); 1116cabdff1aSopenharmony_ci 1117cabdff1aSopenharmony_ci ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1); 1118cabdff1aSopenharmony_ci dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4); 1119cabdff1aSopenharmony_ci ILVRL_H2_SH(dst1, dst0, tmp0, tmp1); 1120cabdff1aSopenharmony_ci ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3); 1121cabdff1aSopenharmony_ci 1122cabdff1aSopenharmony_ci ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4); 1123cabdff1aSopenharmony_ci SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5); 1124cabdff1aSopenharmony_ci dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0); 1125cabdff1aSopenharmony_ci dst2_y = (v16u8) __msa_ilvl_w((v4i32) 
tmp3, (v4i32) tmp1); 1126cabdff1aSopenharmony_ci SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y); 1127cabdff1aSopenharmony_ci 1128cabdff1aSopenharmony_ci out0 = __msa_copy_u_w((v4i32) dst0, 0); 1129cabdff1aSopenharmony_ci out1 = __msa_copy_u_h((v8i16) dst0, 2); 1130cabdff1aSopenharmony_ci out2 = __msa_copy_u_w((v4i32) dst1, 0); 1131cabdff1aSopenharmony_ci out3 = __msa_copy_u_h((v8i16) dst1, 2); 1132cabdff1aSopenharmony_ci 1133cabdff1aSopenharmony_ci SW(out0, (src - 3)); 1134cabdff1aSopenharmony_ci SH(out1, (src + 1)); 1135cabdff1aSopenharmony_ci src += stride; 1136cabdff1aSopenharmony_ci SW(out2, (src - 3)); 1137cabdff1aSopenharmony_ci SH(out3, (src + 1)); 1138cabdff1aSopenharmony_ci src += stride; 1139cabdff1aSopenharmony_ci 1140cabdff1aSopenharmony_ci out0 = __msa_copy_u_w((v4i32) dst2_x, 0); 1141cabdff1aSopenharmony_ci out1 = __msa_copy_u_h((v8i16) dst2_x, 2); 1142cabdff1aSopenharmony_ci out2 = __msa_copy_u_w((v4i32) dst3_x, 0); 1143cabdff1aSopenharmony_ci out3 = __msa_copy_u_h((v8i16) dst3_x, 2); 1144cabdff1aSopenharmony_ci 1145cabdff1aSopenharmony_ci SW(out0, (src - 3)); 1146cabdff1aSopenharmony_ci SH(out1, (src + 1)); 1147cabdff1aSopenharmony_ci src += stride; 1148cabdff1aSopenharmony_ci SW(out2, (src - 3)); 1149cabdff1aSopenharmony_ci SH(out3, (src + 1)); 1150cabdff1aSopenharmony_ci src += stride; 1151cabdff1aSopenharmony_ci 1152cabdff1aSopenharmony_ci out0 = __msa_copy_u_w((v4i32) dst4, 0); 1153cabdff1aSopenharmony_ci out1 = __msa_copy_u_h((v8i16) dst4, 2); 1154cabdff1aSopenharmony_ci out2 = __msa_copy_u_w((v4i32) dst5, 0); 1155cabdff1aSopenharmony_ci out3 = __msa_copy_u_h((v8i16) dst5, 2); 1156cabdff1aSopenharmony_ci 1157cabdff1aSopenharmony_ci SW(out0, (src - 3)); 1158cabdff1aSopenharmony_ci SH(out1, (src + 1)); 1159cabdff1aSopenharmony_ci src += stride; 1160cabdff1aSopenharmony_ci SW(out2, (src - 3)); 1161cabdff1aSopenharmony_ci SH(out3, (src + 1)); 1162cabdff1aSopenharmony_ci src += stride; 1163cabdff1aSopenharmony_ci 
1164cabdff1aSopenharmony_ci out0 = __msa_copy_u_w((v4i32) dst2_y, 0); 1165cabdff1aSopenharmony_ci out1 = __msa_copy_u_h((v8i16) dst2_y, 2); 1166cabdff1aSopenharmony_ci out2 = __msa_copy_u_w((v4i32) dst3_y, 0); 1167cabdff1aSopenharmony_ci out3 = __msa_copy_u_h((v8i16) dst3_y, 2); 1168cabdff1aSopenharmony_ci 1169cabdff1aSopenharmony_ci SW(out0, (src - 3)); 1170cabdff1aSopenharmony_ci SH(out1, (src + 1)); 1171cabdff1aSopenharmony_ci src += stride; 1172cabdff1aSopenharmony_ci SW(out2, (src - 3)); 1173cabdff1aSopenharmony_ci SH(out3, (src + 1)); 1174cabdff1aSopenharmony_ci} 1175cabdff1aSopenharmony_ci 1176cabdff1aSopenharmony_cistatic void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr, 1177cabdff1aSopenharmony_ci uint8_t alpha_in, 1178cabdff1aSopenharmony_ci uint8_t beta_in, 1179cabdff1aSopenharmony_ci ptrdiff_t img_width) 1180cabdff1aSopenharmony_ci{ 1181cabdff1aSopenharmony_ci v16u8 alpha, beta; 1182cabdff1aSopenharmony_ci v16u8 is_less_than; 1183cabdff1aSopenharmony_ci v8i16 p0_or_q0, q0_or_p0; 1184cabdff1aSopenharmony_ci v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org; 1185cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 1186cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 1187cabdff1aSopenharmony_ci v16u8 is_less_than_alpha, is_less_than_beta; 1188cabdff1aSopenharmony_ci v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 1189cabdff1aSopenharmony_ci 1190cabdff1aSopenharmony_ci alpha = (v16u8) __msa_fill_b(alpha_in); 1191cabdff1aSopenharmony_ci beta = (v16u8) __msa_fill_b(beta_in); 1192cabdff1aSopenharmony_ci 1193cabdff1aSopenharmony_ci LD_UB4(data_cb_or_cr - (img_width << 1), img_width, 1194cabdff1aSopenharmony_ci p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org); 1195cabdff1aSopenharmony_ci 1196cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org); 1197cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org); 1198cabdff1aSopenharmony_ci q1_asub_q0 = 
__msa_asub_u_b(q1_or_p1_org, q0_or_p0_org); 1199cabdff1aSopenharmony_ci 1200cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); 1201cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); 1202cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than_alpha; 1203cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); 1204cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 1205cabdff1aSopenharmony_ci 1206cabdff1aSopenharmony_ci is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); 1207cabdff1aSopenharmony_ci 1208cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than)) { 1209cabdff1aSopenharmony_ci ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org, 1210cabdff1aSopenharmony_ci zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r); 1211cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0); 1212cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0); 1213cabdff1aSopenharmony_ci PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0); 1214cabdff1aSopenharmony_ci 1215cabdff1aSopenharmony_ci p0_or_q0_org = 1216cabdff1aSopenharmony_ci __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than); 1217cabdff1aSopenharmony_ci q0_or_p0_org = 1218cabdff1aSopenharmony_ci __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than); 1219cabdff1aSopenharmony_ci 1220cabdff1aSopenharmony_ci ST_UB(q0_or_p0_org, data_cb_or_cr); 1221cabdff1aSopenharmony_ci ST_UB(p0_or_q0_org, data_cb_or_cr - img_width); 1222cabdff1aSopenharmony_ci } 1223cabdff1aSopenharmony_ci} 1224cabdff1aSopenharmony_ci 1225cabdff1aSopenharmony_cistatic void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr, 1226cabdff1aSopenharmony_ci uint8_t alpha_in, 1227cabdff1aSopenharmony_ci uint8_t beta_in, 1228cabdff1aSopenharmony_ci ptrdiff_t img_width) 1229cabdff1aSopenharmony_ci{ 1230cabdff1aSopenharmony_ci v8i16 tmp1; 
1231cabdff1aSopenharmony_ci v16u8 alpha, beta, is_less_than; 1232cabdff1aSopenharmony_ci v8i16 p0_or_q0, q0_or_p0; 1233cabdff1aSopenharmony_ci v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org; 1234cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 1235cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 1236cabdff1aSopenharmony_ci v16u8 is_less_than_alpha, is_less_than_beta; 1237cabdff1aSopenharmony_ci v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 1238cabdff1aSopenharmony_ci 1239cabdff1aSopenharmony_ci { 1240cabdff1aSopenharmony_ci v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 1241cabdff1aSopenharmony_ci 1242cabdff1aSopenharmony_ci LD_UB8((data_cb_or_cr - 2), img_width, 1243cabdff1aSopenharmony_ci row0, row1, row2, row3, row4, row5, row6, row7); 1244cabdff1aSopenharmony_ci 1245cabdff1aSopenharmony_ci TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 1246cabdff1aSopenharmony_ci p1_or_q1_org, p0_or_q0_org, 1247cabdff1aSopenharmony_ci q0_or_p0_org, q1_or_p1_org); 1248cabdff1aSopenharmony_ci } 1249cabdff1aSopenharmony_ci 1250cabdff1aSopenharmony_ci alpha = (v16u8) __msa_fill_b(alpha_in); 1251cabdff1aSopenharmony_ci beta = (v16u8) __msa_fill_b(beta_in); 1252cabdff1aSopenharmony_ci 1253cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org); 1254cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org); 1255cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org); 1256cabdff1aSopenharmony_ci 1257cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); 1258cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); 1259cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than_alpha; 1260cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); 1261cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 1262cabdff1aSopenharmony_ci is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); 
1263cabdff1aSopenharmony_ci 1264cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than)) { 1265cabdff1aSopenharmony_ci ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org, 1266cabdff1aSopenharmony_ci zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r); 1267cabdff1aSopenharmony_ci 1268cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0); 1269cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0); 1270cabdff1aSopenharmony_ci 1271cabdff1aSopenharmony_ci /* convert 16 bit output into 8 bit output */ 1272cabdff1aSopenharmony_ci PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0); 1273cabdff1aSopenharmony_ci 1274cabdff1aSopenharmony_ci p0_or_q0_org = 1275cabdff1aSopenharmony_ci __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than); 1276cabdff1aSopenharmony_ci q0_or_p0_org = 1277cabdff1aSopenharmony_ci __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than); 1278cabdff1aSopenharmony_ci tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org); 1279cabdff1aSopenharmony_ci 1280cabdff1aSopenharmony_ci data_cb_or_cr -= 1; 1281cabdff1aSopenharmony_ci ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width); 1282cabdff1aSopenharmony_ci data_cb_or_cr += 4 * img_width; 1283cabdff1aSopenharmony_ci ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width); 1284cabdff1aSopenharmony_ci } 1285cabdff1aSopenharmony_ci} 1286cabdff1aSopenharmony_ci 1287cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t* pPix, uint32_t iStride, 1288cabdff1aSopenharmony_ci uint8_t iAlpha, uint8_t iBeta, 1289cabdff1aSopenharmony_ci uint8_t* pTc) 1290cabdff1aSopenharmony_ci{ 1291cabdff1aSopenharmony_ci v16u8 p0, p1, p2, q0, q1, q2; 1292cabdff1aSopenharmony_ci v16i8 iTc, negiTc, negTc, flags, f; 1293cabdff1aSopenharmony_ci v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r; 1294cabdff1aSopenharmony_ci v8i16 tc_l, tc_r, negTc_l, negTc_r; 
1295cabdff1aSopenharmony_ci v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r; 1296cabdff1aSopenharmony_ci // Use for temporary variable 1297cabdff1aSopenharmony_ci v8i16 t0, t1, t2, t3; 1298cabdff1aSopenharmony_ci v16u8 alpha, beta; 1299cabdff1aSopenharmony_ci v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0; 1300cabdff1aSopenharmony_ci v16i8 const_1_b = __msa_ldi_b(1); 1301cabdff1aSopenharmony_ci v8i16 const_1_h = __msa_ldi_h(1); 1302cabdff1aSopenharmony_ci v8i16 const_4_h = __msa_ldi_h(4); 1303cabdff1aSopenharmony_ci v8i16 const_not_255_h = __msa_ldi_h(~255); 1304cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 1305cabdff1aSopenharmony_ci v16i8 tc = { pTc[0 >> 2], pTc[1 >> 2], pTc[2 >> 2], pTc[3 >> 2], 1306cabdff1aSopenharmony_ci pTc[4 >> 2], pTc[5 >> 2], pTc[6 >> 2], pTc[7 >> 2], 1307cabdff1aSopenharmony_ci pTc[8 >> 2], pTc[9 >> 2], pTc[10 >> 2], pTc[11 >> 2], 1308cabdff1aSopenharmony_ci pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] }; 1309cabdff1aSopenharmony_ci negTc = zero - tc; 1310cabdff1aSopenharmony_ci iTc = tc; 1311cabdff1aSopenharmony_ci 1312cabdff1aSopenharmony_ci // Load data from pPix 1313cabdff1aSopenharmony_ci LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r); 1314cabdff1aSopenharmony_ci LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r, 1315cabdff1aSopenharmony_ci p2_l, p2_r, q0_l, q0_r); 1316cabdff1aSopenharmony_ci TRANSPOSE16x8_UB_UB(t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r, 1317cabdff1aSopenharmony_ci p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, 1318cabdff1aSopenharmony_ci p2, p1, p0, q0, q1, q2, alpha, beta); 1319cabdff1aSopenharmony_ci 1320cabdff1aSopenharmony_ci alpha = (v16u8)__msa_fill_b(iAlpha); 1321cabdff1aSopenharmony_ci beta = (v16u8)__msa_fill_b(iBeta); 1322cabdff1aSopenharmony_ci 1323cabdff1aSopenharmony_ci bDetaP0Q0 = __msa_asub_u_b(p0, q0); 1324cabdff1aSopenharmony_ci bDetaP1P0 = __msa_asub_u_b(p1, p0); 1325cabdff1aSopenharmony_ci bDetaQ1Q0 = __msa_asub_u_b(q1, q0); 1326cabdff1aSopenharmony_ci 
bDetaP2P0 = __msa_asub_u_b(p2, p0); 1327cabdff1aSopenharmony_ci bDetaQ2Q0 = __msa_asub_u_b(q2, q0); 1328cabdff1aSopenharmony_ci bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha); 1329cabdff1aSopenharmony_ci bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta); 1330cabdff1aSopenharmony_ci bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta); 1331cabdff1aSopenharmony_ci bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta); 1332cabdff1aSopenharmony_ci bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta); 1333cabdff1aSopenharmony_ci 1334cabdff1aSopenharmony_ci // Unsigned extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits 1335cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, p0, p0_r, p0_l); 1336cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, p1, p1_r, p1_l); 1337cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, p2, p2_r, p2_l); 1338cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, q0, q0_r, q0_l); 1339cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, q1, q1_r, q1_l); 1340cabdff1aSopenharmony_ci ILVRL_B2_SH(zero, q2, q2_r, q2_l); 1341cabdff1aSopenharmony_ci // Signed extend tc, negTc from 8 bits to 16 bits 1342cabdff1aSopenharmony_ci flags = __msa_clt_s_b(tc, zero); 1343cabdff1aSopenharmony_ci ILVRL_B2(v8i16, flags, tc, tc_r, tc_l); 1344cabdff1aSopenharmony_ci flags = __msa_clt_s_b(negTc, zero); 1345cabdff1aSopenharmony_ci ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l); 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_ci f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0; 1348cabdff1aSopenharmony_ci flags = f & (v16i8)bDetaP2P0; 1349cabdff1aSopenharmony_ci flags = __msa_ceq_b(flags, zero); 1350cabdff1aSopenharmony_ci iTc += ((~flags) & const_1_b); 1351cabdff1aSopenharmony_ci flags = f & (v16i8)bDetaQ2Q0; 1352cabdff1aSopenharmony_ci flags = __msa_ceq_b(flags, zero); 1353cabdff1aSopenharmony_ci iTc += ((~flags) & const_1_b); 1354cabdff1aSopenharmony_ci negiTc = zero - iTc; 1355cabdff1aSopenharmony_ci // Signed extend iTc, negiTc from 8 bits to 16 bits 1356cabdff1aSopenharmony_ci flags = 
__msa_clt_s_b(iTc, zero); 1357cabdff1aSopenharmony_ci ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l); 1358cabdff1aSopenharmony_ci flags = __msa_clt_s_b(negiTc, zero); 1359cabdff1aSopenharmony_ci ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l); 1360cabdff1aSopenharmony_ci 1361cabdff1aSopenharmony_ci // Calculate the left part 1362cabdff1aSopenharmony_ci // p1 1363cabdff1aSopenharmony_ci t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1; 1364cabdff1aSopenharmony_ci t0 = __msa_max_s_h(negTc_l, t0); 1365cabdff1aSopenharmony_ci t0 = __msa_min_s_h(tc_l, t0); 1366cabdff1aSopenharmony_ci t1 = p1_l + t0; 1367cabdff1aSopenharmony_ci // q1 1368cabdff1aSopenharmony_ci t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1; 1369cabdff1aSopenharmony_ci t0 = __msa_max_s_h(negTc_l, t0); 1370cabdff1aSopenharmony_ci t0 = __msa_min_s_h(tc_l, t0); 1371cabdff1aSopenharmony_ci t2 = q1_l + t0; 1372cabdff1aSopenharmony_ci // iDeta 1373cabdff1aSopenharmony_ci t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3; 1374cabdff1aSopenharmony_ci t0 = __msa_max_s_h(negiTc_l, t0); 1375cabdff1aSopenharmony_ci t0 = __msa_min_s_h(iTc_l, t0); 1376cabdff1aSopenharmony_ci p1_l = t1; 1377cabdff1aSopenharmony_ci q1_l = t2; 1378cabdff1aSopenharmony_ci // p0 1379cabdff1aSopenharmony_ci t1 = p0_l + t0; 1380cabdff1aSopenharmony_ci t2 = t1 & const_not_255_h; 1381cabdff1aSopenharmony_ci t3 = __msa_cle_s_h((v8i16)zero, t1); 1382cabdff1aSopenharmony_ci flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1383cabdff1aSopenharmony_ci p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1384cabdff1aSopenharmony_ci // q0 1385cabdff1aSopenharmony_ci t1 = q0_l - t0; 1386cabdff1aSopenharmony_ci t2 = t1 & const_not_255_h; 1387cabdff1aSopenharmony_ci t3 = __msa_cle_s_h((v8i16)zero, t1); 1388cabdff1aSopenharmony_ci flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1389cabdff1aSopenharmony_ci q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1390cabdff1aSopenharmony_ci 1391cabdff1aSopenharmony_ci // 
Calculate the right part 1392cabdff1aSopenharmony_ci // p1 1393cabdff1aSopenharmony_ci t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1; 1394cabdff1aSopenharmony_ci t0 = __msa_max_s_h(negTc_r, t0); 1395cabdff1aSopenharmony_ci t0 = __msa_min_s_h(tc_r, t0); 1396cabdff1aSopenharmony_ci t1 = p1_r + t0; 1397cabdff1aSopenharmony_ci // q1 1398cabdff1aSopenharmony_ci t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1; 1399cabdff1aSopenharmony_ci t0 = __msa_max_s_h(negTc_r, t0); 1400cabdff1aSopenharmony_ci t0 = __msa_min_s_h(tc_r, t0); 1401cabdff1aSopenharmony_ci t2 = q1_r + t0; 1402cabdff1aSopenharmony_ci // iDeta 1403cabdff1aSopenharmony_ci t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3; 1404cabdff1aSopenharmony_ci t0 = __msa_max_s_h(negiTc_r, t0); 1405cabdff1aSopenharmony_ci t0 = __msa_min_s_h(iTc_r, t0); 1406cabdff1aSopenharmony_ci p1_r = t1; 1407cabdff1aSopenharmony_ci q1_r = t2; 1408cabdff1aSopenharmony_ci // p0 1409cabdff1aSopenharmony_ci t1 = p0_r + t0; 1410cabdff1aSopenharmony_ci t2 = t1 & const_not_255_h; 1411cabdff1aSopenharmony_ci t3 = __msa_cle_s_h((v8i16)zero, t1); 1412cabdff1aSopenharmony_ci flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1413cabdff1aSopenharmony_ci p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1414cabdff1aSopenharmony_ci // q0 1415cabdff1aSopenharmony_ci t1 = q0_r - t0; 1416cabdff1aSopenharmony_ci t2 = t1 & const_not_255_h; 1417cabdff1aSopenharmony_ci t3 = __msa_cle_s_h((v8i16)zero, t1); 1418cabdff1aSopenharmony_ci flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1419cabdff1aSopenharmony_ci q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1420cabdff1aSopenharmony_ci 1421cabdff1aSopenharmony_ci // Combined left and right 1422cabdff1aSopenharmony_ci PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r, 1423cabdff1aSopenharmony_ci t0, t1, t2, t3); 1424cabdff1aSopenharmony_ci flags = (v16i8)__msa_cle_s_b(zero, tc); 1425cabdff1aSopenharmony_ci flags &= f; 1426cabdff1aSopenharmony_ci p0 = 
(v16u8)(((v16i8)t1 & flags) + (p0 & (~flags))); 1427cabdff1aSopenharmony_ci q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags))); 1428cabdff1aSopenharmony_ci // Using t1, t2 as temporary flags 1429cabdff1aSopenharmony_ci t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero)))); 1430cabdff1aSopenharmony_ci p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1)); 1431cabdff1aSopenharmony_ci t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero)))); 1432cabdff1aSopenharmony_ci q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2)); 1433cabdff1aSopenharmony_ci 1434cabdff1aSopenharmony_ci ILVRL_B2_SH(p0, p1, t0, t1); 1435cabdff1aSopenharmony_ci ILVRL_B2_SH(q1, q0, t2, t3); 1436cabdff1aSopenharmony_ci ILVRL_H2_UB(t2, t0, p1, p0); 1437cabdff1aSopenharmony_ci ILVRL_H2_UB(t3, t1, q0, q1); 1438cabdff1aSopenharmony_ci // Store data to pPix 1439cabdff1aSopenharmony_ci ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride); 1440cabdff1aSopenharmony_ci ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride); 1441cabdff1aSopenharmony_ci} 1442cabdff1aSopenharmony_ci 1443cabdff1aSopenharmony_cistatic void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data, 1444cabdff1aSopenharmony_ci uint8_t bs0, uint8_t bs1, 1445cabdff1aSopenharmony_ci uint8_t bs2, uint8_t bs3, 1446cabdff1aSopenharmony_ci uint8_t tc0, uint8_t tc1, 1447cabdff1aSopenharmony_ci uint8_t tc2, uint8_t tc3, 1448cabdff1aSopenharmony_ci uint8_t alpha_in, 1449cabdff1aSopenharmony_ci uint8_t beta_in, 1450cabdff1aSopenharmony_ci ptrdiff_t image_width) 1451cabdff1aSopenharmony_ci{ 1452cabdff1aSopenharmony_ci v16u8 tmp_vec; 1453cabdff1aSopenharmony_ci v16u8 bs = { 0 }; 1454cabdff1aSopenharmony_ci 1455cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(bs0); 1456cabdff1aSopenharmony_ci bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec); 1457cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(bs1); 1458cabdff1aSopenharmony_ci bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec); 
1459cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(bs2); 1460cabdff1aSopenharmony_ci bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec); 1461cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(bs3); 1462cabdff1aSopenharmony_ci bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec); 1463cabdff1aSopenharmony_ci 1464cabdff1aSopenharmony_ci if (!__msa_test_bz_v(bs)) { 1465cabdff1aSopenharmony_ci v16u8 alpha, beta, is_less_than, is_less_than_beta; 1466cabdff1aSopenharmony_ci v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org; 1467cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 1468cabdff1aSopenharmony_ci v16u8 is_less_than_alpha, is_bs_greater_than0; 1469cabdff1aSopenharmony_ci v8i16 p0_r, q0_r, p0_l, q0_l; 1470cabdff1aSopenharmony_ci v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 1471cabdff1aSopenharmony_ci v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; 1472cabdff1aSopenharmony_ci v16i8 zero = { 0 }; 1473cabdff1aSopenharmony_ci v16i8 tc = { 0 }; 1474cabdff1aSopenharmony_ci 1475cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(tc0); 1476cabdff1aSopenharmony_ci tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec); 1477cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(tc1); 1478cabdff1aSopenharmony_ci tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec); 1479cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(tc2); 1480cabdff1aSopenharmony_ci tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec); 1481cabdff1aSopenharmony_ci tmp_vec = (v16u8) __msa_fill_b(tc3); 1482cabdff1aSopenharmony_ci tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec); 1483cabdff1aSopenharmony_ci 1484cabdff1aSopenharmony_ci alpha = (v16u8) __msa_fill_b(alpha_in); 1485cabdff1aSopenharmony_ci beta = (v16u8) __msa_fill_b(beta_in); 1486cabdff1aSopenharmony_ci 1487cabdff1aSopenharmony_ci LD_UB5(data - (3 * image_width), image_width, 1488cabdff1aSopenharmony_ci p2_org, p1_org, p0_org, q0_org, q1_org); 
1489cabdff1aSopenharmony_ci 1490cabdff1aSopenharmony_ci is_bs_greater_than0 = ((v16u8) zero < bs); 1491cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); 1492cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); 1493cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); 1494cabdff1aSopenharmony_ci 1495cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); 1496cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); 1497cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than_alpha; 1498cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); 1499cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 1500cabdff1aSopenharmony_ci is_less_than = is_less_than & is_bs_greater_than0; 1501cabdff1aSopenharmony_ci 1502cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than)) { 1503cabdff1aSopenharmony_ci v16i8 sign_negate_tc, negate_tc; 1504cabdff1aSopenharmony_ci v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r; 1505cabdff1aSopenharmony_ci v16u8 p2_asub_p0, q2_asub_q0; 1506cabdff1aSopenharmony_ci 1507cabdff1aSopenharmony_ci q2_org = LD_UB(data + (2 * image_width)); 1508cabdff1aSopenharmony_ci negate_tc = zero - tc; 1509cabdff1aSopenharmony_ci sign_negate_tc = __msa_clti_s_b(negate_tc, 0); 1510cabdff1aSopenharmony_ci 1511cabdff1aSopenharmony_ci ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l); 1512cabdff1aSopenharmony_ci 1513cabdff1aSopenharmony_ci UNPCK_UB_SH(tc, tc_r, tc_l); 1514cabdff1aSopenharmony_ci UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); 1515cabdff1aSopenharmony_ci UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); 1516cabdff1aSopenharmony_ci UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); 1517cabdff1aSopenharmony_ci 1518cabdff1aSopenharmony_ci p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); 1519cabdff1aSopenharmony_ci is_less_than_beta = (p2_asub_p0 < beta); 1520cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 
1521cabdff1aSopenharmony_ci 1522cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than_beta)) { 1523cabdff1aSopenharmony_ci v16u8 p1; 1524cabdff1aSopenharmony_ci v8i16 p1_r = { 0 }; 1525cabdff1aSopenharmony_ci v8i16 p1_l = { 0 }; 1526cabdff1aSopenharmony_ci v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org); 1527cabdff1aSopenharmony_ci v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org); 1528cabdff1aSopenharmony_ci 1529cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r, 1530cabdff1aSopenharmony_ci negate_tc_r, tc_r, p1_r); 1531cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l, 1532cabdff1aSopenharmony_ci i16_negatetc_l, tc_l, p1_l); 1533cabdff1aSopenharmony_ci 1534cabdff1aSopenharmony_ci p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r); 1535cabdff1aSopenharmony_ci p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); 1536cabdff1aSopenharmony_ci ST_UB(p1_org, data - (2 * image_width)); 1537cabdff1aSopenharmony_ci 1538cabdff1aSopenharmony_ci is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); 1539cabdff1aSopenharmony_ci tc = tc + (v16i8) is_less_than_beta; 1540cabdff1aSopenharmony_ci } 1541cabdff1aSopenharmony_ci 1542cabdff1aSopenharmony_ci q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); 1543cabdff1aSopenharmony_ci is_less_than_beta = (q2_asub_q0 < beta); 1544cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 1545cabdff1aSopenharmony_ci 1546cabdff1aSopenharmony_ci q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org); 1547cabdff1aSopenharmony_ci q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org); 1548cabdff1aSopenharmony_ci 1549cabdff1aSopenharmony_ci if (!__msa_test_bz_v(is_less_than_beta)) { 1550cabdff1aSopenharmony_ci v16u8 q1; 1551cabdff1aSopenharmony_ci v8i16 q1_r = { 0 }; 1552cabdff1aSopenharmony_ci v8i16 q1_l = { 0 }; 1553cabdff1aSopenharmony_ci v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org); 1554cabdff1aSopenharmony_ci v8i16 
q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org); 1555cabdff1aSopenharmony_ci 1556cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r, 1557cabdff1aSopenharmony_ci negate_tc_r, tc_r, q1_r); 1558cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l, 1559cabdff1aSopenharmony_ci i16_negatetc_l, tc_l, q1_l); 1560cabdff1aSopenharmony_ci 1561cabdff1aSopenharmony_ci q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r); 1562cabdff1aSopenharmony_ci q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); 1563cabdff1aSopenharmony_ci ST_UB(q1_org, data + image_width); 1564cabdff1aSopenharmony_ci 1565cabdff1aSopenharmony_ci is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); 1566cabdff1aSopenharmony_ci tc = tc + (v16i8) is_less_than_beta; 1567cabdff1aSopenharmony_ci } 1568cabdff1aSopenharmony_ci { 1569cabdff1aSopenharmony_ci v16i8 negate_thresh, sign_negate_thresh; 1570cabdff1aSopenharmony_ci v8i16 threshold_r, threshold_l; 1571cabdff1aSopenharmony_ci v8i16 negate_thresh_l, negate_thresh_r; 1572cabdff1aSopenharmony_ci 1573cabdff1aSopenharmony_ci negate_thresh = zero - tc; 1574cabdff1aSopenharmony_ci sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0); 1575cabdff1aSopenharmony_ci 1576cabdff1aSopenharmony_ci ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh, 1577cabdff1aSopenharmony_ci threshold_r, negate_thresh_r); 1578cabdff1aSopenharmony_ci AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, 1579cabdff1aSopenharmony_ci negate_thresh_r, threshold_r, p0_r, q0_r); 1580cabdff1aSopenharmony_ci 1581cabdff1aSopenharmony_ci threshold_l = (v8i16) __msa_ilvl_b(zero, tc); 1582cabdff1aSopenharmony_ci negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh, 1583cabdff1aSopenharmony_ci negate_thresh); 1584cabdff1aSopenharmony_ci AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l, 1585cabdff1aSopenharmony_ci negate_thresh_l, threshold_l, p0_l, q0_l); 1586cabdff1aSopenharmony_ci } 1587cabdff1aSopenharmony_ci 
1588cabdff1aSopenharmony_ci PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0); 1589cabdff1aSopenharmony_ci 1590cabdff1aSopenharmony_ci p0_org = __msa_bmnz_v(p0_org, p0, is_less_than); 1591cabdff1aSopenharmony_ci q0_org = __msa_bmnz_v(q0_org, q0, is_less_than); 1592cabdff1aSopenharmony_ci 1593cabdff1aSopenharmony_ci ST_UB(p0_org, (data - image_width)); 1594cabdff1aSopenharmony_ci ST_UB(q0_org, data); 1595cabdff1aSopenharmony_ci } 1596cabdff1aSopenharmony_ci } 1597cabdff1aSopenharmony_ci} 1598cabdff1aSopenharmony_ci 1599cabdff1aSopenharmony_cistatic void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride, 1600cabdff1aSopenharmony_ci int32_t alpha_in, int32_t beta_in, 1601cabdff1aSopenharmony_ci int8_t *tc0) 1602cabdff1aSopenharmony_ci{ 1603cabdff1aSopenharmony_ci uint8_t *data = in; 1604cabdff1aSopenharmony_ci uint32_t out0, out1, out2, out3; 1605cabdff1aSopenharmony_ci uint64_t load; 1606cabdff1aSopenharmony_ci uint32_t tc_val; 1607cabdff1aSopenharmony_ci v16u8 alpha, beta; 1608cabdff1aSopenharmony_ci v16i8 inp0 = { 0 }; 1609cabdff1aSopenharmony_ci v16i8 inp1 = { 0 }; 1610cabdff1aSopenharmony_ci v16i8 inp2 = { 0 }; 1611cabdff1aSopenharmony_ci v16i8 inp3 = { 0 }; 1612cabdff1aSopenharmony_ci v16i8 inp4 = { 0 }; 1613cabdff1aSopenharmony_ci v16i8 inp5 = { 0 }; 1614cabdff1aSopenharmony_ci v16i8 inp6 = { 0 }; 1615cabdff1aSopenharmony_ci v16i8 inp7 = { 0 }; 1616cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3; 1617cabdff1aSopenharmony_ci v8i16 src4, src5, src6, src7; 1618cabdff1aSopenharmony_ci v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0; 1619cabdff1aSopenharmony_ci v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; 1620cabdff1aSopenharmony_ci v16u8 is_less_than_beta1, is_less_than_beta2; 1621cabdff1aSopenharmony_ci v8i16 tc, tc_orig_r, tc_plus1; 1622cabdff1aSopenharmony_ci v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 }; 1623cabdff1aSopenharmony_ci v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1; 1624cabdff1aSopenharmony_ci v8i16 
src2_r, src3_r; 1625cabdff1aSopenharmony_ci v8i16 p2_r, p1_r, q2_r, q1_r; 1626cabdff1aSopenharmony_ci v16u8 p2, q2, p0, q0; 1627cabdff1aSopenharmony_ci v4i32 dst0, dst1; 1628cabdff1aSopenharmony_ci v16i8 zeros = { 0 }; 1629cabdff1aSopenharmony_ci 1630cabdff1aSopenharmony_ci alpha = (v16u8) __msa_fill_b(alpha_in); 1631cabdff1aSopenharmony_ci beta = (v16u8) __msa_fill_b(beta_in); 1632cabdff1aSopenharmony_ci 1633cabdff1aSopenharmony_ci if (tc0[0] < 0) { 1634cabdff1aSopenharmony_ci data += (2 * stride); 1635cabdff1aSopenharmony_ci } else { 1636cabdff1aSopenharmony_ci load = LD(data - 3); 1637cabdff1aSopenharmony_ci inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load); 1638cabdff1aSopenharmony_ci load = LD(data - 3 + stride); 1639cabdff1aSopenharmony_ci inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load); 1640cabdff1aSopenharmony_ci data += (2 * stride); 1641cabdff1aSopenharmony_ci } 1642cabdff1aSopenharmony_ci 1643cabdff1aSopenharmony_ci if (tc0[1] < 0) { 1644cabdff1aSopenharmony_ci data += (2 * stride); 1645cabdff1aSopenharmony_ci } else { 1646cabdff1aSopenharmony_ci load = LD(data - 3); 1647cabdff1aSopenharmony_ci inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load); 1648cabdff1aSopenharmony_ci load = LD(data - 3 + stride); 1649cabdff1aSopenharmony_ci inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load); 1650cabdff1aSopenharmony_ci data += (2 * stride); 1651cabdff1aSopenharmony_ci } 1652cabdff1aSopenharmony_ci 1653cabdff1aSopenharmony_ci if (tc0[2] < 0) { 1654cabdff1aSopenharmony_ci data += (2 * stride); 1655cabdff1aSopenharmony_ci } else { 1656cabdff1aSopenharmony_ci load = LD(data - 3); 1657cabdff1aSopenharmony_ci inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load); 1658cabdff1aSopenharmony_ci load = LD(data - 3 + stride); 1659cabdff1aSopenharmony_ci inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load); 1660cabdff1aSopenharmony_ci data += (2 * stride); 1661cabdff1aSopenharmony_ci } 1662cabdff1aSopenharmony_ci 1663cabdff1aSopenharmony_ci if (tc0[3] < 0) { 
1664cabdff1aSopenharmony_ci data += (2 * stride); 1665cabdff1aSopenharmony_ci } else { 1666cabdff1aSopenharmony_ci load = LD(data - 3); 1667cabdff1aSopenharmony_ci inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load); 1668cabdff1aSopenharmony_ci load = LD(data - 3 + stride); 1669cabdff1aSopenharmony_ci inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load); 1670cabdff1aSopenharmony_ci data += (2 * stride); 1671cabdff1aSopenharmony_ci } 1672cabdff1aSopenharmony_ci 1673cabdff1aSopenharmony_ci ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6, 1674cabdff1aSopenharmony_ci src0, src1, src2, src3); 1675cabdff1aSopenharmony_ci 1676cabdff1aSopenharmony_ci ILVR_H2_SH(src1, src0, src3, src2, src4, src6); 1677cabdff1aSopenharmony_ci ILVL_H2_SH(src1, src0, src3, src2, src5, src7); 1678cabdff1aSopenharmony_ci 1679cabdff1aSopenharmony_ci src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4); 1680cabdff1aSopenharmony_ci src1 = __msa_sldi_b(zeros, (v16i8) src0, 8); 1681cabdff1aSopenharmony_ci src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4); 1682cabdff1aSopenharmony_ci src3 = __msa_sldi_b(zeros, (v16i8) src2, 8); 1683cabdff1aSopenharmony_ci src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5); 1684cabdff1aSopenharmony_ci src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8); 1685cabdff1aSopenharmony_ci 1686cabdff1aSopenharmony_ci p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); 1687cabdff1aSopenharmony_ci p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); 1688cabdff1aSopenharmony_ci q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3); 1689cabdff1aSopenharmony_ci p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2); 1690cabdff1aSopenharmony_ci q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3); 1691cabdff1aSopenharmony_ci 1692cabdff1aSopenharmony_ci is_less_than_alpha = (p0_asub_q0 < alpha); 1693cabdff1aSopenharmony_ci is_less_than_beta = (p1_asub_p0 < beta); 1694cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 
1695cabdff1aSopenharmony_ci is_less_than_beta = (q1_asub_q0 < beta); 1696cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 1697cabdff1aSopenharmony_ci 1698cabdff1aSopenharmony_ci is_less_than_beta1 = (p2_asub_p0 < beta); 1699cabdff1aSopenharmony_ci is_less_than_beta2 = (q2_asub_q0 < beta); 1700cabdff1aSopenharmony_ci 1701cabdff1aSopenharmony_ci p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2); 1702cabdff1aSopenharmony_ci p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0); 1703cabdff1aSopenharmony_ci p0_add_q0 = __msa_srari_h(p0_add_q0, 1); 1704cabdff1aSopenharmony_ci 1705cabdff1aSopenharmony_ci ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r); 1706cabdff1aSopenharmony_ci p2_r += p0_add_q0; 1707cabdff1aSopenharmony_ci p2_r >>= 1; 1708cabdff1aSopenharmony_ci p2_r -= p1_r; 1709cabdff1aSopenharmony_ci ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r); 1710cabdff1aSopenharmony_ci q2_r += p0_add_q0; 1711cabdff1aSopenharmony_ci q2_r >>= 1; 1712cabdff1aSopenharmony_ci q2_r -= q1_r; 1713cabdff1aSopenharmony_ci 1714cabdff1aSopenharmony_ci tc_val = LW(tc0); 1715cabdff1aSopenharmony_ci tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val); 1716cabdff1aSopenharmony_ci tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig); 1717cabdff1aSopenharmony_ci is_tc_orig1 = tc_orig; 1718cabdff1aSopenharmony_ci is_tc_orig2 = tc_orig; 1719cabdff1aSopenharmony_ci tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig); 1720cabdff1aSopenharmony_ci tc = tc_orig_r; 1721cabdff1aSopenharmony_ci 1722cabdff1aSopenharmony_ci CLIP_SH(p2_r, -tc_orig_r, tc_orig_r); 1723cabdff1aSopenharmony_ci CLIP_SH(q2_r, -tc_orig_r, tc_orig_r); 1724cabdff1aSopenharmony_ci 1725cabdff1aSopenharmony_ci p2_r += p1_r; 1726cabdff1aSopenharmony_ci q2_r += q1_r; 1727cabdff1aSopenharmony_ci 1728cabdff1aSopenharmony_ci PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2); 1729cabdff1aSopenharmony_ci 1730cabdff1aSopenharmony_ci is_tc_orig1 = (zeros < 
is_tc_orig1); 1731cabdff1aSopenharmony_ci is_tc_orig2 = is_tc_orig1; 1732cabdff1aSopenharmony_ci is_tc_orig1 = is_less_than_beta1 & is_tc_orig1; 1733cabdff1aSopenharmony_ci is_tc_orig2 = is_less_than_beta2 & is_tc_orig2; 1734cabdff1aSopenharmony_ci is_tc_orig1 = is_less_than & is_tc_orig1; 1735cabdff1aSopenharmony_ci is_tc_orig2 = is_less_than & is_tc_orig2; 1736cabdff1aSopenharmony_ci 1737cabdff1aSopenharmony_ci p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1); 1738cabdff1aSopenharmony_ci q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2); 1739cabdff1aSopenharmony_ci 1740cabdff1aSopenharmony_ci q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0); 1741cabdff1aSopenharmony_ci q0_sub_p0 <<= 2; 1742cabdff1aSopenharmony_ci p1_sub_q1 = p1_r - q1_r; 1743cabdff1aSopenharmony_ci q0_sub_p0 += p1_sub_q1; 1744cabdff1aSopenharmony_ci q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3); 1745cabdff1aSopenharmony_ci 1746cabdff1aSopenharmony_ci tc_plus1 = tc + 1; 1747cabdff1aSopenharmony_ci is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1, 1748cabdff1aSopenharmony_ci (v16i8) is_less_than_beta1); 1749cabdff1aSopenharmony_ci tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1); 1750cabdff1aSopenharmony_ci tc_plus1 = tc + 1; 1751cabdff1aSopenharmony_ci is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2, 1752cabdff1aSopenharmony_ci (v16i8) is_less_than_beta2); 1753cabdff1aSopenharmony_ci tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2); 1754cabdff1aSopenharmony_ci 1755cabdff1aSopenharmony_ci CLIP_SH(q0_sub_p0, -tc, tc); 1756cabdff1aSopenharmony_ci 1757cabdff1aSopenharmony_ci ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r); 1758cabdff1aSopenharmony_ci src2_r += q0_sub_p0; 1759cabdff1aSopenharmony_ci src3_r -= q0_sub_p0; 1760cabdff1aSopenharmony_ci 1761cabdff1aSopenharmony_ci CLIP_SH2_0_255(src2_r, src3_r); 1762cabdff1aSopenharmony_ci 1763cabdff1aSopenharmony_ci PCKEV_B2_UB(src2_r, src2_r, 
src3_r, src3_r, p0, q0); 1764cabdff1aSopenharmony_ci 1765cabdff1aSopenharmony_ci p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than); 1766cabdff1aSopenharmony_ci q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than); 1767cabdff1aSopenharmony_ci 1768cabdff1aSopenharmony_ci ILVR_B2_UB(p0, p2, q2, q0, p2, q2); 1769cabdff1aSopenharmony_ci 1770cabdff1aSopenharmony_ci ILVRL_H2_SW(q2, p2, dst0, dst1); 1771cabdff1aSopenharmony_ci 1772cabdff1aSopenharmony_ci data = in; 1773cabdff1aSopenharmony_ci 1774cabdff1aSopenharmony_ci out0 = __msa_copy_u_w(dst0, 0); 1775cabdff1aSopenharmony_ci out1 = __msa_copy_u_w(dst0, 1); 1776cabdff1aSopenharmony_ci out2 = __msa_copy_u_w(dst0, 2); 1777cabdff1aSopenharmony_ci out3 = __msa_copy_u_w(dst0, 3); 1778cabdff1aSopenharmony_ci 1779cabdff1aSopenharmony_ci if (tc0[0] < 0) { 1780cabdff1aSopenharmony_ci data += (2 * stride); 1781cabdff1aSopenharmony_ci } else { 1782cabdff1aSopenharmony_ci SW(out0, (data - 2)); 1783cabdff1aSopenharmony_ci data += stride; 1784cabdff1aSopenharmony_ci SW(out1, (data - 2)); 1785cabdff1aSopenharmony_ci data += stride; 1786cabdff1aSopenharmony_ci } 1787cabdff1aSopenharmony_ci 1788cabdff1aSopenharmony_ci if (tc0[1] < 0) { 1789cabdff1aSopenharmony_ci data += (2 * stride); 1790cabdff1aSopenharmony_ci } else { 1791cabdff1aSopenharmony_ci SW(out2, (data - 2)); 1792cabdff1aSopenharmony_ci data += stride; 1793cabdff1aSopenharmony_ci SW(out3, (data - 2)); 1794cabdff1aSopenharmony_ci data += stride; 1795cabdff1aSopenharmony_ci } 1796cabdff1aSopenharmony_ci 1797cabdff1aSopenharmony_ci out0 = __msa_copy_u_w(dst1, 0); 1798cabdff1aSopenharmony_ci out1 = __msa_copy_u_w(dst1, 1); 1799cabdff1aSopenharmony_ci out2 = __msa_copy_u_w(dst1, 2); 1800cabdff1aSopenharmony_ci out3 = __msa_copy_u_w(dst1, 3); 1801cabdff1aSopenharmony_ci 1802cabdff1aSopenharmony_ci if (tc0[2] < 0) { 1803cabdff1aSopenharmony_ci data += (2 * stride); 1804cabdff1aSopenharmony_ci } else { 1805cabdff1aSopenharmony_ci SW(out0, (data - 2)); 1806cabdff1aSopenharmony_ci data += 
stride; 1807cabdff1aSopenharmony_ci SW(out1, (data - 2)); 1808cabdff1aSopenharmony_ci data += stride; 1809cabdff1aSopenharmony_ci } 1810cabdff1aSopenharmony_ci 1811cabdff1aSopenharmony_ci if (tc0[3] >= 0) { 1812cabdff1aSopenharmony_ci SW(out2, (data - 2)); 1813cabdff1aSopenharmony_ci data += stride; 1814cabdff1aSopenharmony_ci SW(out3, (data - 2)); 1815cabdff1aSopenharmony_ci } 1816cabdff1aSopenharmony_ci}

/*
 * Build a vector whose four low halfwords hold b0..b3, each value duplicated
 * into both bytes of its halfword; the four high halfwords stay zero.
 *
 * The chroma inter-edge filters below use this for both the boundary
 * strengths and the tc clipping values, so every bs/tc entry governs two
 * neighbouring pixels (4 entries -> 8 pixels).
 */
static inline v8i16 avc_pack_byte_pairs_msa(uint8_t b0, uint8_t b1,
                                            uint8_t b2, uint8_t b3)
{
    v8i16 packed = { 0 };

    packed = __msa_insve_h(packed, 0, (v8i16) __msa_fill_b(b0));
    packed = __msa_insve_h(packed, 1, (v8i16) __msa_fill_b(b1));
    packed = __msa_insve_h(packed, 2, (v8i16) __msa_fill_b(b2));
    packed = __msa_insve_h(packed, 3, (v8i16) __msa_fill_b(b3));

    return packed;
}

/*
 * Chroma inter-edge deblocking where p1/p0/q0/q1 are four consecutive rows
 * (the rows at data - 2 * img_width .. data + img_width).
 *
 * data       points at the q0 row
 * bs0..bs3   boundary strength per pixel pair; a lane is filtered only when
 *            its bs, read as a signed byte, is > 0
 * tc0..tc3   clipping value per pixel pair
 * alpha_in   threshold for |p0 - q0|
 * beta_in    threshold for |p1 - p0| and |q1 - q0|
 * img_width  distance in bytes between rows
 *
 * Only p0 and q0 are modified. The filter mask is cleared in its upper
 * 64 bits, so at most the first 8 pixels of each row change; the full vectors
 * are still written back with ST_UB (presumably a 16-byte store - the macro
 * is defined in generic_macros_msa.h).
 */
static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       ptrdiff_t img_width)
{
    v16u8 alpha, beta;
    v8i16 bs, tc;
    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than;
    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
    v8i16 p0_r, q0_r;
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16i8 negate_tc, sign_negate_tc;
    v8i16 tc_r, negate_tc_r;
    v16i8 zero = { 0 };

    bs = avc_pack_byte_pairs_msa(bs0, bs1, bs2, bs3);

    /* Nothing to do when every boundary strength is zero. */
    if (__msa_test_bz_v((v16u8) bs)) {
        return;
    }

    tc = avc_pack_byte_pairs_msa(tc0, tc1, tc2, tc3);

    is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    LD_UB4(data - (img_width << 1), img_width,
           p1_org, p0_org, q0_org, q1_org);

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = is_less_than & is_bs_greater_than0;

    /* Keep only the low 8 lanes of the mask. */
    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    /* No lane passed the thresholds: leave the picture untouched. */
    if (__msa_test_bz_v(is_less_than)) {
        return;
    }

    /* Sign-extend -tc to halfwords, zero-extend tc and the pixels. */
    negate_tc = zero - (v16i8) tc;
    sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

    ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);

    ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
               p1_org_r, p0_org_r, q0_org_r, q1_org_r);

    AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                 tc_r, p0_r, q0_r);

    PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

    /* Take the filtered value only in lanes selected by the mask. */
    p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
    q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

    ST_UB(q0_org, data);
    ST_UB(p0_org, (data - img_width));
}

/*
 * Chroma inter-edge deblocking where p1/p0/q0/q1 are four consecutive bytes
 * within each row (the bytes at data - 2 .. data + 1), for 8 rows.
 *
 * data       points at the q0 column of the first row
 * bs0..bs3   boundary strength per pair of rows; a lane is filtered only
 *            when its bs, read as a signed byte, is > 0
 * tc0..tc3   clipping value per pair of rows
 * alpha_in   threshold for |p0 - q0|
 * beta_in    threshold for |p1 - p0| and |q1 - q0|
 * img_width  distance in bytes between rows
 *
 * Eight rows are loaded from data - 2 and transposed into p1/p0/q0/q1
 * vectors. After filtering, p0 and q0 are interleaved and written back with
 * two ST_H4 calls at data - 1 (presumably one (p0, q0) halfword per row -
 * the macro is defined in generic_macros_msa.h).
 */
static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       ptrdiff_t img_width)
{
    uint8_t *src;
    v16u8 alpha, beta;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
    v16u8 p0, q0;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16u8 is_bs_greater_than0;
    v8i16 tc_r, negate_tc_r;
    v16i8 negate_tc, sign_negate_tc;
    v16i8 zero = { 0 };
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v8i16 tmp1, bs, tc;

    bs = avc_pack_byte_pairs_msa(bs0, bs1, bs2, bs3);

    /* Nothing to do when every boundary strength is zero. */
    if (__msa_test_bz_v((v16u8) bs)) {
        return;
    }

    tc = avc_pack_byte_pairs_msa(tc0, tc1, tc2, tc3);

    is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

    LD_UB8((data - 2), img_width,
           row0, row1, row2, row3, row4, row5, row6, row7);

    TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
                       row4, row5, row6, row7,
                       p1_org, p0_org, q0_org, q1_org);

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = is_bs_greater_than0 & is_less_than;

    /* Keep only the low 8 lanes of the mask. */
    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    /* No lane passed the thresholds: leave the picture untouched. */
    if (__msa_test_bz_v(is_less_than)) {
        return;
    }

    ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
               p1_org_r, p0_org_r, q0_org_r, q1_org_r);

    /* Sign-extend -tc to halfwords, zero-extend tc. */
    negate_tc = zero - (v16i8) tc;
    sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

    ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);

    AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                 tc_r, p0_r, q0_r);

    PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

    /* Take the filtered value only in lanes selected by the mask. */
    p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
    q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

    /* Interleave so that each halfword is the (p0, q0) pair of one row. */
    tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
    src = data - 1;
    ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
    src += 4 * img_width;
    ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
}

/*
 * 4:2:2 chroma deblocking across a vertical edge: four segments of four
 * rows each, one tc0 entry per segment.
 *
 * A segment whose tc0 entry is <= 0 is skipped. For the others the result of
 * AVC_LPF_H_CHROMA_422 is stored with ST_H4 at src - 1 (presumably one
 * halfword per row - both macros are defined outside this function).
 */
static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
                                            int32_t alpha_in, int32_t beta_in,
                                            int8_t *tc0)
{
    int32_t col, tc_val;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = tc0[col];

        if (tc_val <= 0) {
            src += (4 * stride);
            continue;
        }

        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
        ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
        src += (4 * stride);
    }
}

/*
 * MBAFF variant of the 4:2:2 chroma deblocking across a vertical edge: four
 * segments of two rows each, one tc0 entry per segment.
 *
 * A segment whose tc0 entry is <= 0 is skipped. A skipped segment must
 * advance src by the same two rows a filtered segment consumes; otherwise
 * every following segment would be applied to the wrong rows.
 */
static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
                                                  ptrdiff_t stride,
                                                  int32_t alpha_in,
                                                  int32_t beta_in,
                                                  int8_t *tc0)
{
    int32_t col, tc_val;
    int16_t out0, out1;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = tc0[col];

        if (tc_val <= 0) {
            src += 2 * stride;
            continue;
        }

        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);

        out0 = __msa_copy_s_h((v8i16) res, 0);
        out1 = __msa_copy_s_h((v8i16) res, 1);

        SH(out0, (src - 1));
        src += stride;
        SH(out1, (src - 1));
        src += stride;
    }
}

/* Luma inter deblocking entry point; forwards to the "edge_ver" kernel. */
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta, int8_t *tc)
{
    avc_loopfilter_luma_inter_edge_ver_msa(data, img_width, alpha, beta, tc);
}

/*
 * Luma inter deblocking entry point; forwards to the "edge_hor" kernel.
 *
 * The kernel takes a boundary strength per tc entry: it is 0 for a negative
 * tc entry and 1 otherwise.
 */
void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta, int8_t *tc)
{
    const uint8_t bs0 = tc[0] >= 0;
    const uint8_t bs1 = tc[1] >= 0;
    const uint8_t bs2 = tc[2] >= 0;
    const uint8_t bs3 = tc[3] >= 0;

    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}

/*
 * Chroma inter deblocking entry point; forwards to the "edge_ver" kernel.
 *
 * The boundary strength handed to the kernel is 0 for a negative tc entry
 * and 1 otherwise, so lanes with a negative tc are never filtered even
 * though the tc value itself is passed on converted to uint8_t.
 */
void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta, int8_t *tc)
{
    const uint8_t bs0 = tc[0] >= 0;
    const uint8_t bs1 = tc[1] >= 0;
    const uint8_t bs2 = tc[2] >= 0;
    const uint8_t bs3 = tc[3] >= 0;

    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

/*
 * Chroma inter deblocking entry point; forwards to the "edge_hor" kernel.
 *
 * The boundary strength handed to the kernel is 0 for a negative tc entry
 * and 1 otherwise, so lanes with a negative tc are never filtered even
 * though the tc value itself is passed on converted to uint8_t.
 */
void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta, int8_t *tc)
{
    const uint8_t bs0 = tc[0] >= 0;
    const uint8_t bs1 = tc[1] >= 0;
    const uint8_t bs2 = tc[2] >= 0;
    const uint8_t bs3 = tc[3] >= 0;

    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

/* Luma intra deblocking entry point; alpha/beta are narrowed to uint8_t. */
void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           img_width);
}

/* Luma intra deblocking entry point; alpha/beta are narrowed to uint8_t. */
void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           img_width);
}

/* Chroma intra deblocking entry point; alpha/beta are narrowed to uint8_t. */
void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               img_width);
}

/* Chroma intra deblocking entry point; alpha/beta are narrowed to uint8_t. */
void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               img_width);
}

/* Public entry point for avc_h_loop_filter_chroma422_msa(). */
void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
                                         ptrdiff_t ystride,
                                         int32_t alpha, int32_t beta,
                                         int8_t *tc0)
{
    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
}

/* Public entry point for avc_h_loop_filter_chroma422_mbaff_msa(). */
void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
                                               ptrdiff_t ystride,
                                               int32_t alpha,
                                               int32_t beta,
                                               int8_t *tc0)
{
    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
}

/* Public entry point for avc_h_loop_filter_luma_mbaff_msa(). */
void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
                                          ptrdiff_t ystride,
                                          int32_t alpha,
                                          int32_t beta,
                                          int8_t *tc0)
{
    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
}

/* Public entry point for avc_h_loop_filter_luma_mbaff_intra_msa(). */
void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
                                                ptrdiff_t ystride,
                                                int32_t alpha,
                                                int32_t beta)
{
    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
}

/*
 * In-place weighted prediction of a 16-pixel-wide block.
 *
 * Every pixel is multiplied by weight_src, offset_in << log2_denom is added
 * with signed saturation, the sum is clamped at 0 from below, shifted right
 * by log2_denom with SRLR (the MSA rounding shift) and saturated to 8 bits.
 *
 * The block is processed in passes of 8 rows: two passes when height is 16,
 * a single pass for any other height.
 */
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset_in)
{
    const int num_passes = (16 == height) ? 2 : 1;
    int pass;
    uint32_t offset_val;
    v16i8 zero = { 0 };
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
    v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    v8i16 wgt, denom, offset;

    /* Shift as unsigned so a negative offset does not invoke UB. */
    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(weight_src);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    for (pass = 0; pass < num_passes; pass++) {
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);

        /* Zero-extend the 16 bytes of every row to two halfword vectors. */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
                   src1_r, src2_r, src3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
                   src1_l, src2_l, src3_l);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
                   src5_r, src6_r, src7_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
                   src5_l, src6_l, src7_l);

        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
             tmp2, tmp3);
        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
             tmp6, tmp7);
        MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
             tmp10, tmp11);
        MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
             tmp14, tmp15);

        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
                    tmp4, tmp5, tmp6, tmp7);
        ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
                    tmp8, tmp9, tmp10, tmp11);
        ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
                    tmp12, tmp13, tmp14, tmp15);

        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
        MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
        SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
        SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);

        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                    dst2, dst3);
        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                    dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);

        src += 8 * stride;
    }
}

/*
 * In-place weighted prediction of an 8-pixel-wide block: dispatches on
 * height to the 8x4 or 8x8 kernel; every other height uses the 8x16 kernel.
 */
void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (4 == height) {
        avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
    } else if (8 == height) {
        avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
    }
}

/*
 * In-place weighted prediction of a 4-pixel-wide block: dispatches on
 * height to the 4x2 or 4x4 kernel; every other height uses the 4x8 kernel.
 */
void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (2 == height) {
        avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
    } else if (4 == height) {
        avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
    }
}

2336cabdff1aSopenharmony_civoid ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, 2337cabdff1aSopenharmony_ci ptrdiff_t stride, int height,
2338cabdff1aSopenharmony_ci int log2_denom, int weight_dst, 2339cabdff1aSopenharmony_ci int weight_src, int offset_in) 2340cabdff1aSopenharmony_ci{ 2341cabdff1aSopenharmony_ci v16i8 src_wgt, dst_wgt, wgt; 2342cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2343cabdff1aSopenharmony_ci v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2344cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2345cabdff1aSopenharmony_ci v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 2346cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2347cabdff1aSopenharmony_ci v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; 2348cabdff1aSopenharmony_ci v8i16 denom, offset; 2349cabdff1aSopenharmony_ci 2350cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 2351cabdff1aSopenharmony_ci offset_in += (128 * (weight_src + weight_dst)); 2352cabdff1aSopenharmony_ci 2353cabdff1aSopenharmony_ci src_wgt = __msa_fill_b(weight_src); 2354cabdff1aSopenharmony_ci dst_wgt = __msa_fill_b(weight_dst); 2355cabdff1aSopenharmony_ci offset = __msa_fill_h(offset_in); 2356cabdff1aSopenharmony_ci denom = __msa_fill_h(log2_denom + 1); 2357cabdff1aSopenharmony_ci 2358cabdff1aSopenharmony_ci wgt = __msa_ilvev_b(dst_wgt, src_wgt); 2359cabdff1aSopenharmony_ci 2360cabdff1aSopenharmony_ci LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 2361cabdff1aSopenharmony_ci src += 8 * stride; 2362cabdff1aSopenharmony_ci LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 2363cabdff1aSopenharmony_ci XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7); 2364cabdff1aSopenharmony_ci XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 2365cabdff1aSopenharmony_ci ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4, 2366cabdff1aSopenharmony_ci vec6); 2367cabdff1aSopenharmony_ci ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, 
dst3, src3, vec1, vec3, vec5, 2368cabdff1aSopenharmony_ci vec7); 2369cabdff1aSopenharmony_ci ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10, 2370cabdff1aSopenharmony_ci vec12, vec14); 2371cabdff1aSopenharmony_ci ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11, 2372cabdff1aSopenharmony_ci vec13, vec15); 2373cabdff1aSopenharmony_ci tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 2374cabdff1aSopenharmony_ci tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 2375cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 2376cabdff1aSopenharmony_ci tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 2377cabdff1aSopenharmony_ci tmp4 = __msa_dpadd_s_h(offset, wgt, vec4); 2378cabdff1aSopenharmony_ci tmp5 = __msa_dpadd_s_h(offset, wgt, vec5); 2379cabdff1aSopenharmony_ci tmp6 = __msa_dpadd_s_h(offset, wgt, vec6); 2380cabdff1aSopenharmony_ci tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); 2381cabdff1aSopenharmony_ci tmp8 = __msa_dpadd_s_h(offset, wgt, vec8); 2382cabdff1aSopenharmony_ci tmp9 = __msa_dpadd_s_h(offset, wgt, vec9); 2383cabdff1aSopenharmony_ci tmp10 = __msa_dpadd_s_h(offset, wgt, vec10); 2384cabdff1aSopenharmony_ci tmp11 = __msa_dpadd_s_h(offset, wgt, vec11); 2385cabdff1aSopenharmony_ci tmp12 = __msa_dpadd_s_h(offset, wgt, vec12); 2386cabdff1aSopenharmony_ci tmp13 = __msa_dpadd_s_h(offset, wgt, vec13); 2387cabdff1aSopenharmony_ci tmp14 = __msa_dpadd_s_h(offset, wgt, vec14); 2388cabdff1aSopenharmony_ci tmp15 = __msa_dpadd_s_h(offset, wgt, vec15); 2389cabdff1aSopenharmony_ci SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 2390cabdff1aSopenharmony_ci SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); 2391cabdff1aSopenharmony_ci SRA_4V(tmp8, tmp9, tmp10, tmp11, denom); 2392cabdff1aSopenharmony_ci SRA_4V(tmp12, tmp13, tmp14, tmp15, denom); 2393cabdff1aSopenharmony_ci CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2394cabdff1aSopenharmony_ci CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); 2395cabdff1aSopenharmony_ci 
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, 2396cabdff1aSopenharmony_ci dst2, dst3); 2397cabdff1aSopenharmony_ci PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, 2398cabdff1aSopenharmony_ci dst5, dst6, dst7); 2399cabdff1aSopenharmony_ci ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride); 2400cabdff1aSopenharmony_ci dst += 8 * stride; 2401cabdff1aSopenharmony_ci 2402cabdff1aSopenharmony_ci if (16 == height) { 2403cabdff1aSopenharmony_ci LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 2404cabdff1aSopenharmony_ci LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 2405cabdff1aSopenharmony_ci XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7); 2406cabdff1aSopenharmony_ci XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); 2407cabdff1aSopenharmony_ci ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, 2408cabdff1aSopenharmony_ci vec4, vec6); 2409cabdff1aSopenharmony_ci ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, 2410cabdff1aSopenharmony_ci vec5, vec7); 2411cabdff1aSopenharmony_ci ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10, 2412cabdff1aSopenharmony_ci vec12, vec14); 2413cabdff1aSopenharmony_ci ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11, 2414cabdff1aSopenharmony_ci vec13, vec15); 2415cabdff1aSopenharmony_ci tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 2416cabdff1aSopenharmony_ci tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 2417cabdff1aSopenharmony_ci tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 2418cabdff1aSopenharmony_ci tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 2419cabdff1aSopenharmony_ci tmp4 = __msa_dpadd_s_h(offset, wgt, vec4); 2420cabdff1aSopenharmony_ci tmp5 = __msa_dpadd_s_h(offset, wgt, vec5); 2421cabdff1aSopenharmony_ci tmp6 = __msa_dpadd_s_h(offset, wgt, vec6); 2422cabdff1aSopenharmony_ci tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); 
2423cabdff1aSopenharmony_ci tmp8 = __msa_dpadd_s_h(offset, wgt, vec8); 2424cabdff1aSopenharmony_ci tmp9 = __msa_dpadd_s_h(offset, wgt, vec9); 2425cabdff1aSopenharmony_ci tmp10 = __msa_dpadd_s_h(offset, wgt, vec10); 2426cabdff1aSopenharmony_ci tmp11 = __msa_dpadd_s_h(offset, wgt, vec11); 2427cabdff1aSopenharmony_ci tmp12 = __msa_dpadd_s_h(offset, wgt, vec12); 2428cabdff1aSopenharmony_ci tmp13 = __msa_dpadd_s_h(offset, wgt, vec13); 2429cabdff1aSopenharmony_ci tmp14 = __msa_dpadd_s_h(offset, wgt, vec14); 2430cabdff1aSopenharmony_ci tmp15 = __msa_dpadd_s_h(offset, wgt, vec15); 2431cabdff1aSopenharmony_ci SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 2432cabdff1aSopenharmony_ci SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); 2433cabdff1aSopenharmony_ci SRA_4V(tmp8, tmp9, tmp10, tmp11, denom); 2434cabdff1aSopenharmony_ci SRA_4V(tmp12, tmp13, tmp14, tmp15, denom); 2435cabdff1aSopenharmony_ci CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2436cabdff1aSopenharmony_ci CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); 2437cabdff1aSopenharmony_ci PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, 2438cabdff1aSopenharmony_ci dst2, dst3); 2439cabdff1aSopenharmony_ci PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, 2440cabdff1aSopenharmony_ci dst5, dst6, dst7); 2441cabdff1aSopenharmony_ci ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride); 2442cabdff1aSopenharmony_ci } 2443cabdff1aSopenharmony_ci} 2444cabdff1aSopenharmony_ci 2445cabdff1aSopenharmony_civoid ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src, 2446cabdff1aSopenharmony_ci ptrdiff_t stride, int height, 2447cabdff1aSopenharmony_ci int log2_denom, int weight_dst, 2448cabdff1aSopenharmony_ci int weight_src, int offset) 2449cabdff1aSopenharmony_ci{ 2450cabdff1aSopenharmony_ci if (4 == height) { 2451cabdff1aSopenharmony_ci avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst, 2452cabdff1aSopenharmony_ci offset); 
2453cabdff1aSopenharmony_ci } else if (8 == height) { 2454cabdff1aSopenharmony_ci avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst, 2455cabdff1aSopenharmony_ci offset); 2456cabdff1aSopenharmony_ci } else { 2457cabdff1aSopenharmony_ci avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst, 2458cabdff1aSopenharmony_ci offset); 2459cabdff1aSopenharmony_ci } 2460cabdff1aSopenharmony_ci} 2461cabdff1aSopenharmony_ci 2462cabdff1aSopenharmony_civoid ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src, 2463cabdff1aSopenharmony_ci ptrdiff_t stride, int height, 2464cabdff1aSopenharmony_ci int log2_denom, int weight_dst, 2465cabdff1aSopenharmony_ci int weight_src, int offset) 2466cabdff1aSopenharmony_ci{ 2467cabdff1aSopenharmony_ci if (2 == height) { 2468cabdff1aSopenharmony_ci avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst, 2469cabdff1aSopenharmony_ci offset); 2470cabdff1aSopenharmony_ci } else if (4 == height) { 2471cabdff1aSopenharmony_ci avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst, 2472cabdff1aSopenharmony_ci offset); 2473cabdff1aSopenharmony_ci } else { 2474cabdff1aSopenharmony_ci avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst, 2475cabdff1aSopenharmony_ci offset); 2476cabdff1aSopenharmony_ci } 2477cabdff1aSopenharmony_ci} 2478