1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 -2017 Parag Salasakar (Parag.Salasakar@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "h264dsp_mips.h" 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_cistatic const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = { 25cabdff1aSopenharmony_ci /* 8 width cases */ 26cabdff1aSopenharmony_ci 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 27cabdff1aSopenharmony_ci 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 28cabdff1aSopenharmony_ci 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 29cabdff1aSopenharmony_ci 30cabdff1aSopenharmony_ci /* 4 width cases */ 31cabdff1aSopenharmony_ci 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 
23, 19, 24, 32cabdff1aSopenharmony_ci 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23, 33cabdff1aSopenharmony_ci 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22, 34cabdff1aSopenharmony_ci}; 35cabdff1aSopenharmony_ci 36cabdff1aSopenharmony_ci#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \ 37cabdff1aSopenharmony_ci out1, out2) \ 38cabdff1aSopenharmony_ci{ \ 39cabdff1aSopenharmony_ci v16i8 tmp0_m, tmp1_m; \ 40cabdff1aSopenharmony_ci v16i8 minus5b_m = __msa_ldi_b(-5); \ 41cabdff1aSopenharmony_ci v16i8 plus20b_m = __msa_ldi_b(20); \ 42cabdff1aSopenharmony_ci \ 43cabdff1aSopenharmony_ci ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \ 44cabdff1aSopenharmony_ci HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \ 45cabdff1aSopenharmony_ci ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \ 46cabdff1aSopenharmony_ci DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \ 47cabdff1aSopenharmony_ci ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \ 48cabdff1aSopenharmony_ci DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \ 49cabdff1aSopenharmony_ci} 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ci#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \ 52cabdff1aSopenharmony_ci( { \ 53cabdff1aSopenharmony_ci v8i16 out0_m; \ 54cabdff1aSopenharmony_ci v16i8 tmp0_m; \ 55cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); \ 56cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); \ 57cabdff1aSopenharmony_ci \ 58cabdff1aSopenharmony_ci tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \ 59cabdff1aSopenharmony_ci out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \ 60cabdff1aSopenharmony_ci \ 61cabdff1aSopenharmony_ci tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \ 62cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \ 63cabdff1aSopenharmony_ci \ 64cabdff1aSopenharmony_ci tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \ 65cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \ 
66cabdff1aSopenharmony_ci \ 67cabdff1aSopenharmony_ci out0_m; \ 68cabdff1aSopenharmony_ci} ) 69cabdff1aSopenharmony_ci 70cabdff1aSopenharmony_ci#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ 71cabdff1aSopenharmony_ci( { \ 72cabdff1aSopenharmony_ci v8i16 out0_m; \ 73cabdff1aSopenharmony_ci \ 74cabdff1aSopenharmony_ci out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \ 75cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \ 76cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \ 77cabdff1aSopenharmony_ci \ 78cabdff1aSopenharmony_ci out0_m; \ 79cabdff1aSopenharmony_ci} ) 80cabdff1aSopenharmony_ci 81cabdff1aSopenharmony_ci#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \ 82cabdff1aSopenharmony_ci( { \ 83cabdff1aSopenharmony_ci v4i32 out0_m; \ 84cabdff1aSopenharmony_ci \ 85cabdff1aSopenharmony_ci out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \ 86cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \ 87cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \ 88cabdff1aSopenharmony_ci out0_m = __msa_srari_w(out0_m, 10); \ 89cabdff1aSopenharmony_ci out0_m = __msa_sat_s_w(out0_m, 7); \ 90cabdff1aSopenharmony_ci out0_m; \ 91cabdff1aSopenharmony_ci} ) 92cabdff1aSopenharmony_ci 93cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y, 94cabdff1aSopenharmony_ci uint8_t *dst, int32_t stride) 95cabdff1aSopenharmony_ci{ 96cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 97cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 98cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 99cabdff1aSopenharmony_ci v16u8 out; 100cabdff1aSopenharmony_ci v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8; 101cabdff1aSopenharmony_ci v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 102cabdff1aSopenharmony_ci v16i8 
src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r; 103cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, filt0, filt1, filt2; 104cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1; 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 107cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 108cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 109cabdff1aSopenharmony_ci 110cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 111cabdff1aSopenharmony_ci 112cabdff1aSopenharmony_ci LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 113cabdff1aSopenharmony_ci src_y += (5 * stride); 114cabdff1aSopenharmony_ci 115cabdff1aSopenharmony_ci src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); 116cabdff1aSopenharmony_ci src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); 117cabdff1aSopenharmony_ci src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3); 118cabdff1aSopenharmony_ci src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4); 119cabdff1aSopenharmony_ci 120cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); 121cabdff1aSopenharmony_ci 122cabdff1aSopenharmony_ci LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 123cabdff1aSopenharmony_ci XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 124cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2); 125cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2); 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci SRARI_H2_SH(hz_out0, hz_out1, 5); 128cabdff1aSopenharmony_ci SAT_SH2_SH(hz_out0, hz_out1, 7); 129cabdff1aSopenharmony_ci 130cabdff1aSopenharmony_ci LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) 
src_vt5); 133cabdff1aSopenharmony_ci src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); 134cabdff1aSopenharmony_ci src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); 135cabdff1aSopenharmony_ci src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); 136cabdff1aSopenharmony_ci 137cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); 138cabdff1aSopenharmony_ci ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r); 139cabdff1aSopenharmony_ci ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r); 140cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 141cabdff1aSopenharmony_ci filt2); 142cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 143cabdff1aSopenharmony_ci filt2); 144cabdff1aSopenharmony_ci SRARI_H2_SH(vt_out0, vt_out1, 5); 145cabdff1aSopenharmony_ci SAT_SH2_SH(vt_out0, vt_out1, 7); 146cabdff1aSopenharmony_ci 147cabdff1aSopenharmony_ci out0 = __msa_srari_h((hz_out0 + vt_out0), 1); 148cabdff1aSopenharmony_ci out1 = __msa_srari_h((hz_out1 + vt_out1), 1); 149cabdff1aSopenharmony_ci 150cabdff1aSopenharmony_ci SAT_SH2_SH(out0, out1, 7); 151cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 152cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 153cabdff1aSopenharmony_ci} 154cabdff1aSopenharmony_ci 155cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y, 156cabdff1aSopenharmony_ci uint8_t *dst, int32_t stride) 157cabdff1aSopenharmony_ci{ 158cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 159cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 160cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 161cabdff1aSopenharmony_ci v16u8 out0, out1; 162cabdff1aSopenharmony_ci v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 163cabdff1aSopenharmony_ci v16i8 src_vt0, 
src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 164cabdff1aSopenharmony_ci v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12; 165cabdff1aSopenharmony_ci v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r; 166cabdff1aSopenharmony_ci v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r; 167cabdff1aSopenharmony_ci v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2; 168cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 169cabdff1aSopenharmony_ci v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3; 170cabdff1aSopenharmony_ci 171cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 172cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 173cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 174cabdff1aSopenharmony_ci 175cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 176cabdff1aSopenharmony_ci LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 177cabdff1aSopenharmony_ci src_y += (5 * stride); 178cabdff1aSopenharmony_ci 179cabdff1aSopenharmony_ci XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 180cabdff1aSopenharmony_ci 181cabdff1aSopenharmony_ci LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 182cabdff1aSopenharmony_ci XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 183cabdff1aSopenharmony_ci src_x += (4 * stride); 184cabdff1aSopenharmony_ci 185cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 186cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 187cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 188cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 189cabdff1aSopenharmony_ci 190cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 191cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out0, hz_out1, 
hz_out2, hz_out3, 7); 192cabdff1aSopenharmony_ci 193cabdff1aSopenharmony_ci LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 194cabdff1aSopenharmony_ci src_y += (4 * stride); 195cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8); 196cabdff1aSopenharmony_ci 197cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4, 198cabdff1aSopenharmony_ci src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r); 199cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8, 200cabdff1aSopenharmony_ci src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r); 201cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 202cabdff1aSopenharmony_ci filt2); 203cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1, 204cabdff1aSopenharmony_ci filt2); 205cabdff1aSopenharmony_ci vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 206cabdff1aSopenharmony_ci filt2); 207cabdff1aSopenharmony_ci vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1, 208cabdff1aSopenharmony_ci filt2); 209cabdff1aSopenharmony_ci SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 210cabdff1aSopenharmony_ci SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 211cabdff1aSopenharmony_ci 212cabdff1aSopenharmony_ci tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 213cabdff1aSopenharmony_ci tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 214cabdff1aSopenharmony_ci tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 215cabdff1aSopenharmony_ci tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 216cabdff1aSopenharmony_ci 217cabdff1aSopenharmony_ci LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 218cabdff1aSopenharmony_ci XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 219cabdff1aSopenharmony_ci 220cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 221cabdff1aSopenharmony_ci 
out0 = PCKEV_XORI128_UB(tmp0, tmp1); 222cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 223cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 224cabdff1aSopenharmony_ci dst += (4 * stride); 225cabdff1aSopenharmony_ci 226cabdff1aSopenharmony_ci LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12); 227cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12); 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 230cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 231cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 232cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 235cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 236cabdff1aSopenharmony_ci 237cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10, 238cabdff1aSopenharmony_ci src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r, 239cabdff1aSopenharmony_ci src_vt1211_r); 240cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1, 241cabdff1aSopenharmony_ci filt2); 242cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1, 243cabdff1aSopenharmony_ci filt2); 244cabdff1aSopenharmony_ci vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1, 245cabdff1aSopenharmony_ci filt2); 246cabdff1aSopenharmony_ci vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0, 247cabdff1aSopenharmony_ci filt1, filt2); 248cabdff1aSopenharmony_ci SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 249cabdff1aSopenharmony_ci SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 
250cabdff1aSopenharmony_ci 251cabdff1aSopenharmony_ci tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 252cabdff1aSopenharmony_ci tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 253cabdff1aSopenharmony_ci tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 254cabdff1aSopenharmony_ci tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 255cabdff1aSopenharmony_ci 256cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 257cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 258cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 259cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 260cabdff1aSopenharmony_ci} 261cabdff1aSopenharmony_ci 262cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x, 263cabdff1aSopenharmony_ci const uint8_t *src_y, uint8_t *dst, 264cabdff1aSopenharmony_ci int32_t stride) 265cabdff1aSopenharmony_ci{ 266cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 267cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 268cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 269cabdff1aSopenharmony_ci const uint8_t *src_x_tmp = src_x; 270cabdff1aSopenharmony_ci const uint8_t *src_y_tmp = src_y; 271cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst; 272cabdff1aSopenharmony_ci uint32_t multiple8_cnt, loop_cnt; 273cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 274cabdff1aSopenharmony_ci v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2; 275cabdff1aSopenharmony_ci v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 276cabdff1aSopenharmony_ci v16i8 src_vt7, src_vt8; 277cabdff1aSopenharmony_ci v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r; 278cabdff1aSopenharmony_ci v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2; 279cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 280cabdff1aSopenharmony_ci v8i16 vt_out3, out0, out1, out2, out3; 281cabdff1aSopenharmony_ci 282cabdff1aSopenharmony_ci filt0 = 
(v16i8) __msa_fill_h(filt_const0); 283cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 284cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 285cabdff1aSopenharmony_ci 286cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 287cabdff1aSopenharmony_ci 288cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 289cabdff1aSopenharmony_ci src_x = src_x_tmp; 290cabdff1aSopenharmony_ci src_y = src_y_tmp; 291cabdff1aSopenharmony_ci dst = dst_tmp; 292cabdff1aSopenharmony_ci 293cabdff1aSopenharmony_ci LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 294cabdff1aSopenharmony_ci src_y += (5 * stride); 295cabdff1aSopenharmony_ci 296cabdff1aSopenharmony_ci XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 297cabdff1aSopenharmony_ci 298cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 299cabdff1aSopenharmony_ci LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 300cabdff1aSopenharmony_ci XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 301cabdff1aSopenharmony_ci src_x += (4 * stride); 302cabdff1aSopenharmony_ci 303cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 304cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 305cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 306cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 307cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 308cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 309cabdff1aSopenharmony_ci 310cabdff1aSopenharmony_ci LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 311cabdff1aSopenharmony_ci src_y += (4 * stride); 312cabdff1aSopenharmony_ci 313cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8); 314cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt1, src_vt0, 
src_vt2, src_vt1, src_vt3, src_vt2, 315cabdff1aSopenharmony_ci src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, 316cabdff1aSopenharmony_ci src_vt43_r); 317cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, 318cabdff1aSopenharmony_ci src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, 319cabdff1aSopenharmony_ci src_vt87_r); 320cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, 321cabdff1aSopenharmony_ci filt1, filt2); 322cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, 323cabdff1aSopenharmony_ci filt1, filt2); 324cabdff1aSopenharmony_ci vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, 325cabdff1aSopenharmony_ci filt1, filt2); 326cabdff1aSopenharmony_ci vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, 327cabdff1aSopenharmony_ci filt1, filt2); 328cabdff1aSopenharmony_ci SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 329cabdff1aSopenharmony_ci SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 330cabdff1aSopenharmony_ci 331cabdff1aSopenharmony_ci out0 = __msa_srari_h((hz_out0 + vt_out0), 1); 332cabdff1aSopenharmony_ci out1 = __msa_srari_h((hz_out1 + vt_out1), 1); 333cabdff1aSopenharmony_ci out2 = __msa_srari_h((hz_out2 + vt_out2), 1); 334cabdff1aSopenharmony_ci out3 = __msa_srari_h((hz_out3 + vt_out3), 1); 335cabdff1aSopenharmony_ci 336cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 337cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 338cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 339cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride); 340cabdff1aSopenharmony_ci dst += (4 * stride); 341cabdff1aSopenharmony_ci 342cabdff1aSopenharmony_ci src_vt0 = src_vt4; 343cabdff1aSopenharmony_ci src_vt1 = src_vt5; 344cabdff1aSopenharmony_ci src_vt2 = src_vt6; 345cabdff1aSopenharmony_ci src_vt3 = src_vt7; 346cabdff1aSopenharmony_ci src_vt4 = src_vt8; 
347cabdff1aSopenharmony_ci } 348cabdff1aSopenharmony_ci 349cabdff1aSopenharmony_ci src_x_tmp += 8; 350cabdff1aSopenharmony_ci src_y_tmp += 8; 351cabdff1aSopenharmony_ci dst_tmp += 8; 352cabdff1aSopenharmony_ci } 353cabdff1aSopenharmony_ci} 354cabdff1aSopenharmony_ci 355cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x, 356cabdff1aSopenharmony_ci const uint8_t *src_y, 357cabdff1aSopenharmony_ci uint8_t *dst, 358cabdff1aSopenharmony_ci int32_t stride) 359cabdff1aSopenharmony_ci{ 360cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 361cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 362cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 363cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 364cabdff1aSopenharmony_ci v16u8 res, dst0 = { 0 }; 365cabdff1aSopenharmony_ci v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8; 366cabdff1aSopenharmony_ci v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6; 367cabdff1aSopenharmony_ci v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r; 368cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, filt0, filt1, filt2; 369cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1; 370cabdff1aSopenharmony_ci 371cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 372cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 373cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 374cabdff1aSopenharmony_ci 375cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 376cabdff1aSopenharmony_ci 377cabdff1aSopenharmony_ci LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 378cabdff1aSopenharmony_ci src_y += (5 * stride); 379cabdff1aSopenharmony_ci 380cabdff1aSopenharmony_ci src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1); 381cabdff1aSopenharmony_ci src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2); 
382cabdff1aSopenharmony_ci src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3); 383cabdff1aSopenharmony_ci src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4); 384cabdff1aSopenharmony_ci 385cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3); 386cabdff1aSopenharmony_ci 387cabdff1aSopenharmony_ci LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 388cabdff1aSopenharmony_ci XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 389cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2); 390cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2); 391cabdff1aSopenharmony_ci 392cabdff1aSopenharmony_ci SRARI_H2_SH(hz_out0, hz_out1, 5); 393cabdff1aSopenharmony_ci SAT_SH2_SH(hz_out0, hz_out1, 7); 394cabdff1aSopenharmony_ci 395cabdff1aSopenharmony_ci LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 396cabdff1aSopenharmony_ci 397cabdff1aSopenharmony_ci src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5); 398cabdff1aSopenharmony_ci src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6); 399cabdff1aSopenharmony_ci src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7); 400cabdff1aSopenharmony_ci src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8); 401cabdff1aSopenharmony_ci 402cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7); 403cabdff1aSopenharmony_ci ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r); 404cabdff1aSopenharmony_ci ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r); 405cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 406cabdff1aSopenharmony_ci filt2); 407cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 408cabdff1aSopenharmony_ci filt2); 409cabdff1aSopenharmony_ci SRARI_H2_SH(vt_out0, vt_out1, 5); 
410cabdff1aSopenharmony_ci SAT_SH2_SH(vt_out0, vt_out1, 7); 411cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 412cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 413cabdff1aSopenharmony_ci 414cabdff1aSopenharmony_ci res1 = __msa_srari_h((hz_out1 + vt_out1), 1); 415cabdff1aSopenharmony_ci res0 = __msa_srari_h((hz_out0 + vt_out0), 1); 416cabdff1aSopenharmony_ci 417cabdff1aSopenharmony_ci SAT_SH2_SH(res0, res1, 7); 418cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(res0, res1); 419cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(res, dst0); 420cabdff1aSopenharmony_ci 421cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, stride); 422cabdff1aSopenharmony_ci} 423cabdff1aSopenharmony_ci 424cabdff1aSopenharmony_cistatic void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x, 425cabdff1aSopenharmony_ci const uint8_t *src_y, 426cabdff1aSopenharmony_ci uint8_t *dst, 427cabdff1aSopenharmony_ci int32_t stride) 428cabdff1aSopenharmony_ci{ 429cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 430cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 431cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 432cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 433cabdff1aSopenharmony_ci v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 434cabdff1aSopenharmony_ci v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2; 435cabdff1aSopenharmony_ci v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8; 436cabdff1aSopenharmony_ci v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2; 437cabdff1aSopenharmony_ci v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r; 438cabdff1aSopenharmony_ci v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r; 439cabdff1aSopenharmony_ci v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2; 440cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2; 441cabdff1aSopenharmony_ci v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3; 
442cabdff1aSopenharmony_ci 443cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 444cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 445cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 446cabdff1aSopenharmony_ci 447cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 448cabdff1aSopenharmony_ci LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 449cabdff1aSopenharmony_ci src_y += (5 * stride); 450cabdff1aSopenharmony_ci 451cabdff1aSopenharmony_ci XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4); 452cabdff1aSopenharmony_ci 453cabdff1aSopenharmony_ci LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 454cabdff1aSopenharmony_ci XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 455cabdff1aSopenharmony_ci src_x += (4 * stride); 456cabdff1aSopenharmony_ci 457cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 458cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 459cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 460cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 461cabdff1aSopenharmony_ci 462cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 463cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 464cabdff1aSopenharmony_ci 465cabdff1aSopenharmony_ci LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8); 466cabdff1aSopenharmony_ci src_y += (4 * stride); 467cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8); 468cabdff1aSopenharmony_ci 469cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4, 470cabdff1aSopenharmony_ci src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r); 471cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8, 
472cabdff1aSopenharmony_ci src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r); 473cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1, 474cabdff1aSopenharmony_ci filt2); 475cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1, 476cabdff1aSopenharmony_ci filt2); 477cabdff1aSopenharmony_ci vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1, 478cabdff1aSopenharmony_ci filt2); 479cabdff1aSopenharmony_ci vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1, 480cabdff1aSopenharmony_ci filt2); 481cabdff1aSopenharmony_ci SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 482cabdff1aSopenharmony_ci SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 483cabdff1aSopenharmony_ci 484cabdff1aSopenharmony_ci tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 485cabdff1aSopenharmony_ci tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 486cabdff1aSopenharmony_ci tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 487cabdff1aSopenharmony_ci tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 488cabdff1aSopenharmony_ci 489cabdff1aSopenharmony_ci LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3); 490cabdff1aSopenharmony_ci XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3); 491cabdff1aSopenharmony_ci 492cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 493cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 494cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 495cabdff1aSopenharmony_ci 496cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 497cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 498cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 499cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 500cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 501cabdff1aSopenharmony_ci dst += (4 * stride); 502cabdff1aSopenharmony_ci 503cabdff1aSopenharmony_ci LD_SB4(src_y, stride, 
src_vt9, src_vt10, src_vt11, src_vt12); 504cabdff1aSopenharmony_ci XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12); 505cabdff1aSopenharmony_ci 506cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2); 507cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2); 508cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2); 509cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2); 510cabdff1aSopenharmony_ci 511cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5); 512cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7); 513cabdff1aSopenharmony_ci 514cabdff1aSopenharmony_ci ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10, 515cabdff1aSopenharmony_ci src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r, 516cabdff1aSopenharmony_ci src_vt1211_r); 517cabdff1aSopenharmony_ci vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1, 518cabdff1aSopenharmony_ci filt2); 519cabdff1aSopenharmony_ci vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1, 520cabdff1aSopenharmony_ci filt2); 521cabdff1aSopenharmony_ci vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1, 522cabdff1aSopenharmony_ci filt2); 523cabdff1aSopenharmony_ci vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0, 524cabdff1aSopenharmony_ci filt1, filt2); 525cabdff1aSopenharmony_ci SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5); 526cabdff1aSopenharmony_ci SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7); 527cabdff1aSopenharmony_ci 528cabdff1aSopenharmony_ci tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1); 529cabdff1aSopenharmony_ci tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1); 530cabdff1aSopenharmony_ci tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1); 531cabdff1aSopenharmony_ci tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1); 
532cabdff1aSopenharmony_ci 533cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 534cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 535cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 536cabdff1aSopenharmony_ci 537cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 538cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 539cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 540cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 541cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 542cabdff1aSopenharmony_ci} 543cabdff1aSopenharmony_ci

/*
 * 16x16 luma interpolation combining a horizontally filtered plane (src_x)
 * with a vertically filtered plane (src_y), averaged into dst.
 *
 * The block is handled as two 8-column halves (multiple8_cnt); each half is
 * produced in four passes of four rows (loop_cnt).  For every 8x4 tile:
 *   - four rows from src_x go through AVC_HORZ_FILTER_SH with the 8-width
 *     shuffle masks (luma_mask_arr[0]), i.e. the 1,-5,20,20,-5,1 row filter;
 *   - nine rows from src_y (five carried over + four new) are interleaved
 *     pairwise and passed to AVC_DOT_SH3_SH together with filt0..filt2;
 *   - both results are rounded by 5 bits and saturated, summed, rounded by
 *     1 bit, saturated again, packed to bytes and averaged with the pixels
 *     already present in dst.
 *
 * @param src_x   first row used for the horizontal filter
 * @param src_y   first row used for the vertical filter
 * @param dst     destination; read (for averaging) and written
 * @param stride  distance in bytes between rows, shared by all three buffers
 */
static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
                                                   const uint8_t *src_y,
                                                   uint8_t *dst,
                                                   int32_t stride)
{
    /* Halfword constants packing the tap pairs, low byte first:
     * {1, -5}, {20, 20}, {-5, 1}. */
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    const uint8_t *src_x_tmp = src_x;
    const uint8_t *src_y_tmp = src_y;
    uint8_t *dst_tmp = dst;
    uint32_t multiple8_cnt, loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt7, src_vt8;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, out0, out1, out2, out3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    /* Two passes: columns 0..7, then columns 8..15. */
    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src_x = src_x_tmp;
        src_y = src_y_tmp;
        dst = dst_tmp;

        /* Prime the vertical filter with the first five rows. */
        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
        src_y += (5 * stride);

        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

        for (loop_cnt = 4; loop_cnt--;) {
            /* Horizontal part: four rows. */
            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
            src_x += (4 * stride);

            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

            /* Vertical part: four new rows on top of the five kept ones. */
            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
            src_y += (4 * stride);

            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
                       src_vt43_r);
            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
                       src_vt87_r);
            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
                                     filt1, filt2);
            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
                                     filt1, filt2);
            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
                                     filt1, filt2);
            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
                                     filt1, filt2);
            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

            /* Rounded average of the two filtered planes. */
            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);

            /* Fetch the four existing 8-byte dst rows for the final average. */
            LD4(dst, stride, tp0, tp1, tp2, tp3);
            INSERT_D2_UB(tp0, tp1, dst0);
            INSERT_D2_UB(tp2, tp3, dst1);

            SAT_SH4_SH(out0, out1, out2, out3, 7);
            tmp0 = PCKEV_XORI128_UB(out0, out1);
            tmp1 = PCKEV_XORI128_UB(out2, out3);
            AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
            ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            /* Slide the vertical window down by four rows. */
            src_vt0 = src_vt4;
            src_vt1 = src_vt5;
            src_vt2 = src_vt6;
            src_vt3 = src_vt7;
            src_vt4 = src_vt8;
        }

        src_x_tmp += 8;
        src_y_tmp += 8;
        dst_tmp += 8;
    }
}

/*
 * Full-pel "put", 16x16: copy sixteen 16-byte rows from src to dst.
 * All rows are loaded before any row is stored.
 *
 * @param dst     destination block
 * @param src     source block
 * @param stride  distance in bytes between rows, shared by src and dst
 */
void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
    dst += (8 * stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
}

/*
 * Full-pel "put", 8x8: copy eight 8-byte rows from src to dst, moved as
 * 64-bit scalars.
 *
 * @param dst     destination block
 * @param src     source block
 * @param stride  distance in bytes between rows, shared by src and dst
 */
void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    LD4(src, stride, src0, src1, src2, src3);
    src += 4 * stride;
    LD4(src, stride, src4, src5, src6, src7);
    SD4(src0, src1, src2, src3, dst, stride);
    dst += 4 * stride;
    SD4(src4, src5, src6, src7, dst, stride);
}

/*
 * Full-pel "avg", 16x16: replace every dst row by the unsigned byte average
 * of the src row and the existing dst row.  Processed as two batches of
 * eight rows.
 *
 * @param dst     destination block; read and written
 * @param src     source block
 * @param stride  distance in bytes between rows, shared by src and dst
 */
void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    /* Rows 0..7 */
    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    dst += (8 * stride);

    /* Rows 8..15 */
    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
}

699cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, 700cabdff1aSopenharmony_ci ptrdiff_t stride) 701cabdff1aSopenharmony_ci{ 702cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; 703cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 704cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 705cabdff1aSopenharmony_ci 706cabdff1aSopenharmony_ci LD4(src, stride, tp0, tp1, tp2, tp3); 707cabdff1aSopenharmony_ci src += 4 * stride; 708cabdff1aSopenharmony_ci LD4(src, stride, tp4, tp5, tp6, tp7); 709cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, src0); 710cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, src1); 711cabdff1aSopenharmony_ci INSERT_D2_UB(tp4, tp5, src2); 712cabdff1aSopenharmony_ci INSERT_D2_UB(tp6, tp7, src3); 713cabdff1aSopenharmony_ci 714cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 715cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7); 716cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 717cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 718cabdff1aSopenharmony_ci INSERT_D2_UB(tp4, tp5, dst2); 719cabdff1aSopenharmony_ci INSERT_D2_UB(tp6, tp7, dst3); 720cabdff1aSopenharmony_ci 721cabdff1aSopenharmony_ci AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, 722cabdff1aSopenharmony_ci dst2, dst3); 723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 725cabdff1aSopenharmony_ci} 726cabdff1aSopenharmony_ci 727cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src, 728cabdff1aSopenharmony_ci ptrdiff_t stride) 729cabdff1aSopenharmony_ci{ 730cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 731cabdff1aSopenharmony_ci v16u8 src0 = { 0 }, dst0 = { 0 }; 732cabdff1aSopenharmony_ci 733cabdff1aSopenharmony_ci LW4(src, stride, tp0, tp1, tp2, tp3); 734cabdff1aSopenharmony_ci 
INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 735cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 736cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 737cabdff1aSopenharmony_ci 738cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(src0, dst0); 739cabdff1aSopenharmony_ci 740cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, stride); 741cabdff1aSopenharmony_ci} 742cabdff1aSopenharmony_ci 743cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, 744cabdff1aSopenharmony_ci ptrdiff_t stride) 745cabdff1aSopenharmony_ci{ 746cabdff1aSopenharmony_ci uint32_t loop_cnt; 747cabdff1aSopenharmony_ci v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6; 748cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11; 749cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 750cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 751cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 752cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 753cabdff1aSopenharmony_ci 754cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 755cabdff1aSopenharmony_ci mask3 = mask0 + 8; 756cabdff1aSopenharmony_ci mask4 = mask1 + 8; 757cabdff1aSopenharmony_ci mask5 = mask2 + 8; 758cabdff1aSopenharmony_ci src -= 2; 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 761cabdff1aSopenharmony_ci LD_SB2(src, 16, src0, src1); 762cabdff1aSopenharmony_ci src += stride; 763cabdff1aSopenharmony_ci LD_SB2(src, 16, src2, src3); 764cabdff1aSopenharmony_ci src += stride; 765cabdff1aSopenharmony_ci LD_SB2(src, 16, src4, src5); 766cabdff1aSopenharmony_ci src += stride; 767cabdff1aSopenharmony_ci LD_SB2(src, 16, src6, src7); 768cabdff1aSopenharmony_ci src += stride; 769cabdff1aSopenharmony_ci 770cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 
771cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3); 772cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9); 773cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4); 774cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10); 775cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5); 776cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11); 777cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 778cabdff1aSopenharmony_ci DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 779cabdff1aSopenharmony_ci minus5b, res0, res1, res2, res3); 780cabdff1aSopenharmony_ci DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 781cabdff1aSopenharmony_ci plus20b, res0, res1, res2, res3); 782cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3); 783cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9); 784cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4); 785cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10); 786cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5); 787cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11); 788cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 789cabdff1aSopenharmony_ci DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 790cabdff1aSopenharmony_ci minus5b, res4, res5, res6, res7); 791cabdff1aSopenharmony_ci DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 792cabdff1aSopenharmony_ci plus20b, res4, res5, res6, res7); 793cabdff1aSopenharmony_ci SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2, 794cabdff1aSopenharmony_ci src0, src2, src4, 
src6); 795cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 5); 796cabdff1aSopenharmony_ci SRARI_H4_SH(res4, res5, res6, res7, 5); 797cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 798cabdff1aSopenharmony_ci SAT_SH4_SH(res4, res5, res6, res7, 7); 799cabdff1aSopenharmony_ci PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1); 800cabdff1aSopenharmony_ci PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3); 801cabdff1aSopenharmony_ci dst0 = __msa_aver_s_b(dst0, src0); 802cabdff1aSopenharmony_ci dst1 = __msa_aver_s_b(dst1, src2); 803cabdff1aSopenharmony_ci dst2 = __msa_aver_s_b(dst2, src4); 804cabdff1aSopenharmony_ci dst3 = __msa_aver_s_b(dst3, src6); 805cabdff1aSopenharmony_ci XORI_B4_128_SB(dst0, dst1, dst2, dst3); 806cabdff1aSopenharmony_ci ST_SB4(dst0, dst1, dst2, dst3, dst, stride); 807cabdff1aSopenharmony_ci dst += (4 * stride); 808cabdff1aSopenharmony_ci } 809cabdff1aSopenharmony_ci} 810cabdff1aSopenharmony_ci 811cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, 812cabdff1aSopenharmony_ci ptrdiff_t stride) 813cabdff1aSopenharmony_ci{ 814cabdff1aSopenharmony_ci uint32_t loop_cnt; 815cabdff1aSopenharmony_ci v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6; 816cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11; 817cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 818cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 819cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 820cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 821cabdff1aSopenharmony_ci 822cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 823cabdff1aSopenharmony_ci mask3 = mask0 + 8; 824cabdff1aSopenharmony_ci mask4 = mask1 + 8; 825cabdff1aSopenharmony_ci mask5 = mask2 + 8; 826cabdff1aSopenharmony_ci src -= 2; 827cabdff1aSopenharmony_ci 828cabdff1aSopenharmony_ci for (loop_cnt = 4; 
loop_cnt--;) { 829cabdff1aSopenharmony_ci LD_SB2(src, 16, src0, src1); 830cabdff1aSopenharmony_ci src += stride; 831cabdff1aSopenharmony_ci LD_SB2(src, 16, src2, src3); 832cabdff1aSopenharmony_ci src += stride; 833cabdff1aSopenharmony_ci LD_SB2(src, 16, src4, src5); 834cabdff1aSopenharmony_ci src += stride; 835cabdff1aSopenharmony_ci LD_SB2(src, 16, src6, src7); 836cabdff1aSopenharmony_ci src += stride; 837cabdff1aSopenharmony_ci 838cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 839cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3); 840cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9); 841cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4); 842cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10); 843cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5); 844cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11); 845cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 846cabdff1aSopenharmony_ci DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 847cabdff1aSopenharmony_ci minus5b, res0, res1, res2, res3); 848cabdff1aSopenharmony_ci DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 849cabdff1aSopenharmony_ci plus20b, res0, res1, res2, res3); 850cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3); 851cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9); 852cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4); 853cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10); 854cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5); 855cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11); 
856cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 857cabdff1aSopenharmony_ci DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 858cabdff1aSopenharmony_ci minus5b, res4, res5, res6, res7); 859cabdff1aSopenharmony_ci DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 860cabdff1aSopenharmony_ci plus20b, res4, res5, res6, res7); 861cabdff1aSopenharmony_ci SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3, 862cabdff1aSopenharmony_ci src0, src2, src4, src6); 863cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 5); 864cabdff1aSopenharmony_ci SRARI_H4_SH(res4, res5, res6, res7, 5); 865cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 866cabdff1aSopenharmony_ci SAT_SH4_SH(res4, res5, res6, res7, 7); 867cabdff1aSopenharmony_ci PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1); 868cabdff1aSopenharmony_ci PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3); 869cabdff1aSopenharmony_ci dst0 = __msa_aver_s_b(dst0, src0); 870cabdff1aSopenharmony_ci dst1 = __msa_aver_s_b(dst1, src2); 871cabdff1aSopenharmony_ci dst2 = __msa_aver_s_b(dst2, src4); 872cabdff1aSopenharmony_ci dst3 = __msa_aver_s_b(dst3, src6); 873cabdff1aSopenharmony_ci XORI_B4_128_SB(dst0, dst1, dst2, dst3); 874cabdff1aSopenharmony_ci ST_SB4(dst0, dst1, dst2, dst3, dst, stride); 875cabdff1aSopenharmony_ci dst += (4 * stride); 876cabdff1aSopenharmony_ci } 877cabdff1aSopenharmony_ci} 878cabdff1aSopenharmony_ci 879cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, 880cabdff1aSopenharmony_ci ptrdiff_t stride) 881cabdff1aSopenharmony_ci{ 882cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 883cabdff1aSopenharmony_ci v16i8 tmp0, tmp1, tmp2, tmp3, vec11; 884cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 885cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 
886cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 887cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 888cabdff1aSopenharmony_ci 889cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 890cabdff1aSopenharmony_ci LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 891cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 892cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 893cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 894cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 895cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 896cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 897cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 898cabdff1aSopenharmony_ci res0, res1, res2, res3); 899cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 900cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 901cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 902cabdff1aSopenharmony_ci res0, res1, res2, res3); 903cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 904cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 905cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 906cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 907cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 908cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 909cabdff1aSopenharmony_ci res4, res5, res6, res7); 910cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, 
src5, mask2, mask2, vec8, vec9); 911cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 912cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 913cabdff1aSopenharmony_ci res4, res5, res6, res7); 914cabdff1aSopenharmony_ci SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2, 915cabdff1aSopenharmony_ci src0, src1, src2, src3); 916cabdff1aSopenharmony_ci SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2, 917cabdff1aSopenharmony_ci src4, src5, src6, src7); 918cabdff1aSopenharmony_ci PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); 919cabdff1aSopenharmony_ci PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); 920cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 5); 921cabdff1aSopenharmony_ci SRARI_H4_SH(res4, res5, res6, res7, 5); 922cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 923cabdff1aSopenharmony_ci SAT_SH4_SH(res4, res5, res6, res7, 7); 924cabdff1aSopenharmony_ci PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1); 925cabdff1aSopenharmony_ci PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3); 926cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_b(tmp0, src0); 927cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_b(tmp1, src1); 928cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_b(tmp2, src4); 929cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_b(tmp3, src5); 930cabdff1aSopenharmony_ci XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); 931cabdff1aSopenharmony_ci ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 932cabdff1aSopenharmony_ci} 933cabdff1aSopenharmony_ci 934cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, 935cabdff1aSopenharmony_ci ptrdiff_t stride) 936cabdff1aSopenharmony_ci{ 937cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 938cabdff1aSopenharmony_ci v16i8 tmp0, tmp1, tmp2, tmp3, vec11; 939cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, 
vec6, vec7, vec8, vec9, vec10; 940cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 941cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 942cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 943cabdff1aSopenharmony_ci 944cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 945cabdff1aSopenharmony_ci LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 946cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 947cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 948cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 949cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 950cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 951cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 952cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 953cabdff1aSopenharmony_ci res0, res1, res2, res3); 954cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 955cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 956cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 957cabdff1aSopenharmony_ci res0, res1, res2, res3); 958cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 959cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3); 960cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 961cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 962cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 963cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, 
minus5b, 964cabdff1aSopenharmony_ci res4, res5, res6, res7); 965cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 966cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 967cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, 968cabdff1aSopenharmony_ci res4, res5, res6, res7); 969cabdff1aSopenharmony_ci SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3, 970cabdff1aSopenharmony_ci src0, src1, src2, src3); 971cabdff1aSopenharmony_ci SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3, 972cabdff1aSopenharmony_ci src4, src5, src6, src7); 973cabdff1aSopenharmony_ci PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); 974cabdff1aSopenharmony_ci PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); 975cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 5); 976cabdff1aSopenharmony_ci SRARI_H4_SH(res4, res5, res6, res7, 5); 977cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 978cabdff1aSopenharmony_ci SAT_SH4_SH(res4, res5, res6, res7, 7); 979cabdff1aSopenharmony_ci PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1); 980cabdff1aSopenharmony_ci PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3); 981cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_b(tmp0, src0); 982cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_b(tmp1, src1); 983cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_b(tmp2, src4); 984cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_b(tmp3, src5); 985cabdff1aSopenharmony_ci XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); 986cabdff1aSopenharmony_ci ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 987cabdff1aSopenharmony_ci} 988cabdff1aSopenharmony_ci 989cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, 990cabdff1aSopenharmony_ci ptrdiff_t stride) 991cabdff1aSopenharmony_ci{ 992cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2; 993cabdff1aSopenharmony_ci v16i8 vec0, 
vec1, vec2, vec3, vec4, vec5; 994cabdff1aSopenharmony_ci v8i16 res0, res1; 995cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 996cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 999cabdff1aSopenharmony_ci LD_SB4(src - 2, stride, src0, src1, src2, src3); 1000cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1001cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 1002cabdff1aSopenharmony_ci HADD_SB2_SH(vec0, vec1, res0, res1); 1003cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 1004cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); 1005cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 1006cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); 1007cabdff1aSopenharmony_ci SRARI_H2_SH(res0, res1, 5); 1008cabdff1aSopenharmony_ci SAT_SH2_SH(res0, res1, 7); 1009cabdff1aSopenharmony_ci res = __msa_pckev_b((v16i8) res1, (v16i8) res0); 1010cabdff1aSopenharmony_ci SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2, 1011cabdff1aSopenharmony_ci src0, src1, src2, src3); 1012cabdff1aSopenharmony_ci src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); 1013cabdff1aSopenharmony_ci src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 1014cabdff1aSopenharmony_ci src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); 1015cabdff1aSopenharmony_ci res = __msa_aver_s_b(res, src0); 1016cabdff1aSopenharmony_ci res = (v16i8) __msa_xori_b((v16u8) res, 128); 1017cabdff1aSopenharmony_ci ST_W4(res, 0, 1, 2, 3, dst, stride); 1018cabdff1aSopenharmony_ci} 1019cabdff1aSopenharmony_ci 1020cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, 1021cabdff1aSopenharmony_ci ptrdiff_t stride) 1022cabdff1aSopenharmony_ci{ 1023cabdff1aSopenharmony_ci 
v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2; 1024cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 1025cabdff1aSopenharmony_ci v8i16 res0, res1; 1026cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 1027cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 1028cabdff1aSopenharmony_ci 1029cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 1030cabdff1aSopenharmony_ci LD_SB4(src - 2, stride, src0, src1, src2, src3); 1031cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1032cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 1033cabdff1aSopenharmony_ci HADD_SB2_SH(vec0, vec1, res0, res1); 1034cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 1035cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); 1036cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 1037cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); 1038cabdff1aSopenharmony_ci SRARI_H2_SH(res0, res1, 5); 1039cabdff1aSopenharmony_ci SAT_SH2_SH(res0, res1, 7); 1040cabdff1aSopenharmony_ci res = __msa_pckev_b((v16i8) res1, (v16i8) res0); 1041cabdff1aSopenharmony_ci SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3, 1042cabdff1aSopenharmony_ci src0, src1, src2, src3); 1043cabdff1aSopenharmony_ci src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); 1044cabdff1aSopenharmony_ci src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 1045cabdff1aSopenharmony_ci src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); 1046cabdff1aSopenharmony_ci res = __msa_aver_s_b(res, src0); 1047cabdff1aSopenharmony_ci res = (v16i8) __msa_xori_b((v16u8) res, 128); 1048cabdff1aSopenharmony_ci ST_W4(res, 0, 1, 2, 3, dst, stride); 1049cabdff1aSopenharmony_ci} 1050cabdff1aSopenharmony_ci 1051cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, 
1052cabdff1aSopenharmony_ci ptrdiff_t stride) 1053cabdff1aSopenharmony_ci{ 1054cabdff1aSopenharmony_ci uint32_t loop_cnt; 1055cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 1056cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 1057cabdff1aSopenharmony_ci v16i8 vec11; 1058cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 1059cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 1060cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 1061cabdff1aSopenharmony_ci 1062cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1063cabdff1aSopenharmony_ci src -= 2; 1064cabdff1aSopenharmony_ci 1065cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1066cabdff1aSopenharmony_ci LD_SB2(src, 8, src0, src1); 1067cabdff1aSopenharmony_ci src += stride; 1068cabdff1aSopenharmony_ci LD_SB2(src, 8, src2, src3); 1069cabdff1aSopenharmony_ci src += stride; 1070cabdff1aSopenharmony_ci LD_SB2(src, 8, src4, src5); 1071cabdff1aSopenharmony_ci src += stride; 1072cabdff1aSopenharmony_ci LD_SB2(src, 8, src6, src7); 1073cabdff1aSopenharmony_ci src += stride; 1074cabdff1aSopenharmony_ci 1075cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 1076cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3); 1077cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9); 1078cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4); 1079cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10); 1080cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5); 1081cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11); 1082cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3); 1083cabdff1aSopenharmony_ci 
DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 1084cabdff1aSopenharmony_ci minus5b, res0, res1, res2, res3); 1085cabdff1aSopenharmony_ci DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 1086cabdff1aSopenharmony_ci plus20b, res0, res1, res2, res3); 1087cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3); 1088cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9); 1089cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4); 1090cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10); 1091cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5); 1092cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11); 1093cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7); 1094cabdff1aSopenharmony_ci DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, 1095cabdff1aSopenharmony_ci minus5b, res4, res5, res6, res7); 1096cabdff1aSopenharmony_ci DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, 1097cabdff1aSopenharmony_ci plus20b, res4, res5, res6, res7); 1098cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 5); 1099cabdff1aSopenharmony_ci SRARI_H4_SH(res4, res5, res6, res7, 5); 1100cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 1101cabdff1aSopenharmony_ci SAT_SH4_SH(res4, res5, res6, res7, 7); 1102cabdff1aSopenharmony_ci PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1, 1103cabdff1aSopenharmony_ci vec2, vec3); 1104cabdff1aSopenharmony_ci XORI_B4_128_SB(vec0, vec1, vec2, vec3); 1105cabdff1aSopenharmony_ci ST_SB4(vec0, vec1, vec2, vec3, dst, stride); 1106cabdff1aSopenharmony_ci dst += (4 * stride); 1107cabdff1aSopenharmony_ci } 1108cabdff1aSopenharmony_ci} 1109cabdff1aSopenharmony_ci 1110cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t 
*src, 1111cabdff1aSopenharmony_ci ptrdiff_t stride) 1112cabdff1aSopenharmony_ci{ 1113cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 1114cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2; 1115cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10; 1116cabdff1aSopenharmony_ci v16i8 vec11; 1117cabdff1aSopenharmony_ci v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 1118cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 1119cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 1120cabdff1aSopenharmony_ci 1121cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1122cabdff1aSopenharmony_ci LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7); 1123cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 1124cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 1125cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 1126cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3); 1127cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5); 1128cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7); 1129cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 1130cabdff1aSopenharmony_ci res0, res1, res2, res3); 1131cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9); 1132cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11); 1133cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, 1134cabdff1aSopenharmony_ci plus20b, res0, res1, res2, res3); 1135cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1); 1136cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, 
vec2, vec3); 1137cabdff1aSopenharmony_ci HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7); 1138cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5); 1139cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7); 1140cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b, 1141cabdff1aSopenharmony_ci res4, res5, res6, res7); 1142cabdff1aSopenharmony_ci VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9); 1143cabdff1aSopenharmony_ci VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); 1144cabdff1aSopenharmony_ci DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, 1145cabdff1aSopenharmony_ci plus20b, res4, res5, res6, res7); 1146cabdff1aSopenharmony_ci SRARI_H4_SH(res0, res1, res2, res3, 5); 1147cabdff1aSopenharmony_ci SRARI_H4_SH(res4, res5, res6, res7, 5); 1148cabdff1aSopenharmony_ci SAT_SH4_SH(res0, res1, res2, res3, 7); 1149cabdff1aSopenharmony_ci SAT_SH4_SH(res4, res5, res6, res7, 7); 1150cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(res0, res1); 1151cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(res2, res3); 1152cabdff1aSopenharmony_ci out2 = PCKEV_XORI128_UB(res4, res5); 1153cabdff1aSopenharmony_ci out3 = PCKEV_XORI128_UB(res6, res7); 1154cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1155cabdff1aSopenharmony_ci} 1156cabdff1aSopenharmony_ci 1157cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, 1158cabdff1aSopenharmony_ci ptrdiff_t stride) 1159cabdff1aSopenharmony_ci{ 1160cabdff1aSopenharmony_ci v16u8 out; 1161cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, mask0, mask1, mask2; 1162cabdff1aSopenharmony_ci v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 1163cabdff1aSopenharmony_ci v8i16 res0, res1; 1164cabdff1aSopenharmony_ci v16i8 minus5b = __msa_ldi_b(-5); 1165cabdff1aSopenharmony_ci v16i8 plus20b = __msa_ldi_b(20); 
1166cabdff1aSopenharmony_ci 1167cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 1168cabdff1aSopenharmony_ci LD_SB4(src - 2, stride, src0, src1, src2, src3); 1169cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 1170cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1); 1171cabdff1aSopenharmony_ci HADD_SB2_SH(vec0, vec1, res0, res1); 1172cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3); 1173cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1); 1174cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5); 1175cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1); 1176cabdff1aSopenharmony_ci SRARI_H2_SH(res0, res1, 5); 1177cabdff1aSopenharmony_ci SAT_SH2_SH(res0, res1, 7); 1178cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(res0, res1); 1179cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 1180cabdff1aSopenharmony_ci} 1181cabdff1aSopenharmony_ci 1182cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, 1183cabdff1aSopenharmony_ci ptrdiff_t stride) 1184cabdff1aSopenharmony_ci{ 1185cabdff1aSopenharmony_ci int32_t loop_cnt; 1186cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 1187cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 1188cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 1189cabdff1aSopenharmony_ci v16u8 res0, res1, res2, res3; 1190cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1191cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1192cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 1193cabdff1aSopenharmony_ci v16i8 src65_l, src87_l, filt0, filt1, filt2; 1194cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 1195cabdff1aSopenharmony_ci 
1196cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 1197cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 1198cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 1199cabdff1aSopenharmony_ci 1200cabdff1aSopenharmony_ci src -= (stride * 2); 1201cabdff1aSopenharmony_ci 1202cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1203cabdff1aSopenharmony_ci src += (5 * stride); 1204cabdff1aSopenharmony_ci 1205cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1206cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1207cabdff1aSopenharmony_ci src32_r, src43_r); 1208cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 1209cabdff1aSopenharmony_ci src32_l, src43_l); 1210cabdff1aSopenharmony_ci 1211cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1212cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1213cabdff1aSopenharmony_ci src += (4 * stride); 1214cabdff1aSopenharmony_ci 1215cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1216cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 1217cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 1218cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 1219cabdff1aSopenharmony_ci src65_l, src76_l, src87_l); 1220cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1221cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1222cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1223cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1224cabdff1aSopenharmony_ci out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 1225cabdff1aSopenharmony_ci 
out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 1226cabdff1aSopenharmony_ci out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 1227cabdff1aSopenharmony_ci out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 1228cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1229cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1230cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 1231cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1232cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1233cabdff1aSopenharmony_ci out3_r, res0, res1, res2, res3); 1234cabdff1aSopenharmony_ci res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2); 1235cabdff1aSopenharmony_ci res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3); 1236cabdff1aSopenharmony_ci res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4); 1237cabdff1aSopenharmony_ci res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5); 1238cabdff1aSopenharmony_ci XORI_B4_128_UB(res0, res1, res2, res3); 1239cabdff1aSopenharmony_ci ST_UB4(res0, res1, res2, res3, dst, stride); 1240cabdff1aSopenharmony_ci dst += (4 * stride); 1241cabdff1aSopenharmony_ci 1242cabdff1aSopenharmony_ci src10_r = src54_r; 1243cabdff1aSopenharmony_ci src32_r = src76_r; 1244cabdff1aSopenharmony_ci src21_r = src65_r; 1245cabdff1aSopenharmony_ci src43_r = src87_r; 1246cabdff1aSopenharmony_ci src10_l = src54_l; 1247cabdff1aSopenharmony_ci src32_l = src76_l; 1248cabdff1aSopenharmony_ci src21_l = src65_l; 1249cabdff1aSopenharmony_ci src43_l = src87_l; 1250cabdff1aSopenharmony_ci src2 = src6; 1251cabdff1aSopenharmony_ci src3 = src7; 1252cabdff1aSopenharmony_ci src4 = src8; 1253cabdff1aSopenharmony_ci } 1254cabdff1aSopenharmony_ci} 1255cabdff1aSopenharmony_ci 1256cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, 1257cabdff1aSopenharmony_ci ptrdiff_t stride) 
1258cabdff1aSopenharmony_ci{ 1259cabdff1aSopenharmony_ci int32_t loop_cnt; 1260cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 1261cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 1262cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 1263cabdff1aSopenharmony_ci v16u8 res0, res1, res2, res3; 1264cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1265cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1266cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 1267cabdff1aSopenharmony_ci v16i8 src65_l, src87_l, filt0, filt1, filt2; 1268cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 1269cabdff1aSopenharmony_ci 1270cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 1271cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 1272cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 1273cabdff1aSopenharmony_ci 1274cabdff1aSopenharmony_ci src -= (stride * 2); 1275cabdff1aSopenharmony_ci 1276cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1277cabdff1aSopenharmony_ci src += (5 * stride); 1278cabdff1aSopenharmony_ci 1279cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1280cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1281cabdff1aSopenharmony_ci src32_r, src43_r); 1282cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 1283cabdff1aSopenharmony_ci src32_l, src43_l); 1284cabdff1aSopenharmony_ci 1285cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1286cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1287cabdff1aSopenharmony_ci src += (4 * stride); 1288cabdff1aSopenharmony_ci 1289cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1290cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, 
src6, src5, src7, src6, src8, src7, src54_r, 1291cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 1292cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 1293cabdff1aSopenharmony_ci src65_l, src76_l, src87_l); 1294cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1295cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1296cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1297cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1298cabdff1aSopenharmony_ci out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 1299cabdff1aSopenharmony_ci out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 1300cabdff1aSopenharmony_ci out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 1301cabdff1aSopenharmony_ci out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 1302cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1303cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1304cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 1305cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1306cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1307cabdff1aSopenharmony_ci out3_r, res0, res1, res2, res3); 1308cabdff1aSopenharmony_ci res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3); 1309cabdff1aSopenharmony_ci res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4); 1310cabdff1aSopenharmony_ci res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5); 1311cabdff1aSopenharmony_ci res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6); 1312cabdff1aSopenharmony_ci XORI_B4_128_UB(res0, res1, res2, res3); 1313cabdff1aSopenharmony_ci ST_UB4(res0, res1, res2, res3, dst, stride); 
1314cabdff1aSopenharmony_ci dst += (4 * stride); 1315cabdff1aSopenharmony_ci 1316cabdff1aSopenharmony_ci src10_r = src54_r; 1317cabdff1aSopenharmony_ci src32_r = src76_r; 1318cabdff1aSopenharmony_ci src21_r = src65_r; 1319cabdff1aSopenharmony_ci src43_r = src87_r; 1320cabdff1aSopenharmony_ci src10_l = src54_l; 1321cabdff1aSopenharmony_ci src32_l = src76_l; 1322cabdff1aSopenharmony_ci src21_l = src65_l; 1323cabdff1aSopenharmony_ci src43_l = src87_l; 1324cabdff1aSopenharmony_ci src3 = src7; 1325cabdff1aSopenharmony_ci src4 = src8; 1326cabdff1aSopenharmony_ci } 1327cabdff1aSopenharmony_ci} 1328cabdff1aSopenharmony_ci 1329cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, 1330cabdff1aSopenharmony_ci ptrdiff_t stride) 1331cabdff1aSopenharmony_ci{ 1332cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 1333cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 1334cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 1335cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1336cabdff1aSopenharmony_ci v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r; 1337cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r; 1338cabdff1aSopenharmony_ci v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3; 1339cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 1340cabdff1aSopenharmony_ci 1341cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 1342cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 1343cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 1344cabdff1aSopenharmony_ci 1345cabdff1aSopenharmony_ci src -= (stride * 2); 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1348cabdff1aSopenharmony_ci src += (5 * stride); 1349cabdff1aSopenharmony_ci LD_SB8(src, stride, 
src5, src6, src7, src8, src9, src10, src11, src12); 1350cabdff1aSopenharmony_ci XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12); 1351cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1352cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1353cabdff1aSopenharmony_ci src32_r, src43_r); 1354cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1355cabdff1aSopenharmony_ci src76_r, src87_r); 1356cabdff1aSopenharmony_ci ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r, 1357cabdff1aSopenharmony_ci src109_r, src1110_r, src1211_r); 1358cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1359cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1360cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1361cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1362cabdff1aSopenharmony_ci out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2); 1363cabdff1aSopenharmony_ci out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2); 1364cabdff1aSopenharmony_ci out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2); 1365cabdff1aSopenharmony_ci out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2); 1366cabdff1aSopenharmony_ci PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1); 1367cabdff1aSopenharmony_ci PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3); 1368cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1369cabdff1aSopenharmony_ci SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 1370cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1371cabdff1aSopenharmony_ci SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 1372cabdff1aSopenharmony_ci 
PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1); 1373cabdff1aSopenharmony_ci PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 1374cabdff1aSopenharmony_ci out0 = __msa_aver_s_b(out0, tmp0); 1375cabdff1aSopenharmony_ci out1 = __msa_aver_s_b(out1, tmp1); 1376cabdff1aSopenharmony_ci out2 = __msa_aver_s_b(out2, tmp2); 1377cabdff1aSopenharmony_ci out3 = __msa_aver_s_b(out3, tmp3); 1378cabdff1aSopenharmony_ci XORI_B4_128_SB(out0, out1, out2, out3); 1379cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1380cabdff1aSopenharmony_ci} 1381cabdff1aSopenharmony_ci 1382cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, 1383cabdff1aSopenharmony_ci ptrdiff_t stride) 1384cabdff1aSopenharmony_ci{ 1385cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 1386cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 1387cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 1388cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1389cabdff1aSopenharmony_ci v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r; 1390cabdff1aSopenharmony_ci v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r; 1391cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; 1392cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 1393cabdff1aSopenharmony_ci 1394cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 1395cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 1396cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 1397cabdff1aSopenharmony_ci 1398cabdff1aSopenharmony_ci src -= (stride * 2); 1399cabdff1aSopenharmony_ci 1400cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1401cabdff1aSopenharmony_ci src += (5 * stride); 1402cabdff1aSopenharmony_ci LD_SB8(src, stride, src5, 
src6, src7, src8, src9, src10, src11, src12); 1403cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1404cabdff1aSopenharmony_ci XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12); 1405cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1406cabdff1aSopenharmony_ci src32_r, src43_r); 1407cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1408cabdff1aSopenharmony_ci src76_r, src87_r); 1409cabdff1aSopenharmony_ci ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r, 1410cabdff1aSopenharmony_ci src109_r, src1110_r, src1211_r); 1411cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 1412cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 1413cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 1414cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 1415cabdff1aSopenharmony_ci out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2); 1416cabdff1aSopenharmony_ci out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2); 1417cabdff1aSopenharmony_ci out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2); 1418cabdff1aSopenharmony_ci out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2); 1419cabdff1aSopenharmony_ci PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1); 1420cabdff1aSopenharmony_ci PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3); 1421cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 1422cabdff1aSopenharmony_ci SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 1423cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1424cabdff1aSopenharmony_ci SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 1425cabdff1aSopenharmony_ci PCKEV_B2_SB(out1_r, 
out0_r, out3_r, out2_r, out0, out1); 1426cabdff1aSopenharmony_ci PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 1427cabdff1aSopenharmony_ci out0 = __msa_aver_s_b(out0, tmp0); 1428cabdff1aSopenharmony_ci out1 = __msa_aver_s_b(out1, tmp1); 1429cabdff1aSopenharmony_ci out2 = __msa_aver_s_b(out2, tmp2); 1430cabdff1aSopenharmony_ci out3 = __msa_aver_s_b(out3, tmp3); 1431cabdff1aSopenharmony_ci XORI_B4_128_SB(out0, out1, out2, out3); 1432cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 1433cabdff1aSopenharmony_ci} 1434cabdff1aSopenharmony_ci 1435cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, 1436cabdff1aSopenharmony_ci ptrdiff_t stride) 1437cabdff1aSopenharmony_ci{ 1438cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 1439cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 1440cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 1441cabdff1aSopenharmony_ci v16u8 out; 1442cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1443cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1444cabdff1aSopenharmony_ci v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 1445cabdff1aSopenharmony_ci v8i16 out10, out32; 1446cabdff1aSopenharmony_ci 1447cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 1448cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 1449cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 1450cabdff1aSopenharmony_ci 1451cabdff1aSopenharmony_ci src -= (stride * 2); 1452cabdff1aSopenharmony_ci 1453cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1454cabdff1aSopenharmony_ci src += (5 * stride); 1455cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1456cabdff1aSopenharmony_ci src32_r, src43_r); 1457cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, 
src32_r, src2110, src4332); 1458cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 1459cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1460cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1461cabdff1aSopenharmony_ci src76_r, src87_r); 1462cabdff1aSopenharmony_ci ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 1463cabdff1aSopenharmony_ci XORI_B2_128_SB(src6554, src8776); 1464cabdff1aSopenharmony_ci out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 1465cabdff1aSopenharmony_ci out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 1466cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 5); 1467cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 1468cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out10, out32); 1469cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 1470cabdff1aSopenharmony_ci src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5); 1471cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 1472cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, (v16u8) src32_r); 1473cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 1474cabdff1aSopenharmony_ci} 1475cabdff1aSopenharmony_ci 1476cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, 1477cabdff1aSopenharmony_ci ptrdiff_t stride) 1478cabdff1aSopenharmony_ci{ 1479cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 1480cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 1481cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 1482cabdff1aSopenharmony_ci v16u8 out; 1483cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1484cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 1485cabdff1aSopenharmony_ci v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 
1486cabdff1aSopenharmony_ci v8i16 out10, out32; 1487cabdff1aSopenharmony_ci 1488cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 1489cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 1490cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 1491cabdff1aSopenharmony_ci 1492cabdff1aSopenharmony_ci src -= (stride * 2); 1493cabdff1aSopenharmony_ci 1494cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1495cabdff1aSopenharmony_ci src += (5 * stride); 1496cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1497cabdff1aSopenharmony_ci src32_r, src43_r); 1498cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 1499cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 1500cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1501cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1502cabdff1aSopenharmony_ci src76_r, src87_r); 1503cabdff1aSopenharmony_ci ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 1504cabdff1aSopenharmony_ci XORI_B2_128_SB(src6554, src8776); 1505cabdff1aSopenharmony_ci out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 1506cabdff1aSopenharmony_ci out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 1507cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 5); 1508cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 1509cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out10, out32); 1510cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4); 1511cabdff1aSopenharmony_ci src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6); 1512cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 1513cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, (v16u8) src32_r); 1514cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, 
dst, stride); 1515cabdff1aSopenharmony_ci} 1516cabdff1aSopenharmony_ci 1517cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, 1518cabdff1aSopenharmony_ci ptrdiff_t stride) 1519cabdff1aSopenharmony_ci{ 1520cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride); 1521cabdff1aSopenharmony_ci} 1522cabdff1aSopenharmony_ci 1523cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, 1524cabdff1aSopenharmony_ci ptrdiff_t stride) 1525cabdff1aSopenharmony_ci{ 1526cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride); 1527cabdff1aSopenharmony_ci} 1528cabdff1aSopenharmony_ci 1529cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, 1530cabdff1aSopenharmony_ci ptrdiff_t stride) 1531cabdff1aSopenharmony_ci{ 1532cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst, 1533cabdff1aSopenharmony_ci stride); 1534cabdff1aSopenharmony_ci} 1535cabdff1aSopenharmony_ci 1536cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, 1537cabdff1aSopenharmony_ci ptrdiff_t stride) 1538cabdff1aSopenharmony_ci{ 1539cabdff1aSopenharmony_ci avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst, 1540cabdff1aSopenharmony_ci stride); 1541cabdff1aSopenharmony_ci} 1542cabdff1aSopenharmony_ci 1543cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, 1544cabdff1aSopenharmony_ci ptrdiff_t stride) 1545cabdff1aSopenharmony_ci{ 1546cabdff1aSopenharmony_ci avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride); 1547cabdff1aSopenharmony_ci} 1548cabdff1aSopenharmony_ci 1549cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, 1550cabdff1aSopenharmony_ci ptrdiff_t stride) 1551cabdff1aSopenharmony_ci{ 1552cabdff1aSopenharmony_ci avc_luma_hv_qrt_8x8_msa(src - 2, src - 
(stride * 2) + 1, dst, stride); 1553cabdff1aSopenharmony_ci} 1554cabdff1aSopenharmony_ci 1555cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, 1556cabdff1aSopenharmony_ci ptrdiff_t stride) 1557cabdff1aSopenharmony_ci{ 1558cabdff1aSopenharmony_ci avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride); 1559cabdff1aSopenharmony_ci} 1560cabdff1aSopenharmony_ci 1561cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, 1562cabdff1aSopenharmony_ci ptrdiff_t stride) 1563cabdff1aSopenharmony_ci{ 1564cabdff1aSopenharmony_ci avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst, 1565cabdff1aSopenharmony_ci stride); 1566cabdff1aSopenharmony_ci} 1567cabdff1aSopenharmony_ci 1568cabdff1aSopenharmony_ci 1569cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, 1570cabdff1aSopenharmony_ci ptrdiff_t stride) 1571cabdff1aSopenharmony_ci{ 1572cabdff1aSopenharmony_ci avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride); 1573cabdff1aSopenharmony_ci} 1574cabdff1aSopenharmony_ci 1575cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, 1576cabdff1aSopenharmony_ci ptrdiff_t stride) 1577cabdff1aSopenharmony_ci{ 1578cabdff1aSopenharmony_ci avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride); 1579cabdff1aSopenharmony_ci} 1580cabdff1aSopenharmony_ci 1581cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, 1582cabdff1aSopenharmony_ci ptrdiff_t stride) 1583cabdff1aSopenharmony_ci{ 1584cabdff1aSopenharmony_ci avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride); 1585cabdff1aSopenharmony_ci} 1586cabdff1aSopenharmony_ci 1587cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, 1588cabdff1aSopenharmony_ci ptrdiff_t stride) 1589cabdff1aSopenharmony_ci{ 1590cabdff1aSopenharmony_ci 
avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst, 1591cabdff1aSopenharmony_ci stride); 1592cabdff1aSopenharmony_ci} 1593cabdff1aSopenharmony_ci 1594cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, 1595cabdff1aSopenharmony_ci ptrdiff_t stride) 1596cabdff1aSopenharmony_ci{ 1597cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst; 1598cabdff1aSopenharmony_ci const uint8_t *src_tmp = src - (2 * stride) - 2; 1599cabdff1aSopenharmony_ci uint32_t multiple8_cnt, loop_cnt; 1600cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 1601cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 1602cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 1603cabdff1aSopenharmony_ci v16u8 out0, out1; 1604cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 1605cabdff1aSopenharmony_ci v16i8 mask2; 1606cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1607cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1608cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1609cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 1610cabdff1aSopenharmony_ci v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 1611cabdff1aSopenharmony_ci v8i16 hz_out87_l, filt0, filt1, filt2; 1612cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 1613cabdff1aSopenharmony_ci 1614cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 1615cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 1616cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 1617cabdff1aSopenharmony_ci 1618cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1619cabdff1aSopenharmony_ci 1620cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 1621cabdff1aSopenharmony_ci dst = dst_tmp; 
1622cabdff1aSopenharmony_ci src = src_tmp; 1623cabdff1aSopenharmony_ci 1624cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1625cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1626cabdff1aSopenharmony_ci src += (5 * stride); 1627cabdff1aSopenharmony_ci 1628cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1629cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1630cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1631cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1632cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 1633cabdff1aSopenharmony_ci 1634cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1635cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1636cabdff1aSopenharmony_ci src += (4 * stride); 1637cabdff1aSopenharmony_ci 1638cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1639cabdff1aSopenharmony_ci 1640cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1641cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1642cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1643cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 1644cabdff1aSopenharmony_ci 1645cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1646cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 1647cabdff1aSopenharmony_ci hz_out43_r); 1648cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1649cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 1650cabdff1aSopenharmony_ci hz_out43_l); 1651cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, 
hz_out6, 1652cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 1653cabdff1aSopenharmony_ci hz_out87_r); 1654cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 1655cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 1656cabdff1aSopenharmony_ci hz_out87_l); 1657cabdff1aSopenharmony_ci 1658cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 1659cabdff1aSopenharmony_ci filt1, filt2); 1660cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 1661cabdff1aSopenharmony_ci filt1, filt2); 1662cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1663cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 1664cabdff1aSopenharmony_ci filt1, filt2); 1665cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 1666cabdff1aSopenharmony_ci filt1, filt2); 1667cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1668cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 1669cabdff1aSopenharmony_ci filt1, filt2); 1670cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 1671cabdff1aSopenharmony_ci filt1, filt2); 1672cabdff1aSopenharmony_ci dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1673cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 1674cabdff1aSopenharmony_ci filt1, filt2); 1675cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 1676cabdff1aSopenharmony_ci filt1, filt2); 1677cabdff1aSopenharmony_ci dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1678cabdff1aSopenharmony_ci 1679cabdff1aSopenharmony_ci dst1 = __msa_srari_h(hz_out2, 5); 1680cabdff1aSopenharmony_ci dst3 = __msa_srari_h(hz_out3, 5); 1681cabdff1aSopenharmony_ci dst5 = __msa_srari_h(hz_out4, 5); 
1682cabdff1aSopenharmony_ci dst7 = __msa_srari_h(hz_out5, 5); 1683cabdff1aSopenharmony_ci SAT_SH4_SH(dst1, dst3, dst5, dst7, 7); 1684cabdff1aSopenharmony_ci 1685cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, dst1); 1686cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst2, dst3); 1687cabdff1aSopenharmony_ci dst2 = __msa_aver_s_h(dst4, dst5); 1688cabdff1aSopenharmony_ci dst3 = __msa_aver_s_h(dst6, dst7); 1689cabdff1aSopenharmony_ci 1690cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 1691cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 1692cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1693cabdff1aSopenharmony_ci dst += (4 * stride); 1694cabdff1aSopenharmony_ci 1695cabdff1aSopenharmony_ci hz_out0 = hz_out4; 1696cabdff1aSopenharmony_ci hz_out1 = hz_out5; 1697cabdff1aSopenharmony_ci hz_out2 = hz_out6; 1698cabdff1aSopenharmony_ci hz_out3 = hz_out7; 1699cabdff1aSopenharmony_ci hz_out4 = hz_out8; 1700cabdff1aSopenharmony_ci } 1701cabdff1aSopenharmony_ci 1702cabdff1aSopenharmony_ci src_tmp += 8; 1703cabdff1aSopenharmony_ci dst_tmp += 8; 1704cabdff1aSopenharmony_ci } 1705cabdff1aSopenharmony_ci} 1706cabdff1aSopenharmony_ci 1707cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, 1708cabdff1aSopenharmony_ci ptrdiff_t stride) 1709cabdff1aSopenharmony_ci{ 1710cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst; 1711cabdff1aSopenharmony_ci const uint8_t *src_tmp = src - (2 * stride) - 2; 1712cabdff1aSopenharmony_ci uint32_t multiple8_cnt, loop_cnt; 1713cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 1714cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 1715cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 1716cabdff1aSopenharmony_ci v16u8 out0, out1; 1717cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 1718cabdff1aSopenharmony_ci v16i8 mask2; 1719cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, 
hz_out4, hz_out5, hz_out6; 1720cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1721cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1722cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 1723cabdff1aSopenharmony_ci v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 1724cabdff1aSopenharmony_ci v8i16 hz_out87_l, filt0, filt1, filt2; 1725cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 1726cabdff1aSopenharmony_ci 1727cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 1728cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 1729cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 1730cabdff1aSopenharmony_ci 1731cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1732cabdff1aSopenharmony_ci 1733cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 1734cabdff1aSopenharmony_ci dst = dst_tmp; 1735cabdff1aSopenharmony_ci src = src_tmp; 1736cabdff1aSopenharmony_ci 1737cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1738cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1739cabdff1aSopenharmony_ci src += (5 * stride); 1740cabdff1aSopenharmony_ci 1741cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1742cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1743cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1744cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1745cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 1746cabdff1aSopenharmony_ci 1747cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 1748cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1749cabdff1aSopenharmony_ci src += (4 * stride); 
1750cabdff1aSopenharmony_ci 1751cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1752cabdff1aSopenharmony_ci 1753cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1754cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1755cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1756cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 1757cabdff1aSopenharmony_ci 1758cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1759cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 1760cabdff1aSopenharmony_ci hz_out43_r); 1761cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 1762cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 1763cabdff1aSopenharmony_ci hz_out43_l); 1764cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 1765cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 1766cabdff1aSopenharmony_ci hz_out87_r); 1767cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 1768cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 1769cabdff1aSopenharmony_ci hz_out87_l); 1770cabdff1aSopenharmony_ci 1771cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 1772cabdff1aSopenharmony_ci filt1, filt2); 1773cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 1774cabdff1aSopenharmony_ci filt1, filt2); 1775cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1776cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 1777cabdff1aSopenharmony_ci filt1, filt2); 1778cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 
1779cabdff1aSopenharmony_ci filt1, filt2); 1780cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1781cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 1782cabdff1aSopenharmony_ci filt1, filt2); 1783cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 1784cabdff1aSopenharmony_ci filt1, filt2); 1785cabdff1aSopenharmony_ci dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1786cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 1787cabdff1aSopenharmony_ci filt1, filt2); 1788cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 1789cabdff1aSopenharmony_ci filt1, filt2); 1790cabdff1aSopenharmony_ci dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1791cabdff1aSopenharmony_ci 1792cabdff1aSopenharmony_ci dst1 = __msa_srari_h(hz_out3, 5); 1793cabdff1aSopenharmony_ci dst3 = __msa_srari_h(hz_out4, 5); 1794cabdff1aSopenharmony_ci dst5 = __msa_srari_h(hz_out5, 5); 1795cabdff1aSopenharmony_ci dst7 = __msa_srari_h(hz_out6, 5); 1796cabdff1aSopenharmony_ci SAT_SH4_SH(dst1, dst3, dst5, dst7, 7); 1797cabdff1aSopenharmony_ci 1798cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, dst1); 1799cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst2, dst3); 1800cabdff1aSopenharmony_ci dst2 = __msa_aver_s_h(dst4, dst5); 1801cabdff1aSopenharmony_ci dst3 = __msa_aver_s_h(dst6, dst7); 1802cabdff1aSopenharmony_ci 1803cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 1804cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 1805cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1806cabdff1aSopenharmony_ci dst += (4 * stride); 1807cabdff1aSopenharmony_ci 1808cabdff1aSopenharmony_ci hz_out0 = hz_out4; 1809cabdff1aSopenharmony_ci hz_out1 = hz_out5; 1810cabdff1aSopenharmony_ci hz_out2 = hz_out6; 1811cabdff1aSopenharmony_ci hz_out3 = hz_out7; 1812cabdff1aSopenharmony_ci hz_out4 = hz_out8; 
1813cabdff1aSopenharmony_ci } 1814cabdff1aSopenharmony_ci 1815cabdff1aSopenharmony_ci src_tmp += 8; 1816cabdff1aSopenharmony_ci dst_tmp += 8; 1817cabdff1aSopenharmony_ci } 1818cabdff1aSopenharmony_ci} 1819cabdff1aSopenharmony_ci 1820cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, 1821cabdff1aSopenharmony_ci ptrdiff_t stride) 1822cabdff1aSopenharmony_ci{ 1823cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 1824cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 1825cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 1826cabdff1aSopenharmony_ci v16u8 out0, out1; 1827cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1828cabdff1aSopenharmony_ci v16i8 src11, src12, mask0, mask1, mask2; 1829cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1830cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 1831cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1832cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 1833cabdff1aSopenharmony_ci v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3; 1834cabdff1aSopenharmony_ci v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 1835cabdff1aSopenharmony_ci v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 1836cabdff1aSopenharmony_ci v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 1837cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 1838cabdff1aSopenharmony_ci 1839cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1840cabdff1aSopenharmony_ci 1841cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 1842cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 1843cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 1844cabdff1aSopenharmony_ci 1845cabdff1aSopenharmony_ci 
src -= ((2 * stride) + 2); 1846cabdff1aSopenharmony_ci 1847cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1848cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1849cabdff1aSopenharmony_ci src += (5 * stride); 1850cabdff1aSopenharmony_ci 1851cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1852cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1853cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1854cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1855cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 1856cabdff1aSopenharmony_ci 1857cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1858cabdff1aSopenharmony_ci src += (4 * stride); 1859cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1860cabdff1aSopenharmony_ci 1861cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1862cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1863cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1864cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 1865cabdff1aSopenharmony_ci 1866cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 1867cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 1868cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 1869cabdff1aSopenharmony_ci hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 1870cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 1871cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 1872cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, 
hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 1873cabdff1aSopenharmony_ci hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 1874cabdff1aSopenharmony_ci 1875cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 1876cabdff1aSopenharmony_ci filt2); 1877cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 1878cabdff1aSopenharmony_ci filt2); 1879cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1880cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 1881cabdff1aSopenharmony_ci filt2); 1882cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 1883cabdff1aSopenharmony_ci filt2); 1884cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1885cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 1886cabdff1aSopenharmony_ci filt2); 1887cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 1888cabdff1aSopenharmony_ci filt2); 1889cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1890cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 1891cabdff1aSopenharmony_ci filt2); 1892cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 1893cabdff1aSopenharmony_ci filt2); 1894cabdff1aSopenharmony_ci dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1895cabdff1aSopenharmony_ci 1896cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5); 1897cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7); 1898cabdff1aSopenharmony_ci 1899cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out2); 1900cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst1, hz_out3); 1901cabdff1aSopenharmony_ci dst2 = __msa_aver_s_h(dst2, hz_out4); 1902cabdff1aSopenharmony_ci 
dst3 = __msa_aver_s_h(dst3, hz_out5); 1903cabdff1aSopenharmony_ci 1904cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 1905cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 1906cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1907cabdff1aSopenharmony_ci dst += (4 * stride); 1908cabdff1aSopenharmony_ci 1909cabdff1aSopenharmony_ci LD_SB4(src, stride, src9, src10, src11, src12); 1910cabdff1aSopenharmony_ci XORI_B4_128_SB(src9, src10, src11, src12); 1911cabdff1aSopenharmony_ci hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 1912cabdff1aSopenharmony_ci hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 1913cabdff1aSopenharmony_ci hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 1914cabdff1aSopenharmony_ci hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 1915cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 1916cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 1917cabdff1aSopenharmony_ci hz_out1211_r); 1918cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 1919cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 1920cabdff1aSopenharmony_ci hz_out1211_l); 1921cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 1922cabdff1aSopenharmony_ci filt2); 1923cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 1924cabdff1aSopenharmony_ci filt2); 1925cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1926cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 1927cabdff1aSopenharmony_ci filt2); 1928cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 1929cabdff1aSopenharmony_ci filt2); 1930cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) 
tmp1, (v8i16) tmp0); 1931cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 1932cabdff1aSopenharmony_ci filt2); 1933cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 1934cabdff1aSopenharmony_ci filt2); 1935cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1936cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 1937cabdff1aSopenharmony_ci filt2); 1938cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 1939cabdff1aSopenharmony_ci filt2); 1940cabdff1aSopenharmony_ci dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 1941cabdff1aSopenharmony_ci 1942cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5); 1943cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7); 1944cabdff1aSopenharmony_ci 1945cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out6); 1946cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst1, hz_out7); 1947cabdff1aSopenharmony_ci dst2 = __msa_aver_s_h(dst2, hz_out8); 1948cabdff1aSopenharmony_ci dst3 = __msa_aver_s_h(dst3, hz_out9); 1949cabdff1aSopenharmony_ci 1950cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 1951cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 1952cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 1953cabdff1aSopenharmony_ci} 1954cabdff1aSopenharmony_ci 1955cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, 1956cabdff1aSopenharmony_ci ptrdiff_t stride) 1957cabdff1aSopenharmony_ci{ 1958cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 1959cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 1960cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 1961cabdff1aSopenharmony_ci v16u8 out0, out1; 1962cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, 
src9, src10; 1963cabdff1aSopenharmony_ci v16i8 src11, src12, mask0, mask1, mask2; 1964cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1965cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 1966cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 1967cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 1968cabdff1aSopenharmony_ci v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3; 1969cabdff1aSopenharmony_ci v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 1970cabdff1aSopenharmony_ci v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 1971cabdff1aSopenharmony_ci v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 1972cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 1973cabdff1aSopenharmony_ci 1974cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 1975cabdff1aSopenharmony_ci 1976cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 1977cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 1978cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 1979cabdff1aSopenharmony_ci 1980cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 1981cabdff1aSopenharmony_ci 1982cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 1983cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1984cabdff1aSopenharmony_ci src += (5 * stride); 1985cabdff1aSopenharmony_ci 1986cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 1987cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 1988cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 1989cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 1990cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, 
mask2); 1991cabdff1aSopenharmony_ci 1992cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 1993cabdff1aSopenharmony_ci src += (4 * stride); 1994cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1995cabdff1aSopenharmony_ci 1996cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 1997cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 1998cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 1999cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 2000cabdff1aSopenharmony_ci 2001cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2002cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 2003cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2004cabdff1aSopenharmony_ci hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 2005cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2006cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 2007cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2008cabdff1aSopenharmony_ci hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 2009cabdff1aSopenharmony_ci 2010cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 2011cabdff1aSopenharmony_ci filt2); 2012cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 2013cabdff1aSopenharmony_ci filt2); 2014cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2015cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 2016cabdff1aSopenharmony_ci filt2); 2017cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, 
filt0, filt1, 2018cabdff1aSopenharmony_ci filt2); 2019cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2020cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 2021cabdff1aSopenharmony_ci filt2); 2022cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 2023cabdff1aSopenharmony_ci filt2); 2024cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2025cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 2026cabdff1aSopenharmony_ci filt2); 2027cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 2028cabdff1aSopenharmony_ci filt2); 2029cabdff1aSopenharmony_ci dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2030cabdff1aSopenharmony_ci 2031cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5); 2032cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7); 2033cabdff1aSopenharmony_ci 2034cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out3); 2035cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst1, hz_out4); 2036cabdff1aSopenharmony_ci dst2 = __msa_aver_s_h(dst2, hz_out5); 2037cabdff1aSopenharmony_ci dst3 = __msa_aver_s_h(dst3, hz_out6); 2038cabdff1aSopenharmony_ci 2039cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 2040cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 2041cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 2042cabdff1aSopenharmony_ci dst += (4 * stride); 2043cabdff1aSopenharmony_ci 2044cabdff1aSopenharmony_ci LD_SB4(src, stride, src9, src10, src11, src12); 2045cabdff1aSopenharmony_ci XORI_B4_128_SB(src9, src10, src11, src12); 2046cabdff1aSopenharmony_ci hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 2047cabdff1aSopenharmony_ci hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 2048cabdff1aSopenharmony_ci hz_out11 = 
AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 2049cabdff1aSopenharmony_ci hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 2050cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 2051cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 2052cabdff1aSopenharmony_ci hz_out1211_r); 2053cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 2054cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 2055cabdff1aSopenharmony_ci hz_out1211_l); 2056cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 2057cabdff1aSopenharmony_ci filt2); 2058cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 2059cabdff1aSopenharmony_ci filt2); 2060cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2061cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 2062cabdff1aSopenharmony_ci filt2); 2063cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 2064cabdff1aSopenharmony_ci filt2); 2065cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2066cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 2067cabdff1aSopenharmony_ci filt2); 2068cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 2069cabdff1aSopenharmony_ci filt2); 2070cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2071cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 2072cabdff1aSopenharmony_ci filt2); 2073cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 2074cabdff1aSopenharmony_ci filt2); 2075cabdff1aSopenharmony_ci dst3 = __msa_pckev_h((v8i16) tmp1, 
(v8i16) tmp0); 2076cabdff1aSopenharmony_ci 2077cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5); 2078cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7); 2079cabdff1aSopenharmony_ci 2080cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out7); 2081cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst1, hz_out8); 2082cabdff1aSopenharmony_ci dst2 = __msa_aver_s_h(dst2, hz_out9); 2083cabdff1aSopenharmony_ci dst3 = __msa_aver_s_h(dst3, hz_out10); 2084cabdff1aSopenharmony_ci 2085cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 2086cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 2087cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 2088cabdff1aSopenharmony_ci} 2089cabdff1aSopenharmony_ci 2090cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, 2091cabdff1aSopenharmony_ci ptrdiff_t stride) 2092cabdff1aSopenharmony_ci{ 2093cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 2094cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 2095cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 2096cabdff1aSopenharmony_ci v16u8 res; 2097cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2098cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2; 2099cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2100cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 2101cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2102cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 2103cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 2104cabdff1aSopenharmony_ci 2105cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 2106cabdff1aSopenharmony_ci 2107cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 2108cabdff1aSopenharmony_ci filt1 = (v8i16) 
__msa_fill_w(filt_const1); 2109cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 2110cabdff1aSopenharmony_ci 2111cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 2112cabdff1aSopenharmony_ci 2113cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2114cabdff1aSopenharmony_ci src += (5 * stride); 2115cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 2116cabdff1aSopenharmony_ci 2117cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2118cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 2119cabdff1aSopenharmony_ci 2120cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 2121cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 2122cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 2123cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 2124cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 2125cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 2126cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 2127cabdff1aSopenharmony_ci 2128cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2129cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 2130cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2131cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 2132cabdff1aSopenharmony_ci 2133cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 2134cabdff1aSopenharmony_ci filt2); 2135cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 2136cabdff1aSopenharmony_ci filt2); 2137cabdff1aSopenharmony_ci dst0 = 
__msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2138cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 2139cabdff1aSopenharmony_ci filt2); 2140cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 2141cabdff1aSopenharmony_ci filt2); 2142cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2143cabdff1aSopenharmony_ci 2144cabdff1aSopenharmony_ci SRARI_H2_SH(hz_out2, hz_out4, 5); 2145cabdff1aSopenharmony_ci SAT_SH2_SH(hz_out2, hz_out4, 7); 2146cabdff1aSopenharmony_ci 2147cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out2); 2148cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst1, hz_out4); 2149cabdff1aSopenharmony_ci 2150cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(dst0, dst1); 2151cabdff1aSopenharmony_ci ST_W4(res, 0, 1, 2, 3, dst, stride); 2152cabdff1aSopenharmony_ci} 2153cabdff1aSopenharmony_ci 2154cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, 2155cabdff1aSopenharmony_ci ptrdiff_t stride) 2156cabdff1aSopenharmony_ci{ 2157cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 2158cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 2159cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 2160cabdff1aSopenharmony_ci v16u8 res; 2161cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2162cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2; 2163cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2164cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 2165cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2166cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 2167cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 2168cabdff1aSopenharmony_ci 2169cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 2170cabdff1aSopenharmony_ci 
2171cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 2172cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 2173cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 2174cabdff1aSopenharmony_ci 2175cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 2176cabdff1aSopenharmony_ci 2177cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2178cabdff1aSopenharmony_ci src += (5 * stride); 2179cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 2180cabdff1aSopenharmony_ci 2181cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2182cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 2183cabdff1aSopenharmony_ci 2184cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 2185cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 2186cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 2187cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 2188cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 2189cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 2190cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 2191cabdff1aSopenharmony_ci 2192cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 2193cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 2194cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 2195cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 2196cabdff1aSopenharmony_ci 2197cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 2198cabdff1aSopenharmony_ci filt2); 2199cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_r, 
hz_out43_r, hz_out65_r, filt0, filt1, 2200cabdff1aSopenharmony_ci filt2); 2201cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2202cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 2203cabdff1aSopenharmony_ci filt2); 2204cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 2205cabdff1aSopenharmony_ci filt2); 2206cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2207cabdff1aSopenharmony_ci 2208cabdff1aSopenharmony_ci PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1); 2209cabdff1aSopenharmony_ci SRARI_H2_SH(hz_out0, hz_out1, 5); 2210cabdff1aSopenharmony_ci SAT_SH2_SH(hz_out0, hz_out1, 7); 2211cabdff1aSopenharmony_ci 2212cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out0); 2213cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst1, hz_out1); 2214cabdff1aSopenharmony_ci 2215cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(dst0, dst1); 2216cabdff1aSopenharmony_ci ST_W4(res, 0, 1, 2, 3, dst, stride); 2217cabdff1aSopenharmony_ci} 2218cabdff1aSopenharmony_ci 2219cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, 2220cabdff1aSopenharmony_ci ptrdiff_t stride) 2221cabdff1aSopenharmony_ci{ 2222cabdff1aSopenharmony_ci int32_t loop_cnt; 2223cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 2224cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 2225cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 2226cabdff1aSopenharmony_ci v16u8 res0, res1, res2, res3; 2227cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2228cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 2229cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 2230cabdff1aSopenharmony_ci v16i8 src65_l, src87_l, filt0, filt1, filt2; 2231cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, 
out1_l, out2_l, out3_l; 2232cabdff1aSopenharmony_ci 2233cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 2234cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 2235cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 2236cabdff1aSopenharmony_ci src -= (stride * 2); 2237cabdff1aSopenharmony_ci 2238cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2239cabdff1aSopenharmony_ci src += (5 * stride); 2240cabdff1aSopenharmony_ci 2241cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2242cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2243cabdff1aSopenharmony_ci src32_r, src43_r); 2244cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 2245cabdff1aSopenharmony_ci src32_l, src43_l); 2246cabdff1aSopenharmony_ci 2247cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2248cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 2249cabdff1aSopenharmony_ci src += (4 * stride); 2250cabdff1aSopenharmony_ci 2251cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 2252cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 2253cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 2254cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 2255cabdff1aSopenharmony_ci src65_l, src76_l, src87_l); 2256cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 2257cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 2258cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 2259cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 2260cabdff1aSopenharmony_ci out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 
2261cabdff1aSopenharmony_ci out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 2262cabdff1aSopenharmony_ci out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 2263cabdff1aSopenharmony_ci out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 2264cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 2265cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2266cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 2267cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 2268cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 2269cabdff1aSopenharmony_ci out3_r, res0, res1, res2, res3); 2270cabdff1aSopenharmony_ci XORI_B4_128_UB(res0, res1, res2, res3); 2271cabdff1aSopenharmony_ci ST_UB4(res0, res1, res2, res3, dst, stride); 2272cabdff1aSopenharmony_ci dst += (4 * stride); 2273cabdff1aSopenharmony_ci 2274cabdff1aSopenharmony_ci src10_r = src54_r; 2275cabdff1aSopenharmony_ci src32_r = src76_r; 2276cabdff1aSopenharmony_ci src21_r = src65_r; 2277cabdff1aSopenharmony_ci src43_r = src87_r; 2278cabdff1aSopenharmony_ci src10_l = src54_l; 2279cabdff1aSopenharmony_ci src32_l = src76_l; 2280cabdff1aSopenharmony_ci src21_l = src65_l; 2281cabdff1aSopenharmony_ci src43_l = src87_l; 2282cabdff1aSopenharmony_ci src4 = src8; 2283cabdff1aSopenharmony_ci } 2284cabdff1aSopenharmony_ci} 2285cabdff1aSopenharmony_ci 2286cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, 2287cabdff1aSopenharmony_ci ptrdiff_t stride) 2288cabdff1aSopenharmony_ci{ 2289cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 2290cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 2291cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 2292cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 2293cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, 
src10; 2294cabdff1aSopenharmony_ci v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r; 2295cabdff1aSopenharmony_ci v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r; 2296cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2; 2297cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 2298cabdff1aSopenharmony_ci 2299cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 2300cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 2301cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 2302cabdff1aSopenharmony_ci 2303cabdff1aSopenharmony_ci src -= (stride * 2); 2304cabdff1aSopenharmony_ci 2305cabdff1aSopenharmony_ci LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); 2306cabdff1aSopenharmony_ci src += (8 * stride); 2307cabdff1aSopenharmony_ci LD_SB5(src, stride, src8, src9, src10, src11, src12); 2308cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2309cabdff1aSopenharmony_ci src32_r, src43_r); 2310cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r, 2311cabdff1aSopenharmony_ci src98_r, src109_r); 2312cabdff1aSopenharmony_ci ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r, 2313cabdff1aSopenharmony_ci src910_r, src1110_r, src1211_r); 2314cabdff1aSopenharmony_ci XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r); 2315cabdff1aSopenharmony_ci XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r); 2316cabdff1aSopenharmony_ci XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r); 2317cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 2318cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 2319cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 2320cabdff1aSopenharmony_ci out3_r = 
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 2321cabdff1aSopenharmony_ci out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2); 2322cabdff1aSopenharmony_ci out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2); 2323cabdff1aSopenharmony_ci out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2); 2324cabdff1aSopenharmony_ci out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2); 2325cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 2326cabdff1aSopenharmony_ci SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 2327cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 2328cabdff1aSopenharmony_ci SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 2329cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(out0_r, out1_r); 2330cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(out2_r, out3_r); 2331cabdff1aSopenharmony_ci out2 = PCKEV_XORI128_UB(out4_r, out5_r); 2332cabdff1aSopenharmony_ci out3 = PCKEV_XORI128_UB(out6_r, out7_r); 2333cabdff1aSopenharmony_ci ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 2334cabdff1aSopenharmony_ci} 2335cabdff1aSopenharmony_ci 2336cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, 2337cabdff1aSopenharmony_ci ptrdiff_t stride) 2338cabdff1aSopenharmony_ci{ 2339cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 2340cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 2341cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 2342cabdff1aSopenharmony_ci v16u8 out; 2343cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2344cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 2345cabdff1aSopenharmony_ci v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 2346cabdff1aSopenharmony_ci v8i16 out10, out32; 2347cabdff1aSopenharmony_ci 2348cabdff1aSopenharmony_ci 
filt0 = (v16i8) __msa_fill_h(filt_const0); 2349cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 2350cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 2351cabdff1aSopenharmony_ci 2352cabdff1aSopenharmony_ci src -= (stride * 2); 2353cabdff1aSopenharmony_ci 2354cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2355cabdff1aSopenharmony_ci src += (5 * stride); 2356cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 2357cabdff1aSopenharmony_ci 2358cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2359cabdff1aSopenharmony_ci src32_r, src43_r); 2360cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 2361cabdff1aSopenharmony_ci src76_r, src87_r); 2362cabdff1aSopenharmony_ci ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, 2363cabdff1aSopenharmony_ci src76_r, src2110, src4332, src6554, src8776); 2364cabdff1aSopenharmony_ci XORI_B4_128_SB(src2110, src4332, src6554, src8776); 2365cabdff1aSopenharmony_ci out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 2366cabdff1aSopenharmony_ci out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 2367cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 5); 2368cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 2369cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out10, out32); 2370cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 2371cabdff1aSopenharmony_ci} 2372cabdff1aSopenharmony_ci 2373cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, 2374cabdff1aSopenharmony_ci ptrdiff_t stride) 2375cabdff1aSopenharmony_ci{ 2376cabdff1aSopenharmony_ci uint32_t row; 2377cabdff1aSopenharmony_ci v16u8 out; 2378cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2379cabdff1aSopenharmony_ci v16i8 src11; 
2380cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3; 2381cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2382cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 2383cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2384cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2385cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2386cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2387cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 2388cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 2389cabdff1aSopenharmony_ci 2390cabdff1aSopenharmony_ci mask3 = mask0 + 4; 2391cabdff1aSopenharmony_ci mask4 = mask1 + 4; 2392cabdff1aSopenharmony_ci mask5 = mask2 + 4; 2393cabdff1aSopenharmony_ci 2394cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 2395cabdff1aSopenharmony_ci 2396cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2397cabdff1aSopenharmony_ci LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 2398cabdff1aSopenharmony_ci src += (5 * stride); 2399cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2400cabdff1aSopenharmony_ci XORI_B5_128_SB(src7, src8, src9, src10, src11); 2401cabdff1aSopenharmony_ci 2402cabdff1aSopenharmony_ci for (row = 16; row--;) { 2403cabdff1aSopenharmony_ci LD_SB2(src, 8, src5, src6); 2404cabdff1aSopenharmony_ci src += stride; 2405cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 2406cabdff1aSopenharmony_ci 2407cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2408cabdff1aSopenharmony_ci vt_res0, vt_res1); 2409cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 2410cabdff1aSopenharmony_ci vt_res2, vt_res3); 2411cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, 
mask0, 2412cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2413cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2414cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2415cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2416cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2417cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2418cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2419cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2420cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2421cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2422cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 2423cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2424cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2425cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2426cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2427cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2428cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2429cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 2430cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 2431cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec8, 5); 2432cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec11, 5); 2433cabdff1aSopenharmony_ci SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2434cabdff1aSopenharmony_ci PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1); 2435cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2436cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst2, dst0); 
2437cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst3, dst1); 2438cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst1); 2439cabdff1aSopenharmony_ci ST_UB(out, dst); 2440cabdff1aSopenharmony_ci dst += stride; 2441cabdff1aSopenharmony_ci 2442cabdff1aSopenharmony_ci src0 = src1; 2443cabdff1aSopenharmony_ci src1 = src2; 2444cabdff1aSopenharmony_ci src2 = src3; 2445cabdff1aSopenharmony_ci src3 = src4; 2446cabdff1aSopenharmony_ci src4 = src5; 2447cabdff1aSopenharmony_ci src7 = src8; 2448cabdff1aSopenharmony_ci src8 = src9; 2449cabdff1aSopenharmony_ci src9 = src10; 2450cabdff1aSopenharmony_ci src10 = src11; 2451cabdff1aSopenharmony_ci src11 = src6; 2452cabdff1aSopenharmony_ci } 2453cabdff1aSopenharmony_ci} 2454cabdff1aSopenharmony_ci 2455cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, 2456cabdff1aSopenharmony_ci ptrdiff_t stride) 2457cabdff1aSopenharmony_ci{ 2458cabdff1aSopenharmony_ci uint32_t row; 2459cabdff1aSopenharmony_ci v16u8 out; 2460cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2461cabdff1aSopenharmony_ci v16i8 src11; 2462cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3; 2463cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2464cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 2465cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2466cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2467cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2468cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2469cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 2470cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 2471cabdff1aSopenharmony_ci 2472cabdff1aSopenharmony_ci mask3 = mask0 + 4; 2473cabdff1aSopenharmony_ci mask4 = mask1 + 4; 2474cabdff1aSopenharmony_ci mask5 
= mask2 + 4; 2475cabdff1aSopenharmony_ci 2476cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 2477cabdff1aSopenharmony_ci 2478cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2479cabdff1aSopenharmony_ci LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 2480cabdff1aSopenharmony_ci src += (5 * stride); 2481cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2482cabdff1aSopenharmony_ci XORI_B5_128_SB(src7, src8, src9, src10, src11); 2483cabdff1aSopenharmony_ci 2484cabdff1aSopenharmony_ci for (row = 16; row--;) { 2485cabdff1aSopenharmony_ci LD_SB2(src, 8, src5, src6); 2486cabdff1aSopenharmony_ci src += stride; 2487cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 2488cabdff1aSopenharmony_ci 2489cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2490cabdff1aSopenharmony_ci vt_res0, vt_res1); 2491cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 2492cabdff1aSopenharmony_ci vt_res2, vt_res3); 2493cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2494cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2495cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2496cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2497cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2498cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2499cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2500cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2501cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2502cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2503cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2504cabdff1aSopenharmony_ci hz_res3 = 
__msa_hadd_s_w(shf_vec9, shf_vec9); 2505cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2506cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2507cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2508cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2509cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2510cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2511cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 2512cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 2513cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec8, 5); 2514cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec11, 5); 2515cabdff1aSopenharmony_ci SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2516cabdff1aSopenharmony_ci dst0 = __msa_pckod_h(dst2, dst0); 2517cabdff1aSopenharmony_ci dst1 = __msa_pckod_h(dst3, dst1); 2518cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2519cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst2, dst0); 2520cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst3, dst1); 2521cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst1); 2522cabdff1aSopenharmony_ci ST_UB(out, dst); 2523cabdff1aSopenharmony_ci dst += stride; 2524cabdff1aSopenharmony_ci 2525cabdff1aSopenharmony_ci src0 = src1; 2526cabdff1aSopenharmony_ci src1 = src2; 2527cabdff1aSopenharmony_ci src2 = src3; 2528cabdff1aSopenharmony_ci src3 = src4; 2529cabdff1aSopenharmony_ci src4 = src5; 2530cabdff1aSopenharmony_ci src7 = src8; 2531cabdff1aSopenharmony_ci src8 = src9; 2532cabdff1aSopenharmony_ci src9 = src10; 2533cabdff1aSopenharmony_ci src10 = src11; 2534cabdff1aSopenharmony_ci src11 = src6; 2535cabdff1aSopenharmony_ci } 2536cabdff1aSopenharmony_ci} 2537cabdff1aSopenharmony_ci 2538cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc12_msa(uint8_t 
*dst, const uint8_t *src, 2539cabdff1aSopenharmony_ci ptrdiff_t stride) 2540cabdff1aSopenharmony_ci{ 2541cabdff1aSopenharmony_ci uint32_t row; 2542cabdff1aSopenharmony_ci v16u8 out; 2543cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2544cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3; 2545cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2546cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 2547cabdff1aSopenharmony_ci v8i16 mask3, mask4, mask5; 2548cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2549cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2550cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2551cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2552cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 2553cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 2554cabdff1aSopenharmony_ci 2555cabdff1aSopenharmony_ci mask3 = mask0 + 4; 2556cabdff1aSopenharmony_ci mask4 = mask1 + 4; 2557cabdff1aSopenharmony_ci mask5 = mask2 + 4; 2558cabdff1aSopenharmony_ci 2559cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 2560cabdff1aSopenharmony_ci 2561cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2562cabdff1aSopenharmony_ci src += (5 * stride); 2563cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2564cabdff1aSopenharmony_ci 2565cabdff1aSopenharmony_ci for (row = 4; row--;) { 2566cabdff1aSopenharmony_ci LD_SB2(src, stride, src5, src6); 2567cabdff1aSopenharmony_ci src += (2 * stride); 2568cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 2569cabdff1aSopenharmony_ci 2570cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2571cabdff1aSopenharmony_ci vt_res0, vt_res1); 2572cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 
2573cabdff1aSopenharmony_ci vt_res2, vt_res3); 2574cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2575cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2576cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2577cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2578cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2579cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2580cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2581cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2582cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2583cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2584cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2585cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 2586cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2587cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2588cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2589cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2590cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2591cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2592cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 2593cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 2594cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec8, 5); 2595cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec11, 5); 2596cabdff1aSopenharmony_ci SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2597cabdff1aSopenharmony_ci PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1); 
2598cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2599cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst2, dst0); 2600cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst3, dst1); 2601cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst1); 2602cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, stride); 2603cabdff1aSopenharmony_ci dst += (2 * stride); 2604cabdff1aSopenharmony_ci 2605cabdff1aSopenharmony_ci src0 = src2; 2606cabdff1aSopenharmony_ci src1 = src3; 2607cabdff1aSopenharmony_ci src2 = src4; 2608cabdff1aSopenharmony_ci src3 = src5; 2609cabdff1aSopenharmony_ci src4 = src6; 2610cabdff1aSopenharmony_ci } 2611cabdff1aSopenharmony_ci} 2612cabdff1aSopenharmony_ci 2613cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, 2614cabdff1aSopenharmony_ci ptrdiff_t stride) 2615cabdff1aSopenharmony_ci{ 2616cabdff1aSopenharmony_ci uint32_t row; 2617cabdff1aSopenharmony_ci v16u8 out; 2618cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 2619cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3; 2620cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2621cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 2622cabdff1aSopenharmony_ci v8i16 mask3, mask4, mask5; 2623cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2624cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2625cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2626cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2627cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 2628cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 2629cabdff1aSopenharmony_ci 2630cabdff1aSopenharmony_ci mask3 = mask0 + 4; 2631cabdff1aSopenharmony_ci mask4 = mask1 + 4; 2632cabdff1aSopenharmony_ci mask5 = mask2 + 4; 2633cabdff1aSopenharmony_ci 2634cabdff1aSopenharmony_ci 
src -= ((2 * stride) + 2); 2635cabdff1aSopenharmony_ci 2636cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2637cabdff1aSopenharmony_ci src += (5 * stride); 2638cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2639cabdff1aSopenharmony_ci 2640cabdff1aSopenharmony_ci for (row = 4; row--;) { 2641cabdff1aSopenharmony_ci LD_SB2(src, stride, src5, src6); 2642cabdff1aSopenharmony_ci src += (2 * stride); 2643cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 2644cabdff1aSopenharmony_ci 2645cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 2646cabdff1aSopenharmony_ci vt_res0, vt_res1); 2647cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 2648cabdff1aSopenharmony_ci vt_res2, vt_res3); 2649cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2650cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2651cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2652cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2653cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 2654cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 2655cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 2656cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 2657cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2658cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2659cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 2660cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 2661cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2662cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, 
hz_res1); 2663cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 2664cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 2665cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 2666cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 2667cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 2668cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 2669cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec8, 5); 2670cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec11, 5); 2671cabdff1aSopenharmony_ci SAT_SH4_SH(dst0, dst1, dst2, dst3, 7); 2672cabdff1aSopenharmony_ci dst0 = __msa_pckod_h(dst2, dst0); 2673cabdff1aSopenharmony_ci dst1 = __msa_pckod_h(dst3, dst1); 2674cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3); 2675cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst2, dst0); 2676cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst3, dst1); 2677cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst1); 2678cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, stride); 2679cabdff1aSopenharmony_ci dst += (2 * stride); 2680cabdff1aSopenharmony_ci 2681cabdff1aSopenharmony_ci src0 = src2; 2682cabdff1aSopenharmony_ci src1 = src3; 2683cabdff1aSopenharmony_ci src2 = src4; 2684cabdff1aSopenharmony_ci src3 = src5; 2685cabdff1aSopenharmony_ci src4 = src6; 2686cabdff1aSopenharmony_ci } 2687cabdff1aSopenharmony_ci} 2688cabdff1aSopenharmony_ci 2689cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, 2690cabdff1aSopenharmony_ci ptrdiff_t stride) 2691cabdff1aSopenharmony_ci{ 2692cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 2693cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 2694cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 2695cabdff1aSopenharmony_ci v16u8 out; 2696cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, 
src8; 2697cabdff1aSopenharmony_ci v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 2698cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l; 2699cabdff1aSopenharmony_ci v16i8 src76_l, src87_l, filt0, filt1, filt2; 2700cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 2701cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2702cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2703cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2704cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2705cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2706cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 2707cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 2708cabdff1aSopenharmony_ci v8i16 zeros = { 0 }; 2709cabdff1aSopenharmony_ci 2710cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 2711cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 2712cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 2713cabdff1aSopenharmony_ci 2714cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 2715cabdff1aSopenharmony_ci 2716cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2717cabdff1aSopenharmony_ci src += (5 * stride); 2718cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2719cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 2720cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 2721cabdff1aSopenharmony_ci 2722cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2723cabdff1aSopenharmony_ci src32_r, src43_r); 2724cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 2725cabdff1aSopenharmony_ci src76_r, src87_r); 2726cabdff1aSopenharmony_ci 
ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 2727cabdff1aSopenharmony_ci src32_l, src43_l); 2728cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 2729cabdff1aSopenharmony_ci src76_l, src87_l); 2730cabdff1aSopenharmony_ci vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 2731cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 2732cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 2733cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 2734cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2735cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2736cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2737cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2738cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2739cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2740cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2741cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2742cabdff1aSopenharmony_ci 2743cabdff1aSopenharmony_ci vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 2744cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 2745cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 2746cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 2747cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2748cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 
2749cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2750cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 2751cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2752cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 2753cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2754cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 2755cabdff1aSopenharmony_ci 2756cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res0, hz_res1, 10); 2757cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res0, hz_res1, 7); 2758cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res2, hz_res3, 10); 2759cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res2, hz_res3, 7); 2760cabdff1aSopenharmony_ci 2761cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 2762cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 2763cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec6, 5); 2764cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec7, 5); 2765cabdff1aSopenharmony_ci 2766cabdff1aSopenharmony_ci SAT_SH2_SH(dst0, dst1, 7); 2767cabdff1aSopenharmony_ci SAT_SH2_SH(dst2, dst3, 7); 2768cabdff1aSopenharmony_ci ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1); 2769cabdff1aSopenharmony_ci ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3); 2770cabdff1aSopenharmony_ci 2771cabdff1aSopenharmony_ci hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 2772cabdff1aSopenharmony_ci hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 2773cabdff1aSopenharmony_ci hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 2774cabdff1aSopenharmony_ci hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 2775cabdff1aSopenharmony_ci 2776cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 2777cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst2); 2778cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 2779cabdff1aSopenharmony_ci} 
2780cabdff1aSopenharmony_ci 2781cabdff1aSopenharmony_civoid ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, 2782cabdff1aSopenharmony_ci ptrdiff_t stride) 2783cabdff1aSopenharmony_ci{ 2784cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 2785cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 2786cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 2787cabdff1aSopenharmony_ci v16u8 out; 2788cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 2789cabdff1aSopenharmony_ci v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 2790cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l; 2791cabdff1aSopenharmony_ci v16i8 src76_l, src87_l, filt0, filt1, filt2; 2792cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 2793cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 2794cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 2795cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 2796cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 2797cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 2798cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 2799cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 2800cabdff1aSopenharmony_ci v8i16 zeros = { 0 }; 2801cabdff1aSopenharmony_ci 2802cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 2803cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 2804cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 2805cabdff1aSopenharmony_ci 2806cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 2807cabdff1aSopenharmony_ci 2808cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2809cabdff1aSopenharmony_ci src += (5 * stride); 2810cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 
2811cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 2812cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 2813cabdff1aSopenharmony_ci 2814cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 2815cabdff1aSopenharmony_ci src32_r, src43_r); 2816cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 2817cabdff1aSopenharmony_ci src76_r, src87_r); 2818cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 2819cabdff1aSopenharmony_ci src32_l, src43_l); 2820cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 2821cabdff1aSopenharmony_ci src76_l, src87_l); 2822cabdff1aSopenharmony_ci 2823cabdff1aSopenharmony_ci vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 2824cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 2825cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 2826cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 2827cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2828cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 2829cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2830cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 2831cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2832cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 2833cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2834cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 2835cabdff1aSopenharmony_ci 2836cabdff1aSopenharmony_ci vt_res0 = AVC_DOT_SH3_SH(src32_r, 
src54_r, src76_r, filt0, filt1, filt2); 2837cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 2838cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 2839cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 2840cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 2841cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 2842cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 2843cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 2844cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0); 2845cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 2846cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 2847cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 2848cabdff1aSopenharmony_ci 2849cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res0, hz_res1, 10); 2850cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res0, hz_res1, 7); 2851cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res2, hz_res3, 10); 2852cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res2, hz_res3, 7); 2853cabdff1aSopenharmony_ci 2854cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 2855cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 2856cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec6, 5); 2857cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec7, 5); 2858cabdff1aSopenharmony_ci 2859cabdff1aSopenharmony_ci SAT_SH2_SH(dst0, dst1, 7); 2860cabdff1aSopenharmony_ci SAT_SH2_SH(dst2, dst3, 7); 2861cabdff1aSopenharmony_ci 2862cabdff1aSopenharmony_ci dst0 = __msa_ilvod_h(zeros, dst0); 2863cabdff1aSopenharmony_ci dst1 = __msa_ilvod_h(zeros, dst1); 2864cabdff1aSopenharmony_ci dst2 = __msa_ilvod_h(zeros, dst2); 2865cabdff1aSopenharmony_ci dst3 = 
__msa_ilvod_h(zeros, dst3); 2866cabdff1aSopenharmony_ci 2867cabdff1aSopenharmony_ci hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 2868cabdff1aSopenharmony_ci hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 2869cabdff1aSopenharmony_ci hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 2870cabdff1aSopenharmony_ci hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 2871cabdff1aSopenharmony_ci 2872cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 2873cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst2); 2874cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 2875cabdff1aSopenharmony_ci} 2876cabdff1aSopenharmony_ci 2877cabdff1aSopenharmony_civoid ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, 2878cabdff1aSopenharmony_ci ptrdiff_t stride) 2879cabdff1aSopenharmony_ci{ 2880cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 2881cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 2882cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 2883cabdff1aSopenharmony_ci const uint8_t *src_tmp = src - (2 * stride) - 2; 2884cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst; 2885cabdff1aSopenharmony_ci uint32_t multiple8_cnt, loop_cnt; 2886cabdff1aSopenharmony_ci v16u8 out0, out1; 2887cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2; 2888cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2889cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3; 2890cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2891cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 2892cabdff1aSopenharmony_ci v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 2893cabdff1aSopenharmony_ci v8i16 hz_out87_l, filt0, filt1, filt2; 2894cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 2895cabdff1aSopenharmony_ci 2896cabdff1aSopenharmony_ci filt0 = (v8i16) 
__msa_fill_w(filt_const0); 2897cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 2898cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 2899cabdff1aSopenharmony_ci 2900cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 2901cabdff1aSopenharmony_ci 2902cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 2903cabdff1aSopenharmony_ci src = src_tmp; 2904cabdff1aSopenharmony_ci dst = dst_tmp; 2905cabdff1aSopenharmony_ci 2906cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 2907cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 2908cabdff1aSopenharmony_ci src += (5 * stride); 2909cabdff1aSopenharmony_ci 2910cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 2911cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 2912cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 2913cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 2914cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 2915cabdff1aSopenharmony_ci 2916cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 2917cabdff1aSopenharmony_ci LD_SB4(src, stride, src0, src1, src2, src3); 2918cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 2919cabdff1aSopenharmony_ci src += (4 * stride); 2920cabdff1aSopenharmony_ci 2921cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 2922cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 2923cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 2924cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 2925cabdff1aSopenharmony_ci 2926cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 2927cabdff1aSopenharmony_ci 
hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 2928cabdff1aSopenharmony_ci hz_out43_r); 2929cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 2930cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 2931cabdff1aSopenharmony_ci hz_out43_l); 2932cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 2933cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 2934cabdff1aSopenharmony_ci hz_out87_r); 2935cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 2936cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 2937cabdff1aSopenharmony_ci hz_out87_l); 2938cabdff1aSopenharmony_ci 2939cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 2940cabdff1aSopenharmony_ci filt1, filt2); 2941cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 2942cabdff1aSopenharmony_ci filt1, filt2); 2943cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2944cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 2945cabdff1aSopenharmony_ci filt1, filt2); 2946cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 2947cabdff1aSopenharmony_ci filt1, filt2); 2948cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2949cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 2950cabdff1aSopenharmony_ci filt1, filt2); 2951cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 2952cabdff1aSopenharmony_ci filt1, filt2); 2953cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2954cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 2955cabdff1aSopenharmony_ci filt1, filt2); 2956cabdff1aSopenharmony_ci tmp1 = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 2957cabdff1aSopenharmony_ci filt1, filt2); 2958cabdff1aSopenharmony_ci dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 2959cabdff1aSopenharmony_ci 2960cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 2961cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 2962cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 2963cabdff1aSopenharmony_ci dst += (4 * stride); 2964cabdff1aSopenharmony_ci 2965cabdff1aSopenharmony_ci hz_out0 = hz_out4; 2966cabdff1aSopenharmony_ci hz_out1 = hz_out5; 2967cabdff1aSopenharmony_ci hz_out2 = hz_out6; 2968cabdff1aSopenharmony_ci hz_out3 = hz_out7; 2969cabdff1aSopenharmony_ci hz_out4 = hz_out8; 2970cabdff1aSopenharmony_ci } 2971cabdff1aSopenharmony_ci 2972cabdff1aSopenharmony_ci src_tmp += 8; 2973cabdff1aSopenharmony_ci dst_tmp += 8; 2974cabdff1aSopenharmony_ci } 2975cabdff1aSopenharmony_ci} 2976cabdff1aSopenharmony_ci 2977cabdff1aSopenharmony_civoid ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, 2978cabdff1aSopenharmony_ci ptrdiff_t stride) 2979cabdff1aSopenharmony_ci{ 2980cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 2981cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 2982cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 2983cabdff1aSopenharmony_ci v16u8 out0, out1; 2984cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2; 2985cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 2986cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 2987cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 2988cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 2989cabdff1aSopenharmony_ci v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3; 2990cabdff1aSopenharmony_ci v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, 
hz_out54_l; 2991cabdff1aSopenharmony_ci v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 2992cabdff1aSopenharmony_ci v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 2993cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 2994cabdff1aSopenharmony_ci 2995cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 2996cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 2997cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 2998cabdff1aSopenharmony_ci 2999cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 3000cabdff1aSopenharmony_ci 3001cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 3002cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 3003cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3004cabdff1aSopenharmony_ci src += (5 * stride); 3005cabdff1aSopenharmony_ci 3006cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 3007cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 3008cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 3009cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 3010cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 3011cabdff1aSopenharmony_ci 3012cabdff1aSopenharmony_ci LD_SB4(src, stride, src0, src1, src2, src3); 3013cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 3014cabdff1aSopenharmony_ci src += (4 * stride); 3015cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 3016cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 3017cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 3018cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 3019cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, 
hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 3020cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 3021cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 3022cabdff1aSopenharmony_ci hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 3023cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 3024cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 3025cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 3026cabdff1aSopenharmony_ci hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 3027cabdff1aSopenharmony_ci 3028cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 3029cabdff1aSopenharmony_ci filt2); 3030cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 3031cabdff1aSopenharmony_ci filt2); 3032cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3033cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 3034cabdff1aSopenharmony_ci filt2); 3035cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 3036cabdff1aSopenharmony_ci filt2); 3037cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3038cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 3039cabdff1aSopenharmony_ci filt2); 3040cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 3041cabdff1aSopenharmony_ci filt2); 3042cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3043cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 3044cabdff1aSopenharmony_ci filt2); 3045cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, 
filt0, filt1, 3046cabdff1aSopenharmony_ci filt2); 3047cabdff1aSopenharmony_ci dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3048cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 3049cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 3050cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 3051cabdff1aSopenharmony_ci dst += (4 * stride); 3052cabdff1aSopenharmony_ci 3053cabdff1aSopenharmony_ci LD_SB4(src, stride, src0, src1, src2, src3); 3054cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 3055cabdff1aSopenharmony_ci hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 3056cabdff1aSopenharmony_ci hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 3057cabdff1aSopenharmony_ci hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 3058cabdff1aSopenharmony_ci hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 3059cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 3060cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 3061cabdff1aSopenharmony_ci hz_out1211_r); 3062cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 3063cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 3064cabdff1aSopenharmony_ci hz_out1211_l); 3065cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 3066cabdff1aSopenharmony_ci filt2); 3067cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 3068cabdff1aSopenharmony_ci filt2); 3069cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3070cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 3071cabdff1aSopenharmony_ci filt2); 3072cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 3073cabdff1aSopenharmony_ci filt2); 
3074cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3075cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 3076cabdff1aSopenharmony_ci filt2); 3077cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 3078cabdff1aSopenharmony_ci filt2); 3079cabdff1aSopenharmony_ci dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3080cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 3081cabdff1aSopenharmony_ci filt2); 3082cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 3083cabdff1aSopenharmony_ci filt2); 3084cabdff1aSopenharmony_ci dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 3085cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(dst0, dst1); 3086cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(dst2, dst3); 3087cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 3088cabdff1aSopenharmony_ci} 3089cabdff1aSopenharmony_ci

/*
 * 4x4 luma block, horizontal followed by vertical 6-tap filtering.
 * "put" variant: the result overwrites dst.
 *
 * dst    - destination pixels, 4 rows of 4 bytes are written
 * src    - source pixels; rows -2..+6 and columns from -2 are read
 * stride - byte distance between rows, used for both dst and src
 *
 * The 4-width masks (luma_mask_arr[48..]) take their second 8 lanes from the
 * second shuffle operand, so every AVC_HORZ_FILTER_SH call below filters two
 * source rows at once. The horizontal taps are 1, -5, 20, 20, -5, 1.
 */
void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    /* Vertical taps packed as halfword pairs (low, high) in each word. */
    const int32_t filt_const0 = 0xfffb0001; /* ( 1, -5) */
    const int32_t filt_const1 = 0x140014;   /* (20, 20) */
    const int32_t filt_const2 = 0x1fffb;    /* (-5,  1) */
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    /* Step back two rows and two columns: the 6-tap window needs them. */
    src -= ((2 * stride) + 2);

    /* 9 input rows are needed for 4 vertically filtered output rows. */
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    /* Even-numbered hz_out hold rows (n, n + 1) in low/high halves. */
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    /* Presumably moves the upper half (odd row) into its own vector. */
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    /* Interleave vertically adjacent rows so they pair up with filt0..2. */
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    res = PCKEV_XORI128_UB(dst0, dst1);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

/*
 * 16x16 luma block, horizontal 6-tap filter averaged with the source pixel
 * in the same column, then averaged with the existing dst ("avg" variant).
 *
 * dst    - destination pixels, read and rewritten (16 rows of 16 bytes)
 * src    - source pixels; columns from -2 are read on each row
 * stride - byte distance between rows, used for both dst and src
 *
 * Each loop iteration handles 4 rows. mask3..5 are the 8-width masks shifted
 * by 8 lanes so they produce the right-hand 8 outputs of a row from the
 * (srcN, srcN + 16 bytes) vector pair.
 */
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* Rows 0 and 1: outer taps summed, then -5 and +20 tap pairs. */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);

        /* Rows 2 and 3, same sequence. */
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);

        /*
         * Byte offset 2 from the adjusted pointer is the original src
         * column: these are the pixels the filtered value is averaged with.
         */
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
                   src0, src2, src4, src6);

        /* Taps sum to 32, hence the shift by 5. */
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);
        out2 = __msa_aver_s_b(out2, src4);
        out3 = __msa_aver_s_b(out3, src6);

        /* Undo the XOR-128 bias before mixing with the unsigned dst. */
        XORI_B4_128_SB(out0, out1, out2, out3);
        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

/*
 * 16x16 luma block, horizontal 6-tap filter averaged with the source pixel
 * one column to the right, then averaged with the existing dst.
 *
 * Identical to ff_avg_h264_qpel16_mc10_msa except for the byte offset used
 * to pick the source pixels for the first average (3 instead of 2).
 *
 * dst    - destination pixels, read and rewritten (16 rows of 16 bytes)
 * src    - source pixels; columns from -2 are read on each row
 * stride - byte distance between rows, used for both dst and src
 */
void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* Rows 0 and 1. */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);

        /* Rows 2 and 3. */
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);

        /* Byte offset 3 = original src column + 1. */
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
                   src0, src2, src4, src6);

        /* Taps sum to 32, hence the shift by 5. */
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);
        out2 = __msa_aver_s_b(out2, src4);
        out3 = __msa_aver_s_b(out3, src6);

        /* Undo the XOR-128 bias before mixing with the unsigned dst. */
        XORI_B4_128_SB(out0, out1, out2, out3);
        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

/*
 * 8x8 luma block, horizontal 6-tap filter averaged with the source pixel in
 * the same column, then averaged with the existing dst.
 *
 * dst    - destination pixels, read and rewritten (8 rows of 8 bytes)
 * src    - source pixels; each row is loaded starting at column -2
 * stride - byte distance between rows, used for both dst and src
 *
 * All 8 rows are processed in one pass, one row per input vector; filtered
 * rows are packed two per vector before the averaging steps.
 */
void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

    /* Rows 0..3: outer taps, then the -5 pair, then the +20 pair. */
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);

    /* Rows 4..7, same sequence. */
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);

    /*
     * Byte offset 2 from the (src - 2) load is the original src column;
     * the shifted rows are then packed two per vector to line up with the
     * packed filter output.
     */
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);

    /* Taps sum to 32, hence the shift by 5. */
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);

    /* Undo the XOR-128 bias before mixing with the unsigned dst. */
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);

    /* Gather the 8 existing dst rows, two 64-bit rows per vector. */
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

/*
 * 8x8 luma block, horizontal 6-tap filter averaged with the source pixel one
 * column to the right, then averaged with the existing dst.
 *
 * Identical to ff_avg_h264_qpel8_mc10_msa except for the byte offset used to
 * pick the source pixels for the first average (3 instead of 2).
 *
 * dst    - destination pixels, read and rewritten (8 rows of 8 bytes)
 * src    - source pixels; each row is loaded starting at column -2
 * stride - byte distance between rows, used for both dst and src
 */
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

    /* Rows 0..3. */
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);

    /* Rows 4..7. */
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);

    /* Byte offset 3 = original src column + 1; then pack two rows/vector. */
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);

    /* Taps sum to 32, hence the shift by 5. */
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);

    /* Undo the XOR-128 bias before mixing with the unsigned dst. */
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);

    /* Gather the 8 existing dst rows, two 64-bit rows per vector. */
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

/*
 * 4x4 luma block, horizontal 6-tap filter averaged with the source pixel in
 * the same column, then averaged with the existing dst.
 *
 * dst    - destination pixels, read and rewritten (4 rows of 4 bytes)
 * src    - source pixels; each row is loaded starting at column -2
 * stride - byte distance between rows, used for both dst and src
 *
 * The 4-width masks (luma_mask_arr[48..]) combine two rows per shuffle, so
 * the whole block fits in two halfword vectors (out0, out1).
 */
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 };
    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 out0, out1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);

    /* Taps sum to 32, hence the shift by 5. */
    SRARI_H2_SH(out0, out1, 5);
    SAT_SH2_SH(out0, out1, 7);
    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);

    /*
     * Byte offset 2 from the (src - 2) load is the original src column.
     * The first word of each shifted row is gathered into one vector, in
     * row order, to line up with the packed filter output.
     */
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);

    /* Undo the XOR-128 bias before mixing with the unsigned dst. */
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

/*
 * 4x4 luma block, horizontal 6-tap filter averaged with the source pixel one
 * column to the right, then averaged with the existing dst.
 *
 * Identical to ff_avg_h264_qpel4_mc10_msa except for the byte offset used to
 * pick the source pixels for the first average (3 instead of 2).
 *
 * dst    - destination pixels, read and rewritten (4 rows of 4 bytes)
 * src    - source pixels; each row is loaded starting at column -2
 * stride - byte distance between rows, used for both dst and src
 */
void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 };
    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 out0, out1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);

    /* Taps sum to 32, hence the shift by 5. */
    SRARI_H2_SH(out0, out1, 5);
    SAT_SH2_SH(out0, out1, 7);
    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);

    /* Byte offset 3 = original src column + 1; gather one word per row. */
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);

    /* Undo the XOR-128 bias before mixing with the unsigned dst. */
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

/*
 * 16x16 luma block, horizontal 6-tap filter, averaged with the existing dst.
 *
 * dst    - destination pixels, read and rewritten (16 rows of 16 bytes)
 * src    - source pixels; columns from -2 are read on each row
 * stride - byte distance between rows, used for both dst and src
 *
 * Each loop iteration handles 4 rows. A row is loaded as two vectors 8 bytes
 * apart, so the same 8-width masks yield the left and right 8 outputs.
 */
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        src += stride;
        LD_SB2(src, 8, src2, src3);
        src += stride;
        LD_SB2(src, 8, src4, src5);
        src += stride;
        LD_SB2(src, 8, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* Rows 0 and 1: outer taps summed, then -5 and +20 tap pairs. */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);

        /* Rows 2 and 3, same sequence. */
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);

        /* Taps sum to 32, hence the shift by 5. */
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
                    vec2, vec3);

        /* Undo the XOR-128 bias before mixing with the unsigned dst. */
        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
        AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

/*
 * 8x8 luma block, horizontal 6-tap filter, averaged with the existing dst.
 *
 * dst    - destination pixels, read and rewritten (8 rows of 8 bytes)
 * src    - source pixels; each row is loaded starting at column -2
 * stride - byte distance between rows, used for both dst and src
 *
 * out0/out1/out4/out5 carry the filtered rows (two rows per vector);
 * out2/out3/out6/out7 carry the existing dst rows they are averaged with.
 */
void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
    v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

    /* Rows 0..3: outer taps, then the -5 pair, then the +20 pair. */
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);

    /* Rows 4..7, same sequence. */
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);

    /* Taps sum to 32, hence the shift by 5. */
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    out4 = PCKEV_XORI128_UB(res4, res5);
    out5 = PCKEV_XORI128_UB(res6, res7);

    /* Gather the 8 existing dst rows, two 64-bit rows per vector. */
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out2);
    INSERT_D2_UB(tp2, tp3, out3);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out6);
    INSERT_D2_UB(tp2, tp3, out7);
    AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
    AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
    ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

/*
 * 4x4 luma block, horizontal 6-tap filter, averaged with the existing dst.
 *
 * dst    - destination pixels, read and rewritten (4 rows of 4 bytes)
 * src    - source pixels; each row is loaded starting at column -2
 * stride - byte distance between rows, used for both dst and src
 *
 * The 4-width masks (luma_mask_arr[48..]) combine two rows per shuffle, so
 * the whole block fits in two halfword vectors (res0, res1).
 */
void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);

    /* Taps sum to 32, hence the shift by 5. */
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

3641cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, 3642cabdff1aSopenharmony_ci ptrdiff_t stride) 3643cabdff1aSopenharmony_ci{ 3644cabdff1aSopenharmony_ci int32_t loop_cnt; 3645cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 3646cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 3647cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 3648cabdff1aSopenharmony_ci v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3649cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3650cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3651cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 3652cabdff1aSopenharmony_ci v16i8 src65_l, src87_l, filt0, filt1, filt2; 3653cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 3654cabdff1aSopenharmony_ci 3655cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 3656cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 3657cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 3658cabdff1aSopenharmony_ci 3659cabdff1aSopenharmony_ci src -= (stride * 2); 3660cabdff1aSopenharmony_ci 3661cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 3662cabdff1aSopenharmony_ci src += (5 * stride); 3663cabdff1aSopenharmony_ci 3664cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3665cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3666cabdff1aSopenharmony_ci src32_r, src43_r); 3667cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 3668cabdff1aSopenharmony_ci src32_l, src43_l); 3669cabdff1aSopenharmony_ci 3670cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 3671cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 3672cabdff1aSopenharmony_ci src += (4 * stride); 3673cabdff1aSopenharmony_ci 3674cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 3675cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 3676cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 3677cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 3678cabdff1aSopenharmony_ci src65_l, src76_l, src87_l); 3679cabdff1aSopenharmony_ci out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 3680cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 3681cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 3682cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 3683cabdff1aSopenharmony_ci out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 3684cabdff1aSopenharmony_ci out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 3685cabdff1aSopenharmony_ci out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 3686cabdff1aSopenharmony_ci out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 3687cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3688cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 3689cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 3690cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 3691cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 3692cabdff1aSopenharmony_ci out3_r, res0, res1, res2, res3); 3693cabdff1aSopenharmony_ci res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2); 3694cabdff1aSopenharmony_ci res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3); 3695cabdff1aSopenharmony_ci res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4); 3696cabdff1aSopenharmony_ci res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5); 3697cabdff1aSopenharmony_ci LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 3698cabdff1aSopenharmony_ci XORI_B4_128_UB(res0, res1, res2, res3); 3699cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1); 3700cabdff1aSopenharmony_ci AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3); 3701cabdff1aSopenharmony_ci ST_UB4(dst0, dst1, dst2, dst3, dst, stride); 3702cabdff1aSopenharmony_ci dst += (4 * stride); 3703cabdff1aSopenharmony_ci 
3704cabdff1aSopenharmony_ci src10_r = src54_r; 3705cabdff1aSopenharmony_ci src32_r = src76_r; 3706cabdff1aSopenharmony_ci src21_r = src65_r; 3707cabdff1aSopenharmony_ci src43_r = src87_r; 3708cabdff1aSopenharmony_ci src10_l = src54_l; 3709cabdff1aSopenharmony_ci src32_l = src76_l; 3710cabdff1aSopenharmony_ci src21_l = src65_l; 3711cabdff1aSopenharmony_ci src43_l = src87_l; 3712cabdff1aSopenharmony_ci src2 = src6; 3713cabdff1aSopenharmony_ci src3 = src7; 3714cabdff1aSopenharmony_ci src4 = src8; 3715cabdff1aSopenharmony_ci } 3716cabdff1aSopenharmony_ci} 3717cabdff1aSopenharmony_ci 3718cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, 3719cabdff1aSopenharmony_ci ptrdiff_t stride) 3720cabdff1aSopenharmony_ci{ 3721cabdff1aSopenharmony_ci int32_t loop_cnt; 3722cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 3723cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 3724cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 3725cabdff1aSopenharmony_ci v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; 3726cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3727cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3728cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 3729cabdff1aSopenharmony_ci v16i8 src65_l, src87_l, filt0, filt1, filt2; 3730cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 3731cabdff1aSopenharmony_ci 3732cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 3733cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 3734cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 3735cabdff1aSopenharmony_ci 3736cabdff1aSopenharmony_ci src -= (stride * 2); 3737cabdff1aSopenharmony_ci 3738cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 3739cabdff1aSopenharmony_ci src += (5 * stride); 
3740cabdff1aSopenharmony_ci 3741cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3742cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3743cabdff1aSopenharmony_ci src32_r, src43_r); 3744cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 3745cabdff1aSopenharmony_ci src32_l, src43_l); 3746cabdff1aSopenharmony_ci 3747cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 3748cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 3749cabdff1aSopenharmony_ci src += (4 * stride); 3750cabdff1aSopenharmony_ci 3751cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 3752cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 3753cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 3754cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 3755cabdff1aSopenharmony_ci src65_l, src76_l, src87_l); 3756cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 3757cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 3758cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 3759cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 3760cabdff1aSopenharmony_ci out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 3761cabdff1aSopenharmony_ci out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 3762cabdff1aSopenharmony_ci out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 3763cabdff1aSopenharmony_ci out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 3764cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3765cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 
3766cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 3767cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 3768cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 3769cabdff1aSopenharmony_ci out3_r, res0, res1, res2, res3); 3770cabdff1aSopenharmony_ci res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3); 3771cabdff1aSopenharmony_ci res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4); 3772cabdff1aSopenharmony_ci res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5); 3773cabdff1aSopenharmony_ci res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6); 3774cabdff1aSopenharmony_ci LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 3775cabdff1aSopenharmony_ci XORI_B4_128_UB(res0, res1, res2, res3); 3776cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1); 3777cabdff1aSopenharmony_ci AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3); 3778cabdff1aSopenharmony_ci ST_UB4(dst0, dst1, dst2, dst3, dst, stride); 3779cabdff1aSopenharmony_ci dst += (4 * stride); 3780cabdff1aSopenharmony_ci 3781cabdff1aSopenharmony_ci src10_r = src54_r; 3782cabdff1aSopenharmony_ci src32_r = src76_r; 3783cabdff1aSopenharmony_ci src21_r = src65_r; 3784cabdff1aSopenharmony_ci src43_r = src87_r; 3785cabdff1aSopenharmony_ci src10_l = src54_l; 3786cabdff1aSopenharmony_ci src32_l = src76_l; 3787cabdff1aSopenharmony_ci src21_l = src65_l; 3788cabdff1aSopenharmony_ci src43_l = src87_l; 3789cabdff1aSopenharmony_ci src3 = src7; 3790cabdff1aSopenharmony_ci src4 = src8; 3791cabdff1aSopenharmony_ci } 3792cabdff1aSopenharmony_ci} 3793cabdff1aSopenharmony_ci 3794cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, 3795cabdff1aSopenharmony_ci ptrdiff_t stride) 3796cabdff1aSopenharmony_ci{ 3797cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 3798cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 3799cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 3800cabdff1aSopenharmony_ci 
const int16_t filt_const2 = 0x1fb; 3801cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 3802cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12; 3803cabdff1aSopenharmony_ci v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r; 3804cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 3805cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, out0, out1, out2, out3; 3806cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 3807cabdff1aSopenharmony_ci 3808cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 3809cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 3810cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 3811cabdff1aSopenharmony_ci 3812cabdff1aSopenharmony_ci src -= (stride * 2); 3813cabdff1aSopenharmony_ci 3814cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 3815cabdff1aSopenharmony_ci src += (5 * stride); 3816cabdff1aSopenharmony_ci 3817cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3818cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3819cabdff1aSopenharmony_ci src32_r, src43_r); 3820cabdff1aSopenharmony_ci LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14); 3821cabdff1aSopenharmony_ci XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14); 3822cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 3823cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 3824cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 3825cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 3826cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 3827cabdff1aSopenharmony_ci 
out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 3828cabdff1aSopenharmony_ci PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1); 3829cabdff1aSopenharmony_ci ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r, 3830cabdff1aSopenharmony_ci src21_r, src32_r, src43_r); 3831cabdff1aSopenharmony_ci out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2); 3832cabdff1aSopenharmony_ci out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2); 3833cabdff1aSopenharmony_ci out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2); 3834cabdff1aSopenharmony_ci out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2); 3835cabdff1aSopenharmony_ci PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3); 3836cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3837cabdff1aSopenharmony_ci SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 3838cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 3839cabdff1aSopenharmony_ci SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 3840cabdff1aSopenharmony_ci 3841cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 3842cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 3843cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 3844cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 3845cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 3846cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 3847cabdff1aSopenharmony_ci 3848cabdff1aSopenharmony_ci PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1); 3849cabdff1aSopenharmony_ci PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 3850cabdff1aSopenharmony_ci out0 = __msa_aver_s_b(out0, tmp0); 3851cabdff1aSopenharmony_ci out1 = __msa_aver_s_b(out1, tmp1); 3852cabdff1aSopenharmony_ci out2 = __msa_aver_s_b(out2, tmp2); 3853cabdff1aSopenharmony_ci out3 = __msa_aver_s_b(out3, tmp3); 3854cabdff1aSopenharmony_ci 
XORI_B4_128_SB(out0, out1, out2, out3); 3855cabdff1aSopenharmony_ci AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1, 3856cabdff1aSopenharmony_ci dst2, dst3); 3857cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 3858cabdff1aSopenharmony_ci} 3859cabdff1aSopenharmony_ci 3860cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, 3861cabdff1aSopenharmony_ci ptrdiff_t stride) 3862cabdff1aSopenharmony_ci{ 3863cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 3864cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 3865cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 3866cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 3867cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 3868cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12; 3869cabdff1aSopenharmony_ci v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r; 3870cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 3871cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2, out0, out1, out2, out3; 3872cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 3873cabdff1aSopenharmony_ci 3874cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 3875cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 3876cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 3877cabdff1aSopenharmony_ci 3878cabdff1aSopenharmony_ci src -= (stride * 2); 3879cabdff1aSopenharmony_ci 3880cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 3881cabdff1aSopenharmony_ci src += (5 * stride); 3882cabdff1aSopenharmony_ci 3883cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 3884cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 
3885cabdff1aSopenharmony_ci src32_r, src43_r); 3886cabdff1aSopenharmony_ci LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14); 3887cabdff1aSopenharmony_ci XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14); 3888cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 3889cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 3890cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 3891cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 3892cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 3893cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 3894cabdff1aSopenharmony_ci PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1); 3895cabdff1aSopenharmony_ci ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r, 3896cabdff1aSopenharmony_ci src21_r, src32_r, src43_r); 3897cabdff1aSopenharmony_ci out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2); 3898cabdff1aSopenharmony_ci out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2); 3899cabdff1aSopenharmony_ci out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2); 3900cabdff1aSopenharmony_ci out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2); 3901cabdff1aSopenharmony_ci PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3); 3902cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 3903cabdff1aSopenharmony_ci SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 3904cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 3905cabdff1aSopenharmony_ci SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 3906cabdff1aSopenharmony_ci 3907cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 3908cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 
3909cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 3910cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 3911cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 3912cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 3913cabdff1aSopenharmony_ci 3914cabdff1aSopenharmony_ci PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1); 3915cabdff1aSopenharmony_ci PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3); 3916cabdff1aSopenharmony_ci out0 = __msa_aver_s_b(out0, tmp0); 3917cabdff1aSopenharmony_ci out1 = __msa_aver_s_b(out1, tmp1); 3918cabdff1aSopenharmony_ci out2 = __msa_aver_s_b(out2, tmp2); 3919cabdff1aSopenharmony_ci out3 = __msa_aver_s_b(out3, tmp3); 3920cabdff1aSopenharmony_ci XORI_B4_128_SB(out0, out1, out2, out3); 3921cabdff1aSopenharmony_ci AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1, 3922cabdff1aSopenharmony_ci dst2, dst3); 3923cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 3924cabdff1aSopenharmony_ci} 3925cabdff1aSopenharmony_ci 3926cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, 3927cabdff1aSopenharmony_ci ptrdiff_t stride) 3928cabdff1aSopenharmony_ci{ 3929cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 3930cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 3931cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 3932cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 3933cabdff1aSopenharmony_ci v16u8 res, dst0 = { 0 }; 3934cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3935cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3936cabdff1aSopenharmony_ci v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 3937cabdff1aSopenharmony_ci v8i16 out10, out32; 3938cabdff1aSopenharmony_ci 3939cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 3940cabdff1aSopenharmony_ci filt1 = (v16i8) 
__msa_fill_h(filt_const1); 3941cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 3942cabdff1aSopenharmony_ci 3943cabdff1aSopenharmony_ci src -= (stride * 2); 3944cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 3945cabdff1aSopenharmony_ci src += (5 * stride); 3946cabdff1aSopenharmony_ci 3947cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3948cabdff1aSopenharmony_ci src32_r, src43_r); 3949cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 3950cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 3951cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 3952cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 3953cabdff1aSopenharmony_ci src76_r, src87_r); 3954cabdff1aSopenharmony_ci ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 3955cabdff1aSopenharmony_ci XORI_B2_128_SB(src6554, src8776); 3956cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); 3957cabdff1aSopenharmony_ci src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5); 3958cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 3959cabdff1aSopenharmony_ci out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 3960cabdff1aSopenharmony_ci out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 3961cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 5); 3962cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 3963cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 3964cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 3965cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(out10, out32); 3966cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, (v16u8) src32_r); 3967cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(res, dst0); 3968cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 
3, dst, stride); 3969cabdff1aSopenharmony_ci} 3970cabdff1aSopenharmony_ci 3971cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, 3972cabdff1aSopenharmony_ci ptrdiff_t stride) 3973cabdff1aSopenharmony_ci{ 3974cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 3975cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 3976cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 3977cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 3978cabdff1aSopenharmony_ci v16u8 res, dst0 = { 0 }; 3979cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3980cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 3981cabdff1aSopenharmony_ci v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 3982cabdff1aSopenharmony_ci v8i16 out10, out32; 3983cabdff1aSopenharmony_ci 3984cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 3985cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 3986cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 3987cabdff1aSopenharmony_ci 3988cabdff1aSopenharmony_ci src -= (stride * 2); 3989cabdff1aSopenharmony_ci 3990cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 3991cabdff1aSopenharmony_ci src += (5 * stride); 3992cabdff1aSopenharmony_ci 3993cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 3994cabdff1aSopenharmony_ci src32_r, src43_r); 3995cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 3996cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 3997cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 3998cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 3999cabdff1aSopenharmony_ci src76_r, src87_r); 4000cabdff1aSopenharmony_ci ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 
4001cabdff1aSopenharmony_ci XORI_B2_128_SB(src6554, src8776); 4002cabdff1aSopenharmony_ci out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 4003cabdff1aSopenharmony_ci out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 4004cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 5); 4005cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 4006cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 4007cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 4008cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(out10, out32); 4009cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4); 4010cabdff1aSopenharmony_ci src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6); 4011cabdff1aSopenharmony_ci src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r); 4012cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, (v16u8) src32_r); 4013cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(res, dst0); 4014cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, stride); 4015cabdff1aSopenharmony_ci} 4016cabdff1aSopenharmony_ci 4017cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, 4018cabdff1aSopenharmony_ci ptrdiff_t stride) 4019cabdff1aSopenharmony_ci{ 4020cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2, 4021cabdff1aSopenharmony_ci src - (stride * 2), 4022cabdff1aSopenharmony_ci dst, stride); 4023cabdff1aSopenharmony_ci} 4024cabdff1aSopenharmony_ci 4025cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, 4026cabdff1aSopenharmony_ci ptrdiff_t stride) 4027cabdff1aSopenharmony_ci{ 4028cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2, 4029cabdff1aSopenharmony_ci src - (stride * 2) + 4030cabdff1aSopenharmony_ci sizeof(uint8_t), 4031cabdff1aSopenharmony_ci dst, stride); 4032cabdff1aSopenharmony_ci} 4033cabdff1aSopenharmony_ci 4034cabdff1aSopenharmony_civoid 
ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, 4035cabdff1aSopenharmony_ci ptrdiff_t stride) 4036cabdff1aSopenharmony_ci{ 4037cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2, 4038cabdff1aSopenharmony_ci src - (stride * 2), 4039cabdff1aSopenharmony_ci dst, stride); 4040cabdff1aSopenharmony_ci} 4041cabdff1aSopenharmony_ci 4042cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, 4043cabdff1aSopenharmony_ci ptrdiff_t stride) 4044cabdff1aSopenharmony_ci{ 4045cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2, 4046cabdff1aSopenharmony_ci src - (stride * 2) + 4047cabdff1aSopenharmony_ci sizeof(uint8_t), 4048cabdff1aSopenharmony_ci dst, stride); 4049cabdff1aSopenharmony_ci} 4050cabdff1aSopenharmony_ci 4051cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, 4052cabdff1aSopenharmony_ci ptrdiff_t stride) 4053cabdff1aSopenharmony_ci{ 4054cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, 4055cabdff1aSopenharmony_ci src - (stride * 2), 4056cabdff1aSopenharmony_ci dst, stride); 4057cabdff1aSopenharmony_ci} 4058cabdff1aSopenharmony_ci 4059cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, 4060cabdff1aSopenharmony_ci ptrdiff_t stride) 4061cabdff1aSopenharmony_ci{ 4062cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, 4063cabdff1aSopenharmony_ci src - (stride * 2) + 4064cabdff1aSopenharmony_ci sizeof(uint8_t), dst, stride); 4065cabdff1aSopenharmony_ci} 4066cabdff1aSopenharmony_ci 4067cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, 4068cabdff1aSopenharmony_ci ptrdiff_t stride) 4069cabdff1aSopenharmony_ci{ 4070cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, 4071cabdff1aSopenharmony_ci src - (stride * 2), 4072cabdff1aSopenharmony_ci dst, stride); 4073cabdff1aSopenharmony_ci} 
4074cabdff1aSopenharmony_ci 4075cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, 4076cabdff1aSopenharmony_ci ptrdiff_t stride) 4077cabdff1aSopenharmony_ci{ 4078cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, 4079cabdff1aSopenharmony_ci src - (stride * 2) + 4080cabdff1aSopenharmony_ci sizeof(uint8_t), dst, stride); 4081cabdff1aSopenharmony_ci} 4082cabdff1aSopenharmony_ci 4083cabdff1aSopenharmony_ci 4084cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, 4085cabdff1aSopenharmony_ci ptrdiff_t stride) 4086cabdff1aSopenharmony_ci{ 4087cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, 4088cabdff1aSopenharmony_ci src - (stride * 2), 4089cabdff1aSopenharmony_ci dst, stride); 4090cabdff1aSopenharmony_ci} 4091cabdff1aSopenharmony_ci 4092cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, 4093cabdff1aSopenharmony_ci ptrdiff_t stride) 4094cabdff1aSopenharmony_ci{ 4095cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, 4096cabdff1aSopenharmony_ci src - (stride * 2) + 4097cabdff1aSopenharmony_ci sizeof(uint8_t), dst, stride); 4098cabdff1aSopenharmony_ci} 4099cabdff1aSopenharmony_ci 4100cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, 4101cabdff1aSopenharmony_ci ptrdiff_t stride) 4102cabdff1aSopenharmony_ci{ 4103cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, 4104cabdff1aSopenharmony_ci src - (stride * 2), 4105cabdff1aSopenharmony_ci dst, stride); 4106cabdff1aSopenharmony_ci} 4107cabdff1aSopenharmony_ci 4108cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, 4109cabdff1aSopenharmony_ci ptrdiff_t stride) 4110cabdff1aSopenharmony_ci{ 4111cabdff1aSopenharmony_ci avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, 4112cabdff1aSopenharmony_ci src - (stride * 2) + 4113cabdff1aSopenharmony_ci 
sizeof(uint8_t), dst, stride); 4114cabdff1aSopenharmony_ci} 4115cabdff1aSopenharmony_ci 4116cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, 4117cabdff1aSopenharmony_ci ptrdiff_t stride) 4118cabdff1aSopenharmony_ci{ 4119cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 4120cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst; 4121cabdff1aSopenharmony_ci const uint8_t *src_tmp = src - (2 * stride) - 2; 4122cabdff1aSopenharmony_ci uint32_t multiple8_cnt, loop_cnt; 4123cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 4124cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 4125cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 4126cabdff1aSopenharmony_ci v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 4127cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 4128cabdff1aSopenharmony_ci v16i8 mask2; 4129cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4130cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4131cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4132cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 4133cabdff1aSopenharmony_ci v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 4134cabdff1aSopenharmony_ci v8i16 hz_out87_l, filt0, filt1, filt2; 4135cabdff1aSopenharmony_ci v4i32 tmp0_w, tmp1_w; 4136cabdff1aSopenharmony_ci 4137cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 4138cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 4139cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 4140cabdff1aSopenharmony_ci 4141cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4142cabdff1aSopenharmony_ci 4143cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 4144cabdff1aSopenharmony_ci dst = 
dst_tmp; 4145cabdff1aSopenharmony_ci src = src_tmp; 4146cabdff1aSopenharmony_ci 4147cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 4148cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4149cabdff1aSopenharmony_ci src += (5 * stride); 4150cabdff1aSopenharmony_ci 4151cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4152cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4153cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4154cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4155cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4156cabdff1aSopenharmony_ci 4157cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 4158cabdff1aSopenharmony_ci LD_SB2(src, stride, src5, src6); 4159cabdff1aSopenharmony_ci src += (2 * stride); 4160cabdff1aSopenharmony_ci 4161cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 4162cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4163cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4164cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4165cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 4166cabdff1aSopenharmony_ci hz_out43_r); 4167cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4168cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 4169cabdff1aSopenharmony_ci hz_out43_l); 4170cabdff1aSopenharmony_ci ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, 4171cabdff1aSopenharmony_ci hz_out65_r); 4172cabdff1aSopenharmony_ci ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, 4173cabdff1aSopenharmony_ci hz_out65_l); 4174cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, 
hz_out54_r, filt0, 4175cabdff1aSopenharmony_ci filt1, filt2); 4176cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 4177cabdff1aSopenharmony_ci filt1, filt2); 4178cabdff1aSopenharmony_ci tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4179cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 4180cabdff1aSopenharmony_ci filt1, filt2); 4181cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 4182cabdff1aSopenharmony_ci filt1, filt2); 4183cabdff1aSopenharmony_ci tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4184cabdff1aSopenharmony_ci 4185cabdff1aSopenharmony_ci tmp1 = __msa_srari_h(hz_out2, 5); 4186cabdff1aSopenharmony_ci tmp3 = __msa_srari_h(hz_out3, 5); 4187cabdff1aSopenharmony_ci SAT_SH2_SH(tmp1, tmp3, 7); 4188cabdff1aSopenharmony_ci 4189cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp0, tmp1); 4190cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp2, tmp3); 4191cabdff1aSopenharmony_ci 4192cabdff1aSopenharmony_ci LD2(dst, stride, tp0, tp1); 4193cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 4194cabdff1aSopenharmony_ci 4195cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4196cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(out0, dst0); 4197cabdff1aSopenharmony_ci ST_D2(dst0, 0, 1, dst, stride); 4198cabdff1aSopenharmony_ci dst += (2 * stride); 4199cabdff1aSopenharmony_ci 4200cabdff1aSopenharmony_ci LD_SB2(src, stride, src7, src8); 4201cabdff1aSopenharmony_ci src += (2 * stride); 4202cabdff1aSopenharmony_ci 4203cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 4204cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4205cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4206cabdff1aSopenharmony_ci ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r, 4207cabdff1aSopenharmony_ci hz_out87_r); 4208cabdff1aSopenharmony_ci ILVL_H2_SH(hz_out7, hz_out6, 
hz_out8, hz_out7, hz_out76_l, 4209cabdff1aSopenharmony_ci hz_out87_l); 4210cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 4211cabdff1aSopenharmony_ci filt1, filt2); 4212cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 4213cabdff1aSopenharmony_ci filt1, filt2); 4214cabdff1aSopenharmony_ci tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4215cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 4216cabdff1aSopenharmony_ci filt1, filt2); 4217cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 4218cabdff1aSopenharmony_ci filt1, filt2); 4219cabdff1aSopenharmony_ci tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4220cabdff1aSopenharmony_ci 4221cabdff1aSopenharmony_ci tmp5 = __msa_srari_h(hz_out4, 5); 4222cabdff1aSopenharmony_ci tmp7 = __msa_srari_h(hz_out5, 5); 4223cabdff1aSopenharmony_ci SAT_SH2_SH(tmp5, tmp7, 7); 4224cabdff1aSopenharmony_ci 4225cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_h(tmp4, tmp5); 4226cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_h(tmp6, tmp7); 4227cabdff1aSopenharmony_ci 4228cabdff1aSopenharmony_ci LD2(dst, stride, tp2, tp3); 4229cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 4230cabdff1aSopenharmony_ci 4231cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4232cabdff1aSopenharmony_ci dst1 = __msa_aver_u_b(out1, dst1); 4233cabdff1aSopenharmony_ci ST_D2(dst1, 0, 1, dst, stride); 4234cabdff1aSopenharmony_ci dst += (2 * stride); 4235cabdff1aSopenharmony_ci 4236cabdff1aSopenharmony_ci hz_out0 = hz_out4; 4237cabdff1aSopenharmony_ci hz_out1 = hz_out5; 4238cabdff1aSopenharmony_ci hz_out2 = hz_out6; 4239cabdff1aSopenharmony_ci hz_out3 = hz_out7; 4240cabdff1aSopenharmony_ci hz_out4 = hz_out8; 4241cabdff1aSopenharmony_ci } 4242cabdff1aSopenharmony_ci 4243cabdff1aSopenharmony_ci src_tmp += 8; 4244cabdff1aSopenharmony_ci dst_tmp += 8; 4245cabdff1aSopenharmony_ci } 
4246cabdff1aSopenharmony_ci} 4247cabdff1aSopenharmony_ci 4248cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, 4249cabdff1aSopenharmony_ci ptrdiff_t stride) 4250cabdff1aSopenharmony_ci{ 4251cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 4252cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst; 4253cabdff1aSopenharmony_ci const uint8_t *src_tmp = src - (2 * stride) - 2; 4254cabdff1aSopenharmony_ci uint32_t multiple8_cnt, loop_cnt; 4255cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 4256cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 4257cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 4258cabdff1aSopenharmony_ci v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 4259cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1; 4260cabdff1aSopenharmony_ci v16i8 mask2; 4261cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4262cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4263cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4264cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 4265cabdff1aSopenharmony_ci v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 4266cabdff1aSopenharmony_ci v8i16 hz_out87_l, filt0, filt1, filt2; 4267cabdff1aSopenharmony_ci v4i32 tmp0_w, tmp1_w; 4268cabdff1aSopenharmony_ci 4269cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 4270cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 4271cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 4272cabdff1aSopenharmony_ci 4273cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4274cabdff1aSopenharmony_ci 4275cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 4276cabdff1aSopenharmony_ci dst = dst_tmp; 
4277cabdff1aSopenharmony_ci src = src_tmp; 4278cabdff1aSopenharmony_ci 4279cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 4280cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4281cabdff1aSopenharmony_ci src += (5 * stride); 4282cabdff1aSopenharmony_ci 4283cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4284cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4285cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4286cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4287cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4288cabdff1aSopenharmony_ci 4289cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 4290cabdff1aSopenharmony_ci LD_SB2(src, stride, src5, src6); 4291cabdff1aSopenharmony_ci src += (2 * stride); 4292cabdff1aSopenharmony_ci 4293cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 4294cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4295cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4296cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4297cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 4298cabdff1aSopenharmony_ci hz_out43_r); 4299cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 4300cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 4301cabdff1aSopenharmony_ci hz_out43_l); 4302cabdff1aSopenharmony_ci ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r); 4303cabdff1aSopenharmony_ci ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l); 4304cabdff1aSopenharmony_ci 4305cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 
4306cabdff1aSopenharmony_ci filt1, filt2); 4307cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 4308cabdff1aSopenharmony_ci filt1, filt2); 4309cabdff1aSopenharmony_ci tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4310cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 4311cabdff1aSopenharmony_ci filt1, filt2); 4312cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 4313cabdff1aSopenharmony_ci filt1, filt2); 4314cabdff1aSopenharmony_ci tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4315cabdff1aSopenharmony_ci 4316cabdff1aSopenharmony_ci tmp1 = __msa_srari_h(hz_out3, 5); 4317cabdff1aSopenharmony_ci tmp3 = __msa_srari_h(hz_out4, 5); 4318cabdff1aSopenharmony_ci SAT_SH2_SH(tmp1, tmp3, 7); 4319cabdff1aSopenharmony_ci 4320cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp0, tmp1); 4321cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp2, tmp3); 4322cabdff1aSopenharmony_ci 4323cabdff1aSopenharmony_ci LD2(dst, stride, tp0, tp1); 4324cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 4325cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4326cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(out0, dst0); 4327cabdff1aSopenharmony_ci ST_D2(dst0, 0, 1, dst, stride); 4328cabdff1aSopenharmony_ci dst += (2 * stride); 4329cabdff1aSopenharmony_ci 4330cabdff1aSopenharmony_ci LD_SB2(src, stride, src7, src8); 4331cabdff1aSopenharmony_ci src += (2 * stride); 4332cabdff1aSopenharmony_ci 4333cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 4334cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4335cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4336cabdff1aSopenharmony_ci ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r, 4337cabdff1aSopenharmony_ci hz_out87_r); 4338cabdff1aSopenharmony_ci ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l, 
4339cabdff1aSopenharmony_ci hz_out87_l); 4340cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 4341cabdff1aSopenharmony_ci filt1, filt2); 4342cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 4343cabdff1aSopenharmony_ci filt1, filt2); 4344cabdff1aSopenharmony_ci tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4345cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 4346cabdff1aSopenharmony_ci filt1, filt2); 4347cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 4348cabdff1aSopenharmony_ci filt1, filt2); 4349cabdff1aSopenharmony_ci tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4350cabdff1aSopenharmony_ci 4351cabdff1aSopenharmony_ci tmp5 = __msa_srari_h(hz_out5, 5); 4352cabdff1aSopenharmony_ci tmp7 = __msa_srari_h(hz_out6, 5); 4353cabdff1aSopenharmony_ci SAT_SH2_SH(tmp5, tmp7, 7); 4354cabdff1aSopenharmony_ci 4355cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_h(tmp4, tmp5); 4356cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_h(tmp6, tmp7); 4357cabdff1aSopenharmony_ci 4358cabdff1aSopenharmony_ci LD2(dst, stride, tp2, tp3); 4359cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 4360cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4361cabdff1aSopenharmony_ci dst1 = __msa_aver_u_b(out1, dst1); 4362cabdff1aSopenharmony_ci ST_D2(dst1, 0, 1, dst, stride); 4363cabdff1aSopenharmony_ci dst += (2 * stride); 4364cabdff1aSopenharmony_ci 4365cabdff1aSopenharmony_ci hz_out0 = hz_out4; 4366cabdff1aSopenharmony_ci hz_out1 = hz_out5; 4367cabdff1aSopenharmony_ci hz_out2 = hz_out6; 4368cabdff1aSopenharmony_ci hz_out3 = hz_out7; 4369cabdff1aSopenharmony_ci hz_out4 = hz_out8; 4370cabdff1aSopenharmony_ci } 4371cabdff1aSopenharmony_ci 4372cabdff1aSopenharmony_ci src_tmp += 8; 4373cabdff1aSopenharmony_ci dst_tmp += 8; 4374cabdff1aSopenharmony_ci } 4375cabdff1aSopenharmony_ci} 4376cabdff1aSopenharmony_ci 
4377cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, 4378cabdff1aSopenharmony_ci ptrdiff_t stride) 4379cabdff1aSopenharmony_ci{ 4380cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 4381cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 4382cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 4383cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 4384cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1; 4385cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4386cabdff1aSopenharmony_ci v16i8 src11, src12, mask0, mask1, mask2; 4387cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4388cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 4389cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4390cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 4391cabdff1aSopenharmony_ci v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3; 4392cabdff1aSopenharmony_ci v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 4393cabdff1aSopenharmony_ci v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 4394cabdff1aSopenharmony_ci v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 4395cabdff1aSopenharmony_ci v4i32 tmp0_w, tmp1_w; 4396cabdff1aSopenharmony_ci 4397cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4398cabdff1aSopenharmony_ci 4399cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 4400cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 4401cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 4402cabdff1aSopenharmony_ci 4403cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 4404cabdff1aSopenharmony_ci 4405cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 
4406cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4407cabdff1aSopenharmony_ci src += (5 * stride); 4408cabdff1aSopenharmony_ci 4409cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4410cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4411cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4412cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4413cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4414cabdff1aSopenharmony_ci 4415cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 4416cabdff1aSopenharmony_ci src += (4 * stride); 4417cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 4418cabdff1aSopenharmony_ci 4419cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4420cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4421cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4422cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4423cabdff1aSopenharmony_ci 4424cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4425cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4426cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4427cabdff1aSopenharmony_ci hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 4428cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4429cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4430cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4431cabdff1aSopenharmony_ci hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 
4432cabdff1aSopenharmony_ci 4433cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4434cabdff1aSopenharmony_ci filt2); 4435cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 4436cabdff1aSopenharmony_ci filt2); 4437cabdff1aSopenharmony_ci tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4438cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4439cabdff1aSopenharmony_ci filt2); 4440cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 4441cabdff1aSopenharmony_ci filt2); 4442cabdff1aSopenharmony_ci tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4443cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4444cabdff1aSopenharmony_ci filt2); 4445cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 4446cabdff1aSopenharmony_ci filt2); 4447cabdff1aSopenharmony_ci tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4448cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4449cabdff1aSopenharmony_ci filt2); 4450cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 4451cabdff1aSopenharmony_ci filt2); 4452cabdff1aSopenharmony_ci tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4453cabdff1aSopenharmony_ci 4454cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5); 4455cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7); 4456cabdff1aSopenharmony_ci 4457cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 4458cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 4459cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 4460cabdff1aSopenharmony_ci 4461cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp0, hz_out2); 4462cabdff1aSopenharmony_ci tmp1 = 
__msa_aver_s_h(tmp1, hz_out3); 4463cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_h(tmp2, hz_out4); 4464cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_h(tmp3, hz_out5); 4465cabdff1aSopenharmony_ci 4466cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4467cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4468cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4469cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 4470cabdff1aSopenharmony_ci dst += (4 * stride); 4471cabdff1aSopenharmony_ci 4472cabdff1aSopenharmony_ci LD_SB4(src, stride, src9, src10, src11, src12); 4473cabdff1aSopenharmony_ci XORI_B4_128_SB(src9, src10, src11, src12); 4474cabdff1aSopenharmony_ci hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 4475cabdff1aSopenharmony_ci hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 4476cabdff1aSopenharmony_ci hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 4477cabdff1aSopenharmony_ci hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 4478cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4479cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 4480cabdff1aSopenharmony_ci hz_out1211_r); 4481cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4482cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 4483cabdff1aSopenharmony_ci hz_out1211_l); 4484cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 4485cabdff1aSopenharmony_ci filt2); 4486cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 4487cabdff1aSopenharmony_ci filt2); 4488cabdff1aSopenharmony_ci tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4489cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 
4490cabdff1aSopenharmony_ci filt2); 4491cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 4492cabdff1aSopenharmony_ci filt2); 4493cabdff1aSopenharmony_ci tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4494cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 4495cabdff1aSopenharmony_ci filt2); 4496cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 4497cabdff1aSopenharmony_ci filt2); 4498cabdff1aSopenharmony_ci tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4499cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 4500cabdff1aSopenharmony_ci filt2); 4501cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 4502cabdff1aSopenharmony_ci filt2); 4503cabdff1aSopenharmony_ci tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4504cabdff1aSopenharmony_ci 4505cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5); 4506cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7); 4507cabdff1aSopenharmony_ci 4508cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 4509cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 4510cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 4511cabdff1aSopenharmony_ci 4512cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp0, hz_out6); 4513cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp1, hz_out7); 4514cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_h(tmp2, hz_out8); 4515cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_h(tmp3, hz_out9); 4516cabdff1aSopenharmony_ci 4517cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4518cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4519cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4520cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 
4521cabdff1aSopenharmony_ci} 4522cabdff1aSopenharmony_ci 4523cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, 4524cabdff1aSopenharmony_ci ptrdiff_t stride) 4525cabdff1aSopenharmony_ci{ 4526cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 4527cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 4528cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 4529cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 4530cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1; 4531cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4532cabdff1aSopenharmony_ci v16i8 src11, src12, mask0, mask1, mask2; 4533cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4534cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 4535cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4536cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 4537cabdff1aSopenharmony_ci v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3; 4538cabdff1aSopenharmony_ci v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 4539cabdff1aSopenharmony_ci v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 4540cabdff1aSopenharmony_ci v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 4541cabdff1aSopenharmony_ci v4i32 tmp0_w, tmp1_w; 4542cabdff1aSopenharmony_ci 4543cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 4544cabdff1aSopenharmony_ci 4545cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 4546cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 4547cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 4548cabdff1aSopenharmony_ci 4549cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 4550cabdff1aSopenharmony_ci 4551cabdff1aSopenharmony_ci 
LD_SB5(src, stride, src0, src1, src2, src3, src4); 4552cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4553cabdff1aSopenharmony_ci src += (5 * stride); 4554cabdff1aSopenharmony_ci 4555cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 4556cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 4557cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 4558cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 4559cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 4560cabdff1aSopenharmony_ci 4561cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 4562cabdff1aSopenharmony_ci src += (4 * stride); 4563cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 4564cabdff1aSopenharmony_ci 4565cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2); 4566cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2); 4567cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2); 4568cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4569cabdff1aSopenharmony_ci 4570cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4571cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4572cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4573cabdff1aSopenharmony_ci hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 4574cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4575cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4576cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4577cabdff1aSopenharmony_ci hz_out7, 
hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 4578cabdff1aSopenharmony_ci 4579cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4580cabdff1aSopenharmony_ci filt2); 4581cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 4582cabdff1aSopenharmony_ci filt2); 4583cabdff1aSopenharmony_ci tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4584cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4585cabdff1aSopenharmony_ci filt2); 4586cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 4587cabdff1aSopenharmony_ci filt2); 4588cabdff1aSopenharmony_ci tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4589cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4590cabdff1aSopenharmony_ci filt2); 4591cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 4592cabdff1aSopenharmony_ci filt2); 4593cabdff1aSopenharmony_ci tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4594cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4595cabdff1aSopenharmony_ci filt2); 4596cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 4597cabdff1aSopenharmony_ci filt2); 4598cabdff1aSopenharmony_ci tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4599cabdff1aSopenharmony_ci 4600cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5); 4601cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7); 4602cabdff1aSopenharmony_ci 4603cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 4604cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 4605cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 4606cabdff1aSopenharmony_ci 4607cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp0, hz_out3); 
4608cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp1, hz_out4); 4609cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_h(tmp2, hz_out5); 4610cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_h(tmp3, hz_out6); 4611cabdff1aSopenharmony_ci 4612cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4613cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4614cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4615cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 4616cabdff1aSopenharmony_ci dst += (4 * stride); 4617cabdff1aSopenharmony_ci 4618cabdff1aSopenharmony_ci LD_SB4(src, stride, src9, src10, src11, src12); 4619cabdff1aSopenharmony_ci XORI_B4_128_SB(src9, src10, src11, src12); 4620cabdff1aSopenharmony_ci hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2); 4621cabdff1aSopenharmony_ci hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2); 4622cabdff1aSopenharmony_ci hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2); 4623cabdff1aSopenharmony_ci hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2); 4624cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4625cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 4626cabdff1aSopenharmony_ci hz_out1211_r); 4627cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 4628cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 4629cabdff1aSopenharmony_ci hz_out1211_l); 4630cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 4631cabdff1aSopenharmony_ci filt2); 4632cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 4633cabdff1aSopenharmony_ci filt2); 4634cabdff1aSopenharmony_ci tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4635cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, 
hz_out910_r, filt0, filt1, 4636cabdff1aSopenharmony_ci filt2); 4637cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 4638cabdff1aSopenharmony_ci filt2); 4639cabdff1aSopenharmony_ci tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4640cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 4641cabdff1aSopenharmony_ci filt2); 4642cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 4643cabdff1aSopenharmony_ci filt2); 4644cabdff1aSopenharmony_ci tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4645cabdff1aSopenharmony_ci tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 4646cabdff1aSopenharmony_ci filt2); 4647cabdff1aSopenharmony_ci tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 4648cabdff1aSopenharmony_ci filt2); 4649cabdff1aSopenharmony_ci tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w); 4650cabdff1aSopenharmony_ci 4651cabdff1aSopenharmony_ci SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5); 4652cabdff1aSopenharmony_ci SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7); 4653cabdff1aSopenharmony_ci 4654cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 4655cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 4656cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 4657cabdff1aSopenharmony_ci 4658cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp0, hz_out7); 4659cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp1, hz_out8); 4660cabdff1aSopenharmony_ci tmp2 = __msa_aver_s_h(tmp2, hz_out9); 4661cabdff1aSopenharmony_ci tmp3 = __msa_aver_s_h(tmp3, hz_out10); 4662cabdff1aSopenharmony_ci 4663cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 4664cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 4665cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 4666cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, 
stride); 4667cabdff1aSopenharmony_ci} 4668cabdff1aSopenharmony_ci 4669cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, 4670cabdff1aSopenharmony_ci ptrdiff_t stride) 4671cabdff1aSopenharmony_ci{ 4672cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 4673cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 4674cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 4675cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 4676cabdff1aSopenharmony_ci v16u8 res, out = { 0 }; 4677cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4678cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2; 4679cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4680cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 4681cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4682cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 4683cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 4684cabdff1aSopenharmony_ci 4685cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 4686cabdff1aSopenharmony_ci 4687cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 4688cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 4689cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 4690cabdff1aSopenharmony_ci 4691cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 4692cabdff1aSopenharmony_ci 4693cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 4694cabdff1aSopenharmony_ci src += (5 * stride); 4695cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 4696cabdff1aSopenharmony_ci 4697cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4698cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 4699cabdff1aSopenharmony_ci 4700cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, 
src1, mask0, mask1, mask2); 4701cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 4702cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 4703cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 4704cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4705cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 4706cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 4707cabdff1aSopenharmony_ci 4708cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4709cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4710cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4711cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4712cabdff1aSopenharmony_ci 4713cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4714cabdff1aSopenharmony_ci filt2); 4715cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4716cabdff1aSopenharmony_ci filt2); 4717cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4718cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4719cabdff1aSopenharmony_ci filt2); 4720cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4721cabdff1aSopenharmony_ci filt2); 4722cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4723cabdff1aSopenharmony_ci 4724cabdff1aSopenharmony_ci SRARI_H2_SH(hz_out2, hz_out4, 5); 4725cabdff1aSopenharmony_ci SAT_SH2_SH(hz_out2, hz_out4, 7); 4726cabdff1aSopenharmony_ci 4727cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out2); 4728cabdff1aSopenharmony_ci dst1 = 
__msa_aver_s_h(dst1, hz_out4); 4729cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 4730cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, out); 4731cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(dst0, dst1); 4732cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, out); 4733cabdff1aSopenharmony_ci ST_W4(res, 0, 1, 2, 3, dst, stride); 4734cabdff1aSopenharmony_ci} 4735cabdff1aSopenharmony_ci 4736cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, 4737cabdff1aSopenharmony_ci ptrdiff_t stride) 4738cabdff1aSopenharmony_ci{ 4739cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 4740cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 4741cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 4742cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 4743cabdff1aSopenharmony_ci v16u8 res, out = { 0 }; 4744cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4745cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2; 4746cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 4747cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2; 4748cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 4749cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 4750cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 4751cabdff1aSopenharmony_ci 4752cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 4753cabdff1aSopenharmony_ci 4754cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 4755cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 4756cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 4757cabdff1aSopenharmony_ci 4758cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 4759cabdff1aSopenharmony_ci 4760cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 4761cabdff1aSopenharmony_ci src += (5 
* stride); 4762cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 4763cabdff1aSopenharmony_ci 4764cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4765cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 4766cabdff1aSopenharmony_ci 4767cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 4768cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 4769cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 4770cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 4771cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 4772cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 4773cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 4774cabdff1aSopenharmony_ci 4775cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 4776cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 4777cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 4778cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 4779cabdff1aSopenharmony_ci 4780cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 4781cabdff1aSopenharmony_ci filt2); 4782cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 4783cabdff1aSopenharmony_ci filt2); 4784cabdff1aSopenharmony_ci dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4785cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 4786cabdff1aSopenharmony_ci filt2); 4787cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 4788cabdff1aSopenharmony_ci filt2); 
4789cabdff1aSopenharmony_ci dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 4790cabdff1aSopenharmony_ci 4791cabdff1aSopenharmony_ci PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1); 4792cabdff1aSopenharmony_ci SRARI_H2_SH(hz_out0, hz_out1, 5); 4793cabdff1aSopenharmony_ci SAT_SH2_SH(hz_out0, hz_out1, 7); 4794cabdff1aSopenharmony_ci 4795cabdff1aSopenharmony_ci dst0 = __msa_aver_s_h(dst0, hz_out0); 4796cabdff1aSopenharmony_ci dst1 = __msa_aver_s_h(dst1, hz_out1); 4797cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 4798cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, out); 4799cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(dst0, dst1); 4800cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, out); 4801cabdff1aSopenharmony_ci ST_W4(res, 0, 1, 2, 3, dst, stride); 4802cabdff1aSopenharmony_ci} 4803cabdff1aSopenharmony_ci 4804cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, 4805cabdff1aSopenharmony_ci ptrdiff_t stride) 4806cabdff1aSopenharmony_ci{ 4807cabdff1aSopenharmony_ci int32_t loop_cnt; 4808cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 4809cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 4810cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 4811cabdff1aSopenharmony_ci v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; 4812cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4813cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 4814cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 4815cabdff1aSopenharmony_ci v16i8 src65_l, src87_l, filt0, filt1, filt2; 4816cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 4817cabdff1aSopenharmony_ci 4818cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 4819cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 4820cabdff1aSopenharmony_ci filt2 = (v16i8) 
__msa_fill_h(filt_const2); 4821cabdff1aSopenharmony_ci src -= (stride * 2); 4822cabdff1aSopenharmony_ci 4823cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 4824cabdff1aSopenharmony_ci src += (5 * stride); 4825cabdff1aSopenharmony_ci 4826cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4827cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 4828cabdff1aSopenharmony_ci src32_r, src43_r); 4829cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 4830cabdff1aSopenharmony_ci src32_l, src43_l); 4831cabdff1aSopenharmony_ci 4832cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 4833cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 4834cabdff1aSopenharmony_ci src += (4 * stride); 4835cabdff1aSopenharmony_ci 4836cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 4837cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 4838cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 4839cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 4840cabdff1aSopenharmony_ci src65_l, src76_l, src87_l); 4841cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 4842cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 4843cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 4844cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 4845cabdff1aSopenharmony_ci out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 4846cabdff1aSopenharmony_ci out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 4847cabdff1aSopenharmony_ci out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 4848cabdff1aSopenharmony_ci out3_l = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 4849cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 4850cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 4851cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5); 4852cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 4853cabdff1aSopenharmony_ci LD_UB4(dst, stride, dst0, dst1, dst2, dst3); 4854cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 4855cabdff1aSopenharmony_ci out3_r, res0, res1, res2, res3); 4856cabdff1aSopenharmony_ci XORI_B4_128_UB(res0, res1, res2, res3); 4857cabdff1aSopenharmony_ci AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 4858cabdff1aSopenharmony_ci AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3); 4859cabdff1aSopenharmony_ci ST_UB4(res0, res1, res2, res3, dst, stride); 4860cabdff1aSopenharmony_ci dst += (4 * stride); 4861cabdff1aSopenharmony_ci 4862cabdff1aSopenharmony_ci src10_r = src54_r; 4863cabdff1aSopenharmony_ci src32_r = src76_r; 4864cabdff1aSopenharmony_ci src21_r = src65_r; 4865cabdff1aSopenharmony_ci src43_r = src87_r; 4866cabdff1aSopenharmony_ci src10_l = src54_l; 4867cabdff1aSopenharmony_ci src32_l = src76_l; 4868cabdff1aSopenharmony_ci src21_l = src65_l; 4869cabdff1aSopenharmony_ci src43_l = src87_l; 4870cabdff1aSopenharmony_ci src4 = src8; 4871cabdff1aSopenharmony_ci } 4872cabdff1aSopenharmony_ci} 4873cabdff1aSopenharmony_ci 4874cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, 4875cabdff1aSopenharmony_ci ptrdiff_t stride) 4876cabdff1aSopenharmony_ci{ 4877cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 4878cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 4879cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 4880cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 4881cabdff1aSopenharmony_ci v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; 
4882cabdff1aSopenharmony_ci v16u8 out0, out1, out2, out3; 4883cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r; 4884cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 4885cabdff1aSopenharmony_ci v16i8 filt0, filt1, filt2; 4886cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r; 4887cabdff1aSopenharmony_ci 4888cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 4889cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 4890cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 4891cabdff1aSopenharmony_ci 4892cabdff1aSopenharmony_ci src -= (stride * 2); 4893cabdff1aSopenharmony_ci 4894cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 4895cabdff1aSopenharmony_ci src += (5 * stride); 4896cabdff1aSopenharmony_ci 4897cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 4898cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 4899cabdff1aSopenharmony_ci src32_r, src43_r); 4900cabdff1aSopenharmony_ci 4901cabdff1aSopenharmony_ci LD_SB4(src, stride, src7, src8, src9, src10); 4902cabdff1aSopenharmony_ci src += (4 * stride); 4903cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 4904cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 4905cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 4906cabdff1aSopenharmony_ci out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 4907cabdff1aSopenharmony_ci out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 4908cabdff1aSopenharmony_ci out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 4909cabdff1aSopenharmony_ci out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 4910cabdff1aSopenharmony_ci 4911cabdff1aSopenharmony_ci LD_SB4(src, stride, src0, 
src1, src2, src3); 4912cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 4913cabdff1aSopenharmony_ci ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r, 4914cabdff1aSopenharmony_ci src21_r, src32_r, src43_r); 4915cabdff1aSopenharmony_ci out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2); 4916cabdff1aSopenharmony_ci out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2); 4917cabdff1aSopenharmony_ci out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2); 4918cabdff1aSopenharmony_ci out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2); 4919cabdff1aSopenharmony_ci 4920cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 4921cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 4922cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 4923cabdff1aSopenharmony_ci LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 4924cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst2); 4925cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst3); 4926cabdff1aSopenharmony_ci 4927cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); 4928cabdff1aSopenharmony_ci SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5); 4929cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 4930cabdff1aSopenharmony_ci SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7); 4931cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(out0_r, out1_r); 4932cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(out2_r, out3_r); 4933cabdff1aSopenharmony_ci out2 = PCKEV_XORI128_UB(out4_r, out5_r); 4934cabdff1aSopenharmony_ci out3 = PCKEV_XORI128_UB(out6_r, out7_r); 4935cabdff1aSopenharmony_ci AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1, 4936cabdff1aSopenharmony_ci dst2, dst3); 4937cabdff1aSopenharmony_ci ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 4938cabdff1aSopenharmony_ci} 4939cabdff1aSopenharmony_ci 4940cabdff1aSopenharmony_civoid 
ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, 4941cabdff1aSopenharmony_ci ptrdiff_t stride) 4942cabdff1aSopenharmony_ci{ 4943cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 4944cabdff1aSopenharmony_ci int16_t filt_const0 = 0xfb01; 4945cabdff1aSopenharmony_ci int16_t filt_const1 = 0x1414; 4946cabdff1aSopenharmony_ci int16_t filt_const2 = 0x1fb; 4947cabdff1aSopenharmony_ci v16u8 res, dst0 = { 0 }; 4948cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 4949cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 4950cabdff1aSopenharmony_ci v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 4951cabdff1aSopenharmony_ci v8i16 out10, out32; 4952cabdff1aSopenharmony_ci 4953cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 4954cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 4955cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 4956cabdff1aSopenharmony_ci 4957cabdff1aSopenharmony_ci src -= (stride * 2); 4958cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 4959cabdff1aSopenharmony_ci src += (5 * stride); 4960cabdff1aSopenharmony_ci 4961cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 4962cabdff1aSopenharmony_ci src32_r, src43_r); 4963cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 4964cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 4965cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 4966cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 4967cabdff1aSopenharmony_ci src76_r, src87_r); 4968cabdff1aSopenharmony_ci ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 4969cabdff1aSopenharmony_ci XORI_B2_128_SB(src6554, src8776); 4970cabdff1aSopenharmony_ci out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, 
filt1, filt2); 4971cabdff1aSopenharmony_ci out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 4972cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 5); 4973cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 4974cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 4975cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 4976cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(out10, out32); 4977cabdff1aSopenharmony_ci dst0 = __msa_aver_u_b(res, dst0); 4978cabdff1aSopenharmony_ci ST_W4(dst0, 0, 1, 2, 3, dst, stride); 4979cabdff1aSopenharmony_ci} 4980cabdff1aSopenharmony_ci 4981cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, 4982cabdff1aSopenharmony_ci ptrdiff_t stride) 4983cabdff1aSopenharmony_ci{ 4984cabdff1aSopenharmony_ci uint32_t row; 4985cabdff1aSopenharmony_ci v16u8 out, dst0; 4986cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4987cabdff1aSopenharmony_ci v16i8 src11; 4988cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3; 4989cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 4990cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 4991cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 4992cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 4993cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 4994cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 4995cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 4996cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 4997cabdff1aSopenharmony_ci 4998cabdff1aSopenharmony_ci mask3 = mask0 + 4; 4999cabdff1aSopenharmony_ci mask4 = mask1 + 4; 5000cabdff1aSopenharmony_ci mask5 = mask2 + 4; 5001cabdff1aSopenharmony_ci 5002cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5003cabdff1aSopenharmony_ci 
5004cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5005cabdff1aSopenharmony_ci LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 5006cabdff1aSopenharmony_ci src += (5 * stride); 5007cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5008cabdff1aSopenharmony_ci XORI_B5_128_SB(src7, src8, src9, src10, src11); 5009cabdff1aSopenharmony_ci 5010cabdff1aSopenharmony_ci for (row = 16; row--;) { 5011cabdff1aSopenharmony_ci LD_SB2(src, 8, src5, src6); 5012cabdff1aSopenharmony_ci src += stride; 5013cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 5014cabdff1aSopenharmony_ci dst0 = LD_UB(dst); 5015cabdff1aSopenharmony_ci 5016cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5017cabdff1aSopenharmony_ci vt_res0, vt_res1); 5018cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 5019cabdff1aSopenharmony_ci vt_res2, vt_res3); 5020cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5021cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5022cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5023cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5024cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5025cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5026cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 5027cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5028cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5029cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5030cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5031cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5032cabdff1aSopenharmony_ci 
DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5033cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5034cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5035cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5036cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5037cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5038cabdff1aSopenharmony_ci tmp0 = __msa_srari_h(shf_vec2, 5); 5039cabdff1aSopenharmony_ci tmp1 = __msa_srari_h(shf_vec5, 5); 5040cabdff1aSopenharmony_ci tmp2 = __msa_srari_h(shf_vec8, 5); 5041cabdff1aSopenharmony_ci tmp3 = __msa_srari_h(shf_vec11, 5); 5042cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5043cabdff1aSopenharmony_ci PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 5044cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5045cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp2, tmp0); 5046cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp3, tmp1); 5047cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 5048cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dst0); 5049cabdff1aSopenharmony_ci ST_UB(out, dst); 5050cabdff1aSopenharmony_ci dst += stride; 5051cabdff1aSopenharmony_ci 5052cabdff1aSopenharmony_ci src0 = src1; 5053cabdff1aSopenharmony_ci src1 = src2; 5054cabdff1aSopenharmony_ci src2 = src3; 5055cabdff1aSopenharmony_ci src3 = src4; 5056cabdff1aSopenharmony_ci src4 = src5; 5057cabdff1aSopenharmony_ci src7 = src8; 5058cabdff1aSopenharmony_ci src8 = src9; 5059cabdff1aSopenharmony_ci src9 = src10; 5060cabdff1aSopenharmony_ci src10 = src11; 5061cabdff1aSopenharmony_ci src11 = src6; 5062cabdff1aSopenharmony_ci } 5063cabdff1aSopenharmony_ci} 5064cabdff1aSopenharmony_ci 5065cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, 
5066cabdff1aSopenharmony_ci ptrdiff_t stride) 5067cabdff1aSopenharmony_ci{ 5068cabdff1aSopenharmony_ci uint32_t row; 5069cabdff1aSopenharmony_ci v16u8 out, dst0; 5070cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 5071cabdff1aSopenharmony_ci v16i8 src11; 5072cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3; 5073cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5074cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5; 5075cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5076cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5077cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5078cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5079cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 5080cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 5081cabdff1aSopenharmony_ci 5082cabdff1aSopenharmony_ci mask3 = mask0 + 4; 5083cabdff1aSopenharmony_ci mask4 = mask1 + 4; 5084cabdff1aSopenharmony_ci mask5 = mask2 + 4; 5085cabdff1aSopenharmony_ci 5086cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5087cabdff1aSopenharmony_ci 5088cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5089cabdff1aSopenharmony_ci LD_SB5(src + 8, stride, src7, src8, src9, src10, src11); 5090cabdff1aSopenharmony_ci src += (5 * stride); 5091cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5092cabdff1aSopenharmony_ci XORI_B5_128_SB(src7, src8, src9, src10, src11); 5093cabdff1aSopenharmony_ci 5094cabdff1aSopenharmony_ci for (row = 16; row--;) { 5095cabdff1aSopenharmony_ci LD_SB2(src, 8, src5, src6); 5096cabdff1aSopenharmony_ci src += stride; 5097cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 5098cabdff1aSopenharmony_ci dst0 = LD_UB(dst); 5099cabdff1aSopenharmony_ci 5100cabdff1aSopenharmony_ci 
AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5101cabdff1aSopenharmony_ci vt_res0, vt_res1); 5102cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6, 5103cabdff1aSopenharmony_ci vt_res2, vt_res3); 5104cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5105cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5106cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5107cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5108cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5109cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5110cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 5111cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5112cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5113cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5114cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5115cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5116cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5117cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5118cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5119cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5120cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5121cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5122cabdff1aSopenharmony_ci tmp0 = __msa_srari_h(shf_vec2, 5); 5123cabdff1aSopenharmony_ci tmp1 = __msa_srari_h(shf_vec5, 5); 5124cabdff1aSopenharmony_ci tmp2 = __msa_srari_h(shf_vec8, 5); 
5125cabdff1aSopenharmony_ci tmp3 = __msa_srari_h(shf_vec11, 5); 5126cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5127cabdff1aSopenharmony_ci tmp0 = __msa_pckod_h(tmp2, tmp0); 5128cabdff1aSopenharmony_ci tmp1 = __msa_pckod_h(tmp3, tmp1); 5129cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5130cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp2, tmp0); 5131cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp3, tmp1); 5132cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 5133cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dst0); 5134cabdff1aSopenharmony_ci ST_UB(out, dst); 5135cabdff1aSopenharmony_ci dst += stride; 5136cabdff1aSopenharmony_ci 5137cabdff1aSopenharmony_ci src0 = src1; 5138cabdff1aSopenharmony_ci src1 = src2; 5139cabdff1aSopenharmony_ci src2 = src3; 5140cabdff1aSopenharmony_ci src3 = src4; 5141cabdff1aSopenharmony_ci src4 = src5; 5142cabdff1aSopenharmony_ci src7 = src8; 5143cabdff1aSopenharmony_ci src8 = src9; 5144cabdff1aSopenharmony_ci src9 = src10; 5145cabdff1aSopenharmony_ci src10 = src11; 5146cabdff1aSopenharmony_ci src11 = src6; 5147cabdff1aSopenharmony_ci } 5148cabdff1aSopenharmony_ci} 5149cabdff1aSopenharmony_ci 5150cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, 5151cabdff1aSopenharmony_ci ptrdiff_t stride) 5152cabdff1aSopenharmony_ci{ 5153cabdff1aSopenharmony_ci uint32_t row; 5154cabdff1aSopenharmony_ci uint64_t tp0, tp1; 5155cabdff1aSopenharmony_ci v16u8 out, dst0 = { 0 }; 5156cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 5157cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3; 5158cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5159cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 5160cabdff1aSopenharmony_ci v8i16 mask3, mask4, mask5; 5161cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, 
hz_res3; 5162cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5163cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5164cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5165cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 5166cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 5167cabdff1aSopenharmony_ci 5168cabdff1aSopenharmony_ci mask3 = mask0 + 4; 5169cabdff1aSopenharmony_ci mask4 = mask1 + 4; 5170cabdff1aSopenharmony_ci mask5 = mask2 + 4; 5171cabdff1aSopenharmony_ci 5172cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5173cabdff1aSopenharmony_ci 5174cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5175cabdff1aSopenharmony_ci src += (5 * stride); 5176cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5177cabdff1aSopenharmony_ci 5178cabdff1aSopenharmony_ci for (row = 4; row--;) { 5179cabdff1aSopenharmony_ci LD_SB2(src, stride, src5, src6); 5180cabdff1aSopenharmony_ci src += (2 * stride); 5181cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 5182cabdff1aSopenharmony_ci 5183cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5184cabdff1aSopenharmony_ci vt_res0, vt_res1); 5185cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 5186cabdff1aSopenharmony_ci vt_res2, vt_res3); 5187cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5188cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5189cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5190cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5191cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5192cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5193cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, 
mask3, 5194cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5195cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5196cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5197cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5198cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5199cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5200cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5201cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5202cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5203cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5204cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5205cabdff1aSopenharmony_ci tmp0 = __msa_srari_h(shf_vec2, 5); 5206cabdff1aSopenharmony_ci tmp1 = __msa_srari_h(shf_vec5, 5); 5207cabdff1aSopenharmony_ci tmp2 = __msa_srari_h(shf_vec8, 5); 5208cabdff1aSopenharmony_ci tmp3 = __msa_srari_h(shf_vec11, 5); 5209cabdff1aSopenharmony_ci LD2(dst, stride, tp0, tp1); 5210cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 5211cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5212cabdff1aSopenharmony_ci PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 5213cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5214cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp2, tmp0); 5215cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp3, tmp1); 5216cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 5217cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dst0); 5218cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, stride); 5219cabdff1aSopenharmony_ci dst += (2 * stride); 5220cabdff1aSopenharmony_ci 5221cabdff1aSopenharmony_ci src0 = src2; 5222cabdff1aSopenharmony_ci src1 = 
src3; 5223cabdff1aSopenharmony_ci src2 = src4; 5224cabdff1aSopenharmony_ci src3 = src5; 5225cabdff1aSopenharmony_ci src4 = src6; 5226cabdff1aSopenharmony_ci } 5227cabdff1aSopenharmony_ci} 5228cabdff1aSopenharmony_ci 5229cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, 5230cabdff1aSopenharmony_ci ptrdiff_t stride) 5231cabdff1aSopenharmony_ci{ 5232cabdff1aSopenharmony_ci uint32_t row; 5233cabdff1aSopenharmony_ci uint64_t tp0, tp1; 5234cabdff1aSopenharmony_ci v16u8 out, dst0 = { 0 }; 5235cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 5236cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3; 5237cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5238cabdff1aSopenharmony_ci v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11; 5239cabdff1aSopenharmony_ci v8i16 mask3, mask4, mask5; 5240cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5241cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5242cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5243cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5244cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 5245cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 5246cabdff1aSopenharmony_ci 5247cabdff1aSopenharmony_ci mask3 = mask0 + 4; 5248cabdff1aSopenharmony_ci mask4 = mask1 + 4; 5249cabdff1aSopenharmony_ci mask5 = mask2 + 4; 5250cabdff1aSopenharmony_ci 5251cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5252cabdff1aSopenharmony_ci 5253cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5254cabdff1aSopenharmony_ci src += (5 * stride); 5255cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5256cabdff1aSopenharmony_ci 5257cabdff1aSopenharmony_ci for (row = 4; row--;) { 5258cabdff1aSopenharmony_ci LD_SB2(src, stride, src5, src6); 5259cabdff1aSopenharmony_ci src 
+= (2 * stride); 5260cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 5261cabdff1aSopenharmony_ci 5262cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5, 5263cabdff1aSopenharmony_ci vt_res0, vt_res1); 5264cabdff1aSopenharmony_ci AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6, 5265cabdff1aSopenharmony_ci vt_res2, vt_res3); 5266cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5267cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5268cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5269cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5270cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3, 5271cabdff1aSopenharmony_ci mask4, mask5, shf_vec6, shf_vec7, shf_vec8); 5272cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3, 5273cabdff1aSopenharmony_ci mask4, mask5, shf_vec9, shf_vec10, shf_vec11); 5274cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5275cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5276cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6); 5277cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9); 5278cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5279cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5280cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2); 5281cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3); 5282cabdff1aSopenharmony_ci SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10); 5283cabdff1aSopenharmony_ci SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7); 5284cabdff1aSopenharmony_ci tmp0 = __msa_srari_h(shf_vec2, 5); 
5285cabdff1aSopenharmony_ci tmp1 = __msa_srari_h(shf_vec5, 5); 5286cabdff1aSopenharmony_ci tmp2 = __msa_srari_h(shf_vec8, 5); 5287cabdff1aSopenharmony_ci tmp3 = __msa_srari_h(shf_vec11, 5); 5288cabdff1aSopenharmony_ci LD2(dst, stride, tp0, tp1); 5289cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 5290cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 5291cabdff1aSopenharmony_ci tmp0 = __msa_pckod_h(tmp2, tmp0); 5292cabdff1aSopenharmony_ci tmp1 = __msa_pckod_h(tmp3, tmp1); 5293cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3); 5294cabdff1aSopenharmony_ci tmp0 = __msa_aver_s_h(tmp2, tmp0); 5295cabdff1aSopenharmony_ci tmp1 = __msa_aver_s_h(tmp3, tmp1); 5296cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 5297cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dst0); 5298cabdff1aSopenharmony_ci ST_D2(out, 0, 1, dst, stride); 5299cabdff1aSopenharmony_ci dst += (2 * stride); 5300cabdff1aSopenharmony_ci 5301cabdff1aSopenharmony_ci src0 = src2; 5302cabdff1aSopenharmony_ci src1 = src3; 5303cabdff1aSopenharmony_ci src2 = src4; 5304cabdff1aSopenharmony_ci src3 = src5; 5305cabdff1aSopenharmony_ci src4 = src6; 5306cabdff1aSopenharmony_ci } 5307cabdff1aSopenharmony_ci} 5308cabdff1aSopenharmony_ci 5309cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, 5310cabdff1aSopenharmony_ci ptrdiff_t stride) 5311cabdff1aSopenharmony_ci{ 5312cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 5313cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 5314cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 5315cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 5316cabdff1aSopenharmony_ci v16u8 out, dstv = { 0 }; 5317cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 5318cabdff1aSopenharmony_ci v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 5319cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, 
src54_l, src65_l; 5320cabdff1aSopenharmony_ci v16i8 src76_l, src87_l, filt0, filt1, filt2; 5321cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 5322cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5323cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5324cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5325cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5326cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5327cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 5328cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 5329cabdff1aSopenharmony_ci v8i16 zeros = { 0 }; 5330cabdff1aSopenharmony_ci 5331cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 5332cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 5333cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 5334cabdff1aSopenharmony_ci 5335cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5336cabdff1aSopenharmony_ci 5337cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5338cabdff1aSopenharmony_ci src += (5 * stride); 5339cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5340cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 5341cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 5342cabdff1aSopenharmony_ci 5343cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 5344cabdff1aSopenharmony_ci src32_r, src43_r); 5345cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 5346cabdff1aSopenharmony_ci src76_r, src87_r); 5347cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 5348cabdff1aSopenharmony_ci src32_l, src43_l); 5349cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, 
src6, src8, src7, src54_l, src65_l, 5350cabdff1aSopenharmony_ci src76_l, src87_l); 5351cabdff1aSopenharmony_ci vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 5352cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 5353cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 5354cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 5355cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5356cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5357cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5358cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5359cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5360cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5361cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5362cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5363cabdff1aSopenharmony_ci 5364cabdff1aSopenharmony_ci vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 5365cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 5366cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 5367cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 5368cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5369cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 5370cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5371cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 5372cabdff1aSopenharmony_ci hz_res2 
= __msa_hadd_s_w(shf_vec0, shf_vec0); 5373cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 5374cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5375cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 5376cabdff1aSopenharmony_ci 5377cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res0, hz_res1, 10); 5378cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res0, hz_res1, 7); 5379cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res2, hz_res3, 10); 5380cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res2, hz_res3, 7); 5381cabdff1aSopenharmony_ci 5382cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 5383cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 5384cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec6, 5); 5385cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec7, 5); 5386cabdff1aSopenharmony_ci 5387cabdff1aSopenharmony_ci SAT_SH2_SH(dst0, dst1, 7); 5388cabdff1aSopenharmony_ci SAT_SH2_SH(dst2, dst3, 7); 5389cabdff1aSopenharmony_ci ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1); 5390cabdff1aSopenharmony_ci ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3); 5391cabdff1aSopenharmony_ci 5392cabdff1aSopenharmony_ci hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 5393cabdff1aSopenharmony_ci hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 5394cabdff1aSopenharmony_ci hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 5395cabdff1aSopenharmony_ci hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 5396cabdff1aSopenharmony_ci 5397cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 5398cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv); 5399cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 5400cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst2); 5401cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dstv); 5402cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 5403cabdff1aSopenharmony_ci} 5404cabdff1aSopenharmony_ci 
5405cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, 5406cabdff1aSopenharmony_ci ptrdiff_t stride) 5407cabdff1aSopenharmony_ci{ 5408cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 5409cabdff1aSopenharmony_ci const int16_t filt_const0 = 0xfb01; 5410cabdff1aSopenharmony_ci const int16_t filt_const1 = 0x1414; 5411cabdff1aSopenharmony_ci const int16_t filt_const2 = 0x1fb; 5412cabdff1aSopenharmony_ci v16u8 out, dstv = { 0 }; 5413cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 5414cabdff1aSopenharmony_ci v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r; 5415cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l; 5416cabdff1aSopenharmony_ci v16i8 src76_l, src87_l, filt0, filt1, filt2; 5417cabdff1aSopenharmony_ci v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7; 5418cabdff1aSopenharmony_ci v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6; 5419cabdff1aSopenharmony_ci v4i32 hz_res0, hz_res1, hz_res2, hz_res3; 5420cabdff1aSopenharmony_ci v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; 5421cabdff1aSopenharmony_ci v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; 5422cabdff1aSopenharmony_ci v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; 5423cabdff1aSopenharmony_ci v8i16 minus5h = __msa_ldi_h(-5); 5424cabdff1aSopenharmony_ci v8i16 plus20h = __msa_ldi_h(20); 5425cabdff1aSopenharmony_ci v8i16 zeros = { 0 }; 5426cabdff1aSopenharmony_ci 5427cabdff1aSopenharmony_ci filt0 = (v16i8) __msa_fill_h(filt_const0); 5428cabdff1aSopenharmony_ci filt1 = (v16i8) __msa_fill_h(filt_const1); 5429cabdff1aSopenharmony_ci filt2 = (v16i8) __msa_fill_h(filt_const2); 5430cabdff1aSopenharmony_ci 5431cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5432cabdff1aSopenharmony_ci 5433cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5434cabdff1aSopenharmony_ci src += (5 * stride); 5435cabdff1aSopenharmony_ci 
XORI_B5_128_SB(src0, src1, src2, src3, src4); 5436cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 5437cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 5438cabdff1aSopenharmony_ci 5439cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 5440cabdff1aSopenharmony_ci src32_r, src43_r); 5441cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 5442cabdff1aSopenharmony_ci src76_r, src87_r); 5443cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, 5444cabdff1aSopenharmony_ci src32_l, src43_l); 5445cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 5446cabdff1aSopenharmony_ci src76_l, src87_l); 5447cabdff1aSopenharmony_ci vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 5448cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 5449cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 5450cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 5451cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5452cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec2); 5453cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5454cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec5); 5455cabdff1aSopenharmony_ci hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5456cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0); 5457cabdff1aSopenharmony_ci hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5458cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1); 5459cabdff1aSopenharmony_ci 5460cabdff1aSopenharmony_ci vt_res0 = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 5461cabdff1aSopenharmony_ci vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 5462cabdff1aSopenharmony_ci vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 5463cabdff1aSopenharmony_ci vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 5464cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0, 5465cabdff1aSopenharmony_ci mask1, mask2, shf_vec0, shf_vec1, shf_vec6); 5466cabdff1aSopenharmony_ci VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0, 5467cabdff1aSopenharmony_ci mask1, mask2, shf_vec3, shf_vec4, shf_vec7); 5468cabdff1aSopenharmony_ci hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0); 5469cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2); 5470cabdff1aSopenharmony_ci hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3); 5471cabdff1aSopenharmony_ci DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3); 5472cabdff1aSopenharmony_ci 5473cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res0, hz_res1, 10); 5474cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res0, hz_res1, 7); 5475cabdff1aSopenharmony_ci SRARI_W2_SW(hz_res2, hz_res3, 10); 5476cabdff1aSopenharmony_ci SAT_SW2_SW(hz_res2, hz_res3, 7); 5477cabdff1aSopenharmony_ci 5478cabdff1aSopenharmony_ci dst0 = __msa_srari_h(shf_vec2, 5); 5479cabdff1aSopenharmony_ci dst1 = __msa_srari_h(shf_vec5, 5); 5480cabdff1aSopenharmony_ci dst2 = __msa_srari_h(shf_vec6, 5); 5481cabdff1aSopenharmony_ci dst3 = __msa_srari_h(shf_vec7, 5); 5482cabdff1aSopenharmony_ci 5483cabdff1aSopenharmony_ci SAT_SH2_SH(dst0, dst1, 7); 5484cabdff1aSopenharmony_ci SAT_SH2_SH(dst2, dst3, 7); 5485cabdff1aSopenharmony_ci 5486cabdff1aSopenharmony_ci dst0 = __msa_ilvod_h(zeros, dst0); 5487cabdff1aSopenharmony_ci dst1 = __msa_ilvod_h(zeros, dst1); 5488cabdff1aSopenharmony_ci dst2 = __msa_ilvod_h(zeros, dst2); 5489cabdff1aSopenharmony_ci 
dst3 = __msa_ilvod_h(zeros, dst3); 5490cabdff1aSopenharmony_ci 5491cabdff1aSopenharmony_ci hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0); 5492cabdff1aSopenharmony_ci hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1); 5493cabdff1aSopenharmony_ci hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2); 5494cabdff1aSopenharmony_ci hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3); 5495cabdff1aSopenharmony_ci 5496cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 5497cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv); 5498cabdff1aSopenharmony_ci PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2); 5499cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(dst0, dst2); 5500cabdff1aSopenharmony_ci out = __msa_aver_u_b(out, dstv); 5501cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, stride); 5502cabdff1aSopenharmony_ci} 5503cabdff1aSopenharmony_ci 5504cabdff1aSopenharmony_civoid ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, 5505cabdff1aSopenharmony_ci ptrdiff_t stride) 5506cabdff1aSopenharmony_ci{ 5507cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 5508cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 5509cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 5510cabdff1aSopenharmony_ci const uint8_t *src_tmp = src - (2 * stride) - 2; 5511cabdff1aSopenharmony_ci uint8_t *dst_tmp = dst; 5512cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 5513cabdff1aSopenharmony_ci uint32_t multiple8_cnt, loop_cnt; 5514cabdff1aSopenharmony_ci v16u8 dst0, dst1, out0, out1; 5515cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2; 5516cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 5517cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, res0, res1, res2, res3; 5518cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 5519cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l; 
5520cabdff1aSopenharmony_ci v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l; 5521cabdff1aSopenharmony_ci v8i16 hz_out87_l, filt0, filt1, filt2; 5522cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 5523cabdff1aSopenharmony_ci 5524cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 5525cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 5526cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 5527cabdff1aSopenharmony_ci 5528cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 5529cabdff1aSopenharmony_ci 5530cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 5531cabdff1aSopenharmony_ci src = src_tmp; 5532cabdff1aSopenharmony_ci dst = dst_tmp; 5533cabdff1aSopenharmony_ci 5534cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5535cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5536cabdff1aSopenharmony_ci src += (5 * stride); 5537cabdff1aSopenharmony_ci 5538cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5539cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5540cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5541cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5542cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 5543cabdff1aSopenharmony_ci 5544cabdff1aSopenharmony_ci for (loop_cnt = 4; loop_cnt--;) { 5545cabdff1aSopenharmony_ci LD_SB4(src, stride, src0, src1, src2, src3); 5546cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 5547cabdff1aSopenharmony_ci src += (4 * stride); 5548cabdff1aSopenharmony_ci 5549cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5550cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5551cabdff1aSopenharmony_ci hz_out7 = 
AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5552cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5553cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 5554cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, 5555cabdff1aSopenharmony_ci hz_out43_r); 5556cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, 5557cabdff1aSopenharmony_ci hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, 5558cabdff1aSopenharmony_ci hz_out43_l); 5559cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 5560cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, 5561cabdff1aSopenharmony_ci hz_out87_r); 5562cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, 5563cabdff1aSopenharmony_ci hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, 5564cabdff1aSopenharmony_ci hz_out87_l); 5565cabdff1aSopenharmony_ci 5566cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, 5567cabdff1aSopenharmony_ci filt1, filt2); 5568cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, 5569cabdff1aSopenharmony_ci filt1, filt2); 5570cabdff1aSopenharmony_ci res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5571cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, 5572cabdff1aSopenharmony_ci filt1, filt2); 5573cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, 5574cabdff1aSopenharmony_ci filt1, filt2); 5575cabdff1aSopenharmony_ci res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5576cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, 5577cabdff1aSopenharmony_ci filt1, filt2); 5578cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, 5579cabdff1aSopenharmony_ci filt1, 
filt2); 5580cabdff1aSopenharmony_ci res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5581cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, 5582cabdff1aSopenharmony_ci filt1, filt2); 5583cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, 5584cabdff1aSopenharmony_ci filt1, filt2); 5585cabdff1aSopenharmony_ci res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5586cabdff1aSopenharmony_ci 5587cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 5588cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 5589cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 5590cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(res0, res1); 5591cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(res2, res3); 5592cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1); 5593cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, stride); 5594cabdff1aSopenharmony_ci dst += (4 * stride); 5595cabdff1aSopenharmony_ci 5596cabdff1aSopenharmony_ci hz_out0 = hz_out4; 5597cabdff1aSopenharmony_ci hz_out1 = hz_out5; 5598cabdff1aSopenharmony_ci hz_out2 = hz_out6; 5599cabdff1aSopenharmony_ci hz_out3 = hz_out7; 5600cabdff1aSopenharmony_ci hz_out4 = hz_out8; 5601cabdff1aSopenharmony_ci } 5602cabdff1aSopenharmony_ci 5603cabdff1aSopenharmony_ci src_tmp += 8; 5604cabdff1aSopenharmony_ci dst_tmp += 8; 5605cabdff1aSopenharmony_ci } 5606cabdff1aSopenharmony_ci} 5607cabdff1aSopenharmony_ci 5608cabdff1aSopenharmony_civoid ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, 5609cabdff1aSopenharmony_ci ptrdiff_t stride) 5610cabdff1aSopenharmony_ci{ 5611cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 5612cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 5613cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 5614cabdff1aSopenharmony_ci uint64_t tp0, tp1, tp2, tp3; 5615cabdff1aSopenharmony_ci v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 }; 5616cabdff1aSopenharmony_ci v16i8 
src0, src1, src2, src3, src4, mask0, mask1, mask2; 5617cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 5618cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12; 5619cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 5620cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r; 5621cabdff1aSopenharmony_ci v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3; 5622cabdff1aSopenharmony_ci v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l; 5623cabdff1aSopenharmony_ci v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l; 5624cabdff1aSopenharmony_ci v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2; 5625cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 5626cabdff1aSopenharmony_ci 5627cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 5628cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 5629cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 5630cabdff1aSopenharmony_ci 5631cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2); 5632cabdff1aSopenharmony_ci 5633cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5634cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5635cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5636cabdff1aSopenharmony_ci src += (5 * stride); 5637cabdff1aSopenharmony_ci 5638cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5639cabdff1aSopenharmony_ci hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5640cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5641cabdff1aSopenharmony_ci hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5642cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2); 5643cabdff1aSopenharmony_ci 
5644cabdff1aSopenharmony_ci LD_SB4(src, stride, src0, src1, src2, src3); 5645cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 5646cabdff1aSopenharmony_ci src += (4 * stride); 5647cabdff1aSopenharmony_ci hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5648cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5649cabdff1aSopenharmony_ci hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5650cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5651cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 5652cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 5653cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 5654cabdff1aSopenharmony_ci hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l); 5655cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 5656cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 5657cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 5658cabdff1aSopenharmony_ci hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l); 5659cabdff1aSopenharmony_ci 5660cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 5661cabdff1aSopenharmony_ci filt2); 5662cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1, 5663cabdff1aSopenharmony_ci filt2); 5664cabdff1aSopenharmony_ci res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5665cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 5666cabdff1aSopenharmony_ci filt2); 5667cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1, 5668cabdff1aSopenharmony_ci filt2); 5669cabdff1aSopenharmony_ci res1 = 
__msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5670cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 5671cabdff1aSopenharmony_ci filt2); 5672cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1, 5673cabdff1aSopenharmony_ci filt2); 5674cabdff1aSopenharmony_ci res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5675cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 5676cabdff1aSopenharmony_ci filt2); 5677cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1, 5678cabdff1aSopenharmony_ci filt2); 5679cabdff1aSopenharmony_ci res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5680cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 5681cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 5682cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 5683cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(res0, res1); 5684cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(res2, res3); 5685cabdff1aSopenharmony_ci AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 5686cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 5687cabdff1aSopenharmony_ci dst += (4 * stride); 5688cabdff1aSopenharmony_ci 5689cabdff1aSopenharmony_ci LD_SB4(src, stride, src0, src1, src2, src3); 5690cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 5691cabdff1aSopenharmony_ci hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2); 5692cabdff1aSopenharmony_ci hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2); 5693cabdff1aSopenharmony_ci hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2); 5694cabdff1aSopenharmony_ci hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2); 5695cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 5696cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r, 5697cabdff1aSopenharmony_ci 
hz_out1211_r); 5698cabdff1aSopenharmony_ci ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10, 5699cabdff1aSopenharmony_ci hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l, 5700cabdff1aSopenharmony_ci hz_out1211_l); 5701cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1, 5702cabdff1aSopenharmony_ci filt2); 5703cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1, 5704cabdff1aSopenharmony_ci filt2); 5705cabdff1aSopenharmony_ci res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5706cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1, 5707cabdff1aSopenharmony_ci filt2); 5708cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1, 5709cabdff1aSopenharmony_ci filt2); 5710cabdff1aSopenharmony_ci res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5711cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1, 5712cabdff1aSopenharmony_ci filt2); 5713cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1, 5714cabdff1aSopenharmony_ci filt2); 5715cabdff1aSopenharmony_ci res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5716cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1, 5717cabdff1aSopenharmony_ci filt2); 5718cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1, 5719cabdff1aSopenharmony_ci filt2); 5720cabdff1aSopenharmony_ci res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5721cabdff1aSopenharmony_ci LD4(dst, stride, tp0, tp1, tp2, tp3); 5722cabdff1aSopenharmony_ci INSERT_D2_UB(tp0, tp1, dst0); 5723cabdff1aSopenharmony_ci INSERT_D2_UB(tp2, tp3, dst1); 5724cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(res0, res1); 5725cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(res2, res3); 5726cabdff1aSopenharmony_ci 
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1); 5727cabdff1aSopenharmony_ci ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 5728cabdff1aSopenharmony_ci} 5729cabdff1aSopenharmony_ci 5730cabdff1aSopenharmony_civoid ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, 5731cabdff1aSopenharmony_ci ptrdiff_t stride) 5732cabdff1aSopenharmony_ci{ 5733cabdff1aSopenharmony_ci const int32_t filt_const0 = 0xfffb0001; 5734cabdff1aSopenharmony_ci const int32_t filt_const1 = 0x140014; 5735cabdff1aSopenharmony_ci const int32_t filt_const2 = 0x1fffb; 5736cabdff1aSopenharmony_ci uint32_t tp0, tp1, tp2, tp3; 5737cabdff1aSopenharmony_ci v16u8 res, dst0 = { 0 }; 5738cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 5739cabdff1aSopenharmony_ci v16i8 mask0, mask1, mask2; 5740cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 5741cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2; 5742cabdff1aSopenharmony_ci v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r; 5743cabdff1aSopenharmony_ci v8i16 hz_out65_r, hz_out76_r, hz_out87_r; 5744cabdff1aSopenharmony_ci v4i32 tmp0, tmp1; 5745cabdff1aSopenharmony_ci 5746cabdff1aSopenharmony_ci LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2); 5747cabdff1aSopenharmony_ci 5748cabdff1aSopenharmony_ci filt0 = (v8i16) __msa_fill_w(filt_const0); 5749cabdff1aSopenharmony_ci filt1 = (v8i16) __msa_fill_w(filt_const1); 5750cabdff1aSopenharmony_ci filt2 = (v8i16) __msa_fill_w(filt_const2); 5751cabdff1aSopenharmony_ci 5752cabdff1aSopenharmony_ci src -= ((2 * stride) + 2); 5753cabdff1aSopenharmony_ci 5754cabdff1aSopenharmony_ci LD_SB5(src, stride, src0, src1, src2, src3, src4); 5755cabdff1aSopenharmony_ci src += (5 * stride); 5756cabdff1aSopenharmony_ci LD_SB4(src, stride, src5, src6, src7, src8); 5757cabdff1aSopenharmony_ci 5758cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 5759cabdff1aSopenharmony_ci 
XORI_B4_128_SB(src5, src6, src7, src8); 5760cabdff1aSopenharmony_ci hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2); 5761cabdff1aSopenharmony_ci hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2); 5762cabdff1aSopenharmony_ci hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2); 5763cabdff1aSopenharmony_ci hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2); 5764cabdff1aSopenharmony_ci hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2); 5765cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3); 5766cabdff1aSopenharmony_ci PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7); 5767cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4, 5768cabdff1aSopenharmony_ci hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r); 5769cabdff1aSopenharmony_ci ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8, 5770cabdff1aSopenharmony_ci hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r); 5771cabdff1aSopenharmony_ci 5772cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1, 5773cabdff1aSopenharmony_ci filt2); 5774cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1, 5775cabdff1aSopenharmony_ci filt2); 5776cabdff1aSopenharmony_ci res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5777cabdff1aSopenharmony_ci tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1, 5778cabdff1aSopenharmony_ci filt2); 5779cabdff1aSopenharmony_ci tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1, 5780cabdff1aSopenharmony_ci filt2); 5781cabdff1aSopenharmony_ci res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0); 5782cabdff1aSopenharmony_ci LW4(dst, stride, tp0, tp1, tp2, tp3); 5783cabdff1aSopenharmony_ci INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 5784cabdff1aSopenharmony_ci res = PCKEV_XORI128_UB(res0, res1); 
5785cabdff1aSopenharmony_ci res = __msa_aver_u_b(res, dst0); 5786cabdff1aSopenharmony_ci ST_W4(res, 0, 1, 2, 3, dst, stride); 5787cabdff1aSopenharmony_ci} 5788