1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavcodec/vp8dsp.h" 22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 23cabdff1aSopenharmony_ci#include "vp8dsp_mips.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_cistatic const uint8_t mc_filt_mask_arr[16 * 3] = { 26cabdff1aSopenharmony_ci /* 8 width cases */ 27cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28cabdff1aSopenharmony_ci /* 4 width cases */ 29cabdff1aSopenharmony_ci 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 30cabdff1aSopenharmony_ci /* 4 width cases */ 31cabdff1aSopenharmony_ci 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 32cabdff1aSopenharmony_ci}; 33cabdff1aSopenharmony_ci 34cabdff1aSopenharmony_cistatic const int8_t subpel_filters_msa[7][8] = { 35cabdff1aSopenharmony_ci {-6, 123, 12, -1, 0, 0, 0, 0}, 36cabdff1aSopenharmony_ci {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ 37cabdff1aSopenharmony_ci {-9, 93, 50, -6, 0, 0, 0, 0}, 38cabdff1aSopenharmony_ci {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ 39cabdff1aSopenharmony_ci {-6, 50, 93, -9, 0, 0, 0, 0}, 40cabdff1aSopenharmony_ci {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ 41cabdff1aSopenharmony_ci {-1, 12, 123, -6, 0, 0, 0, 0}, 42cabdff1aSopenharmony_ci}; 43cabdff1aSopenharmony_ci 44cabdff1aSopenharmony_cistatic const int8_t bilinear_filters_msa[7][2] = { 45cabdff1aSopenharmony_ci {112, 16}, 46cabdff1aSopenharmony_ci {96, 32}, 47cabdff1aSopenharmony_ci {80, 48}, 48cabdff1aSopenharmony_ci {64, 64}, 49cabdff1aSopenharmony_ci {48, 80}, 50cabdff1aSopenharmony_ci {32, 96}, 51cabdff1aSopenharmony_ci {16, 112} 52cabdff1aSopenharmony_ci}; 53cabdff1aSopenharmony_ci 54cabdff1aSopenharmony_ci#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \ 55cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2) \ 56cabdff1aSopenharmony_ci( { \ 57cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m; \ 58cabdff1aSopenharmony_ci v8i16 hz_out_m; \ 59cabdff1aSopenharmony_ci \ 60cabdff1aSopenharmony_ci VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ 61cabdff1aSopenharmony_ci vec0_m, vec1_m, vec2_m); \ 62cabdff1aSopenharmony_ci hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, \ 63cabdff1aSopenharmony_ci filt_h0, filt_h1, filt_h2); \ 64cabdff1aSopenharmony_ci \ 65cabdff1aSopenharmony_ci hz_out_m = __msa_srari_h(hz_out_m, 7); \ 66cabdff1aSopenharmony_ci hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 67cabdff1aSopenharmony_ci \ 68cabdff1aSopenharmony_ci hz_out_m; \ 69cabdff1aSopenharmony_ci} ) 70cabdff1aSopenharmony_ci 71cabdff1aSopenharmony_ci#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 72cabdff1aSopenharmony_ci mask0, mask1, mask2, \ 73cabdff1aSopenharmony_ci filt0, filt1, filt2, \ 74cabdff1aSopenharmony_ci out0, out1) \ 75cabdff1aSopenharmony_ci{ \ 76cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ 77cabdff1aSopenharmony_ci \ 78cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 79cabdff1aSopenharmony_ci DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 80cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 81cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 82cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 83cabdff1aSopenharmony_ci DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ 84cabdff1aSopenharmony_ci} 85cabdff1aSopenharmony_ci 86cabdff1aSopenharmony_ci#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 87cabdff1aSopenharmony_ci mask0, mask1, mask2, \ 88cabdff1aSopenharmony_ci filt0, filt1, filt2, \ 89cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 90cabdff1aSopenharmony_ci{ \ 91cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 92cabdff1aSopenharmony_ci \ 93cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 94cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 95cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 96cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 97cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 98cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 99cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ 100cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ 101cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 102cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 103cabdff1aSopenharmony_ci DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ 104cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 105cabdff1aSopenharmony_ci} 106cabdff1aSopenharmony_ci 107cabdff1aSopenharmony_ci#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ 108cabdff1aSopenharmony_ci( { \ 109cabdff1aSopenharmony_ci v8i16 tmp0; \ 110cabdff1aSopenharmony_ci \ 111cabdff1aSopenharmony_ci tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ 112cabdff1aSopenharmony_ci tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ 113cabdff1aSopenharmony_ci \ 114cabdff1aSopenharmony_ci tmp0; \ 115cabdff1aSopenharmony_ci} ) 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ci#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ 118cabdff1aSopenharmony_ci( { \ 119cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m; \ 120cabdff1aSopenharmony_ci v8i16 hz_out_m; \ 121cabdff1aSopenharmony_ci \ 122cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ 123cabdff1aSopenharmony_ci hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ 124cabdff1aSopenharmony_ci \ 125cabdff1aSopenharmony_ci hz_out_m = __msa_srari_h(hz_out_m, 7); \ 126cabdff1aSopenharmony_ci hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 127cabdff1aSopenharmony_ci \ 128cabdff1aSopenharmony_ci hz_out_m; \ 129cabdff1aSopenharmony_ci} ) 130cabdff1aSopenharmony_ci 131cabdff1aSopenharmony_ci#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 132cabdff1aSopenharmony_ci mask0, mask1, filt0, filt1, \ 133cabdff1aSopenharmony_ci out0, out1) \ 134cabdff1aSopenharmony_ci{ \ 135cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 136cabdff1aSopenharmony_ci \ 137cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 138cabdff1aSopenharmony_ci DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 139cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 140cabdff1aSopenharmony_ci DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 141cabdff1aSopenharmony_ci} 142cabdff1aSopenharmony_ci 143cabdff1aSopenharmony_ci#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 144cabdff1aSopenharmony_ci mask0, mask1, filt0, filt1, \ 145cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 146cabdff1aSopenharmony_ci{ \ 147cabdff1aSopenharmony_ci v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 148cabdff1aSopenharmony_ci \ 149cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 150cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 151cabdff1aSopenharmony_ci DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 152cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 153cabdff1aSopenharmony_ci VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 154cabdff1aSopenharmony_ci VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 155cabdff1aSopenharmony_ci DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 156cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 157cabdff1aSopenharmony_ci} 158cabdff1aSopenharmony_ci 159cabdff1aSopenharmony_cistatic void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, 160cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 161cabdff1aSopenharmony_ci const int8_t *filter) 162cabdff1aSopenharmony_ci{ 163cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 164cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, out; 165cabdff1aSopenharmony_ci v8i16 filt, out0, out1; 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[16]); 168cabdff1aSopenharmony_ci src -= 2; 169cabdff1aSopenharmony_ci 170cabdff1aSopenharmony_ci /* rearranging filter */ 171cabdff1aSopenharmony_ci filt = LD_SH(filter); 172cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 173cabdff1aSopenharmony_ci 174cabdff1aSopenharmony_ci mask1 = mask0 + 2; 175cabdff1aSopenharmony_ci mask2 = mask0 + 4; 176cabdff1aSopenharmony_ci 177cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 178cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 179cabdff1aSopenharmony_ci HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 180cabdff1aSopenharmony_ci filt0, filt1, filt2, out0, out1); 181cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 7); 182cabdff1aSopenharmony_ci SAT_SH2_SH(out0, out1, 7); 183cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 184cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 185cabdff1aSopenharmony_ci} 186cabdff1aSopenharmony_ci 187cabdff1aSopenharmony_cistatic void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, 188cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 189cabdff1aSopenharmony_ci const int8_t *filter) 190cabdff1aSopenharmony_ci{ 191cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 192cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, out; 193cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[16]); 196cabdff1aSopenharmony_ci src -= 2; 197cabdff1aSopenharmony_ci 198cabdff1aSopenharmony_ci /* rearranging filter */ 199cabdff1aSopenharmony_ci filt = LD_SH(filter); 200cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 201cabdff1aSopenharmony_ci 202cabdff1aSopenharmony_ci mask1 = mask0 + 2; 203cabdff1aSopenharmony_ci mask2 = mask0 + 4; 204cabdff1aSopenharmony_ci 205cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 206cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 207cabdff1aSopenharmony_ci src += (4 * src_stride); 208cabdff1aSopenharmony_ci HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 209cabdff1aSopenharmony_ci filt0, filt1, filt2, out0, out1); 210cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 211cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 212cabdff1aSopenharmony_ci HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 213cabdff1aSopenharmony_ci filt0, filt1, filt2, out2, out3); 214cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 215cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 216cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 217cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 218cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 219cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 220cabdff1aSopenharmony_ci} 221cabdff1aSopenharmony_ci 222cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, 223cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 224cabdff1aSopenharmony_ci int height, int mx, int my) 225cabdff1aSopenharmony_ci{ 226cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[mx - 1]; 227cabdff1aSopenharmony_ci 228cabdff1aSopenharmony_ci if (4 == height) { 229cabdff1aSopenharmony_ci common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter); 230cabdff1aSopenharmony_ci } else if (8 == height) { 231cabdff1aSopenharmony_ci common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter); 232cabdff1aSopenharmony_ci } 233cabdff1aSopenharmony_ci} 234cabdff1aSopenharmony_ci 235cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, 236cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 237cabdff1aSopenharmony_ci int height, int mx, int my) 238cabdff1aSopenharmony_ci{ 239cabdff1aSopenharmony_ci uint32_t loop_cnt; 240cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[mx - 1]; 241cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 242cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, tmp0, tmp1; 243cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 244cabdff1aSopenharmony_ci 245cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[0]); 246cabdff1aSopenharmony_ci 247cabdff1aSopenharmony_ci src -= 2; 248cabdff1aSopenharmony_ci 249cabdff1aSopenharmony_ci /* rearranging filter */ 250cabdff1aSopenharmony_ci filt = LD_SH(filter); 251cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 252cabdff1aSopenharmony_ci 253cabdff1aSopenharmony_ci mask1 = mask0 + 2; 254cabdff1aSopenharmony_ci mask2 = mask0 + 4; 255cabdff1aSopenharmony_ci 256cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 257cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 258cabdff1aSopenharmony_ci src += (4 * src_stride); 259cabdff1aSopenharmony_ci HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 260cabdff1aSopenharmony_ci filt0, filt1, filt2, out0, out1, out2, out3); 261cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 262cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 263cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 264cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 265cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 266cabdff1aSopenharmony_ci dst += (4 * dst_stride); 267cabdff1aSopenharmony_ci 268cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { 269cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 270cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 271cabdff1aSopenharmony_ci src += (4 * src_stride); 272cabdff1aSopenharmony_ci HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 273cabdff1aSopenharmony_ci filt0, filt1, filt2, out0, out1, out2, out3); 274cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 275cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 276cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 277cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 278cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 279cabdff1aSopenharmony_ci dst += (4 * dst_stride); 280cabdff1aSopenharmony_ci } 281cabdff1aSopenharmony_ci} 282cabdff1aSopenharmony_ci 283cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, 284cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 285cabdff1aSopenharmony_ci int height, int mx, int my) 286cabdff1aSopenharmony_ci{ 287cabdff1aSopenharmony_ci uint32_t loop_cnt; 288cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[mx - 1]; 289cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; 290cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, out; 291cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 292cabdff1aSopenharmony_ci 293cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[0]); 294cabdff1aSopenharmony_ci src -= 2; 295cabdff1aSopenharmony_ci 296cabdff1aSopenharmony_ci /* rearranging filter */ 297cabdff1aSopenharmony_ci filt = LD_SH(filter); 298cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 299cabdff1aSopenharmony_ci 300cabdff1aSopenharmony_ci mask1 = mask0 + 2; 301cabdff1aSopenharmony_ci mask2 = mask0 + 4; 302cabdff1aSopenharmony_ci 303cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 304cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 305cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 306cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 307cabdff1aSopenharmony_ci src += (4 * src_stride); 308cabdff1aSopenharmony_ci 309cabdff1aSopenharmony_ci HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 310cabdff1aSopenharmony_ci filt0, filt1, filt2, out0, out1, out2, out3); 311cabdff1aSopenharmony_ci HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, 312cabdff1aSopenharmony_ci filt0, filt1, filt2, out4, out5, out6, out7); 313cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 314cabdff1aSopenharmony_ci SRARI_H4_SH(out4, out5, out6, out7, 7); 315cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 316cabdff1aSopenharmony_ci SAT_SH4_SH(out4, out5, out6, out7, 7); 317cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 318cabdff1aSopenharmony_ci ST_UB(out, dst); 319cabdff1aSopenharmony_ci dst += dst_stride; 320cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 321cabdff1aSopenharmony_ci ST_UB(out, dst); 322cabdff1aSopenharmony_ci dst += dst_stride; 323cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out4, out5); 324cabdff1aSopenharmony_ci ST_UB(out, dst); 325cabdff1aSopenharmony_ci dst += dst_stride; 326cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out6, out7); 327cabdff1aSopenharmony_ci ST_UB(out, dst); 328cabdff1aSopenharmony_ci dst += dst_stride; 329cabdff1aSopenharmony_ci } 330cabdff1aSopenharmony_ci} 331cabdff1aSopenharmony_ci 332cabdff1aSopenharmony_civoid ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 333cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 334cabdff1aSopenharmony_ci int height, int mx, int my) 335cabdff1aSopenharmony_ci{ 336cabdff1aSopenharmony_ci uint32_t loop_cnt; 337cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[my - 1]; 338cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 339cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 340cabdff1aSopenharmony_ci v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 341cabdff1aSopenharmony_ci v16u8 out; 342cabdff1aSopenharmony_ci v8i16 filt, out10, out32; 343cabdff1aSopenharmony_ci 344cabdff1aSopenharmony_ci src -= (2 * src_stride); 345cabdff1aSopenharmony_ci 346cabdff1aSopenharmony_ci filt = LD_SH(filter); 347cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 348cabdff1aSopenharmony_ci 349cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 350cabdff1aSopenharmony_ci src += (5 * src_stride); 351cabdff1aSopenharmony_ci 352cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 353cabdff1aSopenharmony_ci src32_r, src43_r); 354cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 355cabdff1aSopenharmony_ci XORI_B2_128_SB(src2110, src4332); 356cabdff1aSopenharmony_ci 357cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 358cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src5, src6, src7, src8); 359cabdff1aSopenharmony_ci src += (4 * src_stride); 360cabdff1aSopenharmony_ci 361cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 362cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 363cabdff1aSopenharmony_ci ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 364cabdff1aSopenharmony_ci XORI_B2_128_SB(src6554, src8776); 365cabdff1aSopenharmony_ci out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 366cabdff1aSopenharmony_ci out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 367cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 7); 368cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 369cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out10, out32); 370cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 371cabdff1aSopenharmony_ci dst += (4 * dst_stride); 372cabdff1aSopenharmony_ci 373cabdff1aSopenharmony_ci src2110 = src6554; 374cabdff1aSopenharmony_ci src4332 = src8776; 375cabdff1aSopenharmony_ci src4 = src8; 376cabdff1aSopenharmony_ci } 377cabdff1aSopenharmony_ci} 378cabdff1aSopenharmony_ci 379cabdff1aSopenharmony_civoid ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 380cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 381cabdff1aSopenharmony_ci int height, int mx, int my) 382cabdff1aSopenharmony_ci{ 383cabdff1aSopenharmony_ci uint32_t loop_cnt; 384cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[my - 1]; 385cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; 386cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 387cabdff1aSopenharmony_ci v16i8 src109_r, filt0, filt1, filt2; 388cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 389cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r; 390cabdff1aSopenharmony_ci 391cabdff1aSopenharmony_ci src -= (2 * src_stride); 392cabdff1aSopenharmony_ci 393cabdff1aSopenharmony_ci filt = LD_SH(filter); 394cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 395cabdff1aSopenharmony_ci 396cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 397cabdff1aSopenharmony_ci src += (5 * src_stride); 398cabdff1aSopenharmony_ci 399cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 400cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, 401cabdff1aSopenharmony_ci src10_r, src32_r, src21_r, src43_r); 402cabdff1aSopenharmony_ci 403cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 404cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 405cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 406cabdff1aSopenharmony_ci src += (4 * src_stride); 407cabdff1aSopenharmony_ci 408cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 409cabdff1aSopenharmony_ci src87_r, src98_r, src109_r); 410cabdff1aSopenharmony_ci out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 411cabdff1aSopenharmony_ci out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 412cabdff1aSopenharmony_ci out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 413cabdff1aSopenharmony_ci out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 414cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 415cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 416cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 417cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 418cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 419cabdff1aSopenharmony_ci dst += (4 * dst_stride); 420cabdff1aSopenharmony_ci 421cabdff1aSopenharmony_ci src10_r = src76_r; 422cabdff1aSopenharmony_ci src32_r = src98_r; 423cabdff1aSopenharmony_ci src21_r = src87_r; 424cabdff1aSopenharmony_ci src43_r = src109_r; 425cabdff1aSopenharmony_ci src4 = src10; 426cabdff1aSopenharmony_ci } 427cabdff1aSopenharmony_ci} 428cabdff1aSopenharmony_ci 429cabdff1aSopenharmony_civoid ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 430cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 431cabdff1aSopenharmony_ci int height, int mx, int my) 432cabdff1aSopenharmony_ci{ 433cabdff1aSopenharmony_ci uint32_t loop_cnt; 434cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[my - 1]; 435cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 436cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 437cabdff1aSopenharmony_ci v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 438cabdff1aSopenharmony_ci v16i8 src65_l, src87_l, filt0, filt1, filt2; 439cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 440cabdff1aSopenharmony_ci v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci src -= (2 * src_stride); 443cabdff1aSopenharmony_ci 444cabdff1aSopenharmony_ci filt = LD_SH(filter); 445cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 446cabdff1aSopenharmony_ci 447cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 448cabdff1aSopenharmony_ci src += (5 * src_stride); 449cabdff1aSopenharmony_ci 450cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 451cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, 452cabdff1aSopenharmony_ci src32_r, src43_r, src21_r); 453cabdff1aSopenharmony_ci ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, 454cabdff1aSopenharmony_ci src32_l, src43_l, src21_l); 455cabdff1aSopenharmony_ci 456cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 457cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src5, src6, src7, src8); 458cabdff1aSopenharmony_ci src += (4 * src_stride); 459cabdff1aSopenharmony_ci 460cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 461cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, 462cabdff1aSopenharmony_ci src65_r, src76_r, src87_r); 463cabdff1aSopenharmony_ci ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, 464cabdff1aSopenharmony_ci src65_l, src76_l, src87_l); 465cabdff1aSopenharmony_ci out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, 466cabdff1aSopenharmony_ci filt2); 467cabdff1aSopenharmony_ci out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, 468cabdff1aSopenharmony_ci filt2); 469cabdff1aSopenharmony_ci out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, 470cabdff1aSopenharmony_ci filt2); 471cabdff1aSopenharmony_ci out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, 472cabdff1aSopenharmony_ci filt2); 473cabdff1aSopenharmony_ci out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, 474cabdff1aSopenharmony_ci filt2); 475cabdff1aSopenharmony_ci out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, 476cabdff1aSopenharmony_ci filt2); 477cabdff1aSopenharmony_ci out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, 478cabdff1aSopenharmony_ci filt2); 479cabdff1aSopenharmony_ci out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, 480cabdff1aSopenharmony_ci filt2); 481cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 482cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); 483cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 484cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 485cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 486cabdff1aSopenharmony_ci out3_r, tmp0, tmp1, tmp2, tmp3); 487cabdff1aSopenharmony_ci XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 488cabdff1aSopenharmony_ci ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 489cabdff1aSopenharmony_ci dst += (4 * dst_stride); 490cabdff1aSopenharmony_ci 491cabdff1aSopenharmony_ci src10_r = src54_r; 492cabdff1aSopenharmony_ci src32_r = src76_r; 493cabdff1aSopenharmony_ci src21_r = src65_r; 494cabdff1aSopenharmony_ci src43_r = src87_r; 495cabdff1aSopenharmony_ci src10_l = src54_l; 496cabdff1aSopenharmony_ci src32_l = src76_l; 497cabdff1aSopenharmony_ci src21_l = src65_l; 498cabdff1aSopenharmony_ci src43_l = src87_l; 499cabdff1aSopenharmony_ci src4 = src8; 500cabdff1aSopenharmony_ci } 501cabdff1aSopenharmony_ci} 502cabdff1aSopenharmony_ci 503cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 504cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 505cabdff1aSopenharmony_ci int height, int mx, int my) 506cabdff1aSopenharmony_ci{ 507cabdff1aSopenharmony_ci uint32_t loop_cnt; 508cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 509cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 510cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 511cabdff1aSopenharmony_ci v16i8 filt_hz0, filt_hz1, filt_hz2; 512cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, out; 513cabdff1aSopenharmony_ci v8i16 tmp0, tmp1; 514cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 515cabdff1aSopenharmony_ci v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; 516cabdff1aSopenharmony_ci 517cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[16]); 518cabdff1aSopenharmony_ci src -= (2 + 2 * src_stride); 519cabdff1aSopenharmony_ci 520cabdff1aSopenharmony_ci /* rearranging filter */ 521cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 522cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 523cabdff1aSopenharmony_ci 524cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 525cabdff1aSopenharmony_ci SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 526cabdff1aSopenharmony_ci 527cabdff1aSopenharmony_ci mask1 = mask0 + 2; 528cabdff1aSopenharmony_ci mask2 = mask0 + 4; 529cabdff1aSopenharmony_ci 530cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 531cabdff1aSopenharmony_ci src += (5 * src_stride); 532cabdff1aSopenharmony_ci 533cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 534cabdff1aSopenharmony_ci hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, 535cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 536cabdff1aSopenharmony_ci hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, 537cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 538cabdff1aSopenharmony_ci hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); 539cabdff1aSopenharmony_ci hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, 540cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 541cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 542cabdff1aSopenharmony_ci 543cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 544cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src5, src6); 545cabdff1aSopenharmony_ci src += (2 * src_stride); 546cabdff1aSopenharmony_ci 547cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 548cabdff1aSopenharmony_ci hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, 549cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 550cabdff1aSopenharmony_ci hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 551cabdff1aSopenharmony_ci 552cabdff1aSopenharmony_ci LD_SB2(src, src_stride, src7, src8); 553cabdff1aSopenharmony_ci src += (2 * src_stride); 554cabdff1aSopenharmony_ci 555cabdff1aSopenharmony_ci XORI_B2_128_SB(src7, src8); 556cabdff1aSopenharmony_ci hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, 557cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 558cabdff1aSopenharmony_ci hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); 559cabdff1aSopenharmony_ci 560cabdff1aSopenharmony_ci out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 561cabdff1aSopenharmony_ci tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 562cabdff1aSopenharmony_ci 563cabdff1aSopenharmony_ci out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 564cabdff1aSopenharmony_ci tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); 565cabdff1aSopenharmony_ci 566cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 7); 567cabdff1aSopenharmony_ci SAT_SH2_SH(tmp0, tmp1, 7); 568cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 569cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 570cabdff1aSopenharmony_ci dst += (4 * dst_stride); 571cabdff1aSopenharmony_ci 572cabdff1aSopenharmony_ci hz_out3 = hz_out7; 573cabdff1aSopenharmony_ci out0 = out2; 574cabdff1aSopenharmony_ci out1 = out3; 575cabdff1aSopenharmony_ci } 576cabdff1aSopenharmony_ci} 577cabdff1aSopenharmony_ci 578cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 579cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 580cabdff1aSopenharmony_ci int height, int mx, int my) 581cabdff1aSopenharmony_ci{ 582cabdff1aSopenharmony_ci uint32_t loop_cnt; 583cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 584cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 585cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 586cabdff1aSopenharmony_ci v16i8 filt_hz0, filt_hz1, filt_hz2; 587cabdff1aSopenharmony_ci v16u8 mask0, mask1, mask2, vec0, vec1; 588cabdff1aSopenharmony_ci v8i16 filt, filt_vt0, filt_vt1, filt_vt2; 589cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 590cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; 591cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3; 592cabdff1aSopenharmony_ci 593cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[0]); 594cabdff1aSopenharmony_ci src -= (2 + 2 * src_stride); 595cabdff1aSopenharmony_ci 596cabdff1aSopenharmony_ci /* rearranging filter */ 597cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 598cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 599cabdff1aSopenharmony_ci 600cabdff1aSopenharmony_ci mask1 = mask0 + 2; 601cabdff1aSopenharmony_ci mask2 = mask0 + 4; 602cabdff1aSopenharmony_ci 603cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 604cabdff1aSopenharmony_ci src += (5 * src_stride); 605cabdff1aSopenharmony_ci 606cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 607cabdff1aSopenharmony_ci hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, 608cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 609cabdff1aSopenharmony_ci hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, 610cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 611cabdff1aSopenharmony_ci hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, 612cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 613cabdff1aSopenharmony_ci hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, 614cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 615cabdff1aSopenharmony_ci hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, 616cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 617cabdff1aSopenharmony_ci 618cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 619cabdff1aSopenharmony_ci SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 622cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); 623cabdff1aSopenharmony_ci 624cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 625cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src5, src6, src7, src8); 626cabdff1aSopenharmony_ci src += (4 * src_stride); 627cabdff1aSopenharmony_ci 628cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 629cabdff1aSopenharmony_ci hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 630cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 631cabdff1aSopenharmony_ci out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 632cabdff1aSopenharmony_ci tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 633cabdff1aSopenharmony_ci 634cabdff1aSopenharmony_ci hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 635cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 636cabdff1aSopenharmony_ci out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5); 637cabdff1aSopenharmony_ci tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 638cabdff1aSopenharmony_ci 639cabdff1aSopenharmony_ci hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, 640cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 641cabdff1aSopenharmony_ci out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 642cabdff1aSopenharmony_ci tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); 643cabdff1aSopenharmony_ci 644cabdff1aSopenharmony_ci hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, 645cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 646cabdff1aSopenharmony_ci out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); 647cabdff1aSopenharmony_ci tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); 648cabdff1aSopenharmony_ci 649cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 650cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 651cabdff1aSopenharmony_ci vec0 = PCKEV_XORI128_UB(tmp0, tmp1); 652cabdff1aSopenharmony_ci vec1 = PCKEV_XORI128_UB(tmp2, tmp3); 653cabdff1aSopenharmony_ci ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride); 654cabdff1aSopenharmony_ci dst += (4 * dst_stride); 655cabdff1aSopenharmony_ci 656cabdff1aSopenharmony_ci hz_out4 = hz_out8; 657cabdff1aSopenharmony_ci out0 = out2; 658cabdff1aSopenharmony_ci out1 = out7; 659cabdff1aSopenharmony_ci out3 = out5; 660cabdff1aSopenharmony_ci out4 = out6; 661cabdff1aSopenharmony_ci } 662cabdff1aSopenharmony_ci} 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_ci 665cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 666cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 667cabdff1aSopenharmony_ci int height, int mx, int my) 668cabdff1aSopenharmony_ci{ 669cabdff1aSopenharmony_ci int32_t multiple8_cnt; 670cabdff1aSopenharmony_ci 671cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 672cabdff1aSopenharmony_ci ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height, 673cabdff1aSopenharmony_ci mx, my); 674cabdff1aSopenharmony_ci 675cabdff1aSopenharmony_ci src += 8; 676cabdff1aSopenharmony_ci dst += 8; 677cabdff1aSopenharmony_ci } 678cabdff1aSopenharmony_ci} 679cabdff1aSopenharmony_ci 680cabdff1aSopenharmony_cistatic void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, 681cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 682cabdff1aSopenharmony_ci const int8_t *filter) 683cabdff1aSopenharmony_ci{ 684cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 685cabdff1aSopenharmony_ci v8i16 filt, out0, out1; 686cabdff1aSopenharmony_ci v16u8 out; 687cabdff1aSopenharmony_ci 688cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[16]); 689cabdff1aSopenharmony_ci src -= 1; 690cabdff1aSopenharmony_ci 691cabdff1aSopenharmony_ci /* rearranging filter */ 692cabdff1aSopenharmony_ci filt = LD_SH(filter); 693cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 694cabdff1aSopenharmony_ci 695cabdff1aSopenharmony_ci mask1 = mask0 + 2; 696cabdff1aSopenharmony_ci 697cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 698cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 699cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 700cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 701cabdff1aSopenharmony_ci SRARI_H2_SH(out0, out1, 7); 702cabdff1aSopenharmony_ci SAT_SH2_SH(out0, out1, 7); 703cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 704cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 705cabdff1aSopenharmony_ci} 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_cistatic void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, 708cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 709cabdff1aSopenharmony_ci const int8_t *filter) 710cabdff1aSopenharmony_ci{ 711cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 712cabdff1aSopenharmony_ci v16u8 out; 713cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[16]); 716cabdff1aSopenharmony_ci src -= 1; 717cabdff1aSopenharmony_ci 718cabdff1aSopenharmony_ci /* rearranging filter */ 719cabdff1aSopenharmony_ci filt = LD_SH(filter); 720cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 721cabdff1aSopenharmony_ci 722cabdff1aSopenharmony_ci mask1 = mask0 + 2; 723cabdff1aSopenharmony_ci 724cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 725cabdff1aSopenharmony_ci src += (4 * src_stride); 726cabdff1aSopenharmony_ci 727cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 728cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 729cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 730cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 731cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 732cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 733cabdff1aSopenharmony_ci filt0, filt1, out2, out3); 734cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 735cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 736cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 737cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 738cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 739cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 740cabdff1aSopenharmony_ci} 741cabdff1aSopenharmony_ci 742cabdff1aSopenharmony_cistatic void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, 743cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 744cabdff1aSopenharmony_ci const int8_t *filter) 745cabdff1aSopenharmony_ci{ 746cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 747cabdff1aSopenharmony_ci v16i8 filt0, filt1, mask0, mask1; 748cabdff1aSopenharmony_ci v16u8 out; 749cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 750cabdff1aSopenharmony_ci 751cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[16]); 752cabdff1aSopenharmony_ci src -= 1; 753cabdff1aSopenharmony_ci 754cabdff1aSopenharmony_ci /* rearranging filter */ 755cabdff1aSopenharmony_ci filt = LD_SH(filter); 756cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 757cabdff1aSopenharmony_ci 758cabdff1aSopenharmony_ci mask1 = mask0 + 2; 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 761cabdff1aSopenharmony_ci src += (8 * src_stride); 762cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 763cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 764cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 765cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, 766cabdff1aSopenharmony_ci filt0, filt1, out2, out3); 767cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 768cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 769cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 770cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 771cabdff1aSopenharmony_ci dst += (4 * dst_stride); 772cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 773cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 774cabdff1aSopenharmony_ci dst += (4 * dst_stride); 775cabdff1aSopenharmony_ci 776cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 777cabdff1aSopenharmony_ci src += (8 * src_stride); 778cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 779cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 780cabdff1aSopenharmony_ci filt0, filt1, out0, out1); 781cabdff1aSopenharmony_ci HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, 782cabdff1aSopenharmony_ci filt0, filt1, out2, out3); 783cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 784cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 785cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 786cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 787cabdff1aSopenharmony_ci dst += (4 * dst_stride); 788cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 789cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 790cabdff1aSopenharmony_ci} 791cabdff1aSopenharmony_ci 792cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, 793cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 794cabdff1aSopenharmony_ci int height, int mx, int my) 795cabdff1aSopenharmony_ci{ 796cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[mx - 1]; 797cabdff1aSopenharmony_ci 798cabdff1aSopenharmony_ci if (4 == height) { 799cabdff1aSopenharmony_ci common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); 800cabdff1aSopenharmony_ci } else if (8 == height) { 801cabdff1aSopenharmony_ci common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter); 802cabdff1aSopenharmony_ci } else if (16 == height) { 803cabdff1aSopenharmony_ci common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter); 804cabdff1aSopenharmony_ci } 805cabdff1aSopenharmony_ci} 806cabdff1aSopenharmony_ci 807cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, 808cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 809cabdff1aSopenharmony_ci int height, int mx, int my) 810cabdff1aSopenharmony_ci{ 811cabdff1aSopenharmony_ci uint32_t loop_cnt; 812cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[mx - 1]; 813cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 814cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 815cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3; 816cabdff1aSopenharmony_ci 817cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[0]); 818cabdff1aSopenharmony_ci src -= 1; 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ci /* rearranging filter */ 821cabdff1aSopenharmony_ci filt = LD_SH(filter); 822cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 823cabdff1aSopenharmony_ci 824cabdff1aSopenharmony_ci mask1 = mask0 + 2; 825cabdff1aSopenharmony_ci 826cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 827cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 828cabdff1aSopenharmony_ci src += (4 * src_stride); 829cabdff1aSopenharmony_ci 830cabdff1aSopenharmony_ci XORI_B4_128_SB(src0, src1, src2, src3); 831cabdff1aSopenharmony_ci HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 832cabdff1aSopenharmony_ci filt1, out0, out1, out2, out3); 833cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 834cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 835cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0, out1); 836cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2, out3); 837cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 838cabdff1aSopenharmony_ci dst += (4 * dst_stride); 839cabdff1aSopenharmony_ci } 840cabdff1aSopenharmony_ci} 841cabdff1aSopenharmony_ci 842cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, 843cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 844cabdff1aSopenharmony_ci int height, int mx, int my) 845cabdff1aSopenharmony_ci{ 846cabdff1aSopenharmony_ci uint32_t loop_cnt; 847cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[mx - 1]; 848cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 849cabdff1aSopenharmony_ci v16i8 filt0, filt1, mask0, mask1; 850cabdff1aSopenharmony_ci v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 851cabdff1aSopenharmony_ci v16u8 out; 852cabdff1aSopenharmony_ci 853cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[0]); 854cabdff1aSopenharmony_ci src -= 1; 855cabdff1aSopenharmony_ci 856cabdff1aSopenharmony_ci /* rearranging filter */ 857cabdff1aSopenharmony_ci filt = LD_SH(filter); 858cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 859cabdff1aSopenharmony_ci 860cabdff1aSopenharmony_ci mask1 = mask0 + 2; 861cabdff1aSopenharmony_ci 862cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 863cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 864cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 865cabdff1aSopenharmony_ci src += (4 * src_stride); 866cabdff1aSopenharmony_ci 867cabdff1aSopenharmony_ci XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 868cabdff1aSopenharmony_ci HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 869cabdff1aSopenharmony_ci filt1, out0, out1, out2, out3); 870cabdff1aSopenharmony_ci HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, 871cabdff1aSopenharmony_ci filt1, out4, out5, out6, out7); 872cabdff1aSopenharmony_ci SRARI_H4_SH(out0, out1, out2, out3, 7); 873cabdff1aSopenharmony_ci SRARI_H4_SH(out4, out5, out6, out7, 7); 874cabdff1aSopenharmony_ci SAT_SH4_SH(out0, out1, out2, out3, 7); 875cabdff1aSopenharmony_ci SAT_SH4_SH(out4, out5, out6, out7, 7); 876cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out0, out1); 877cabdff1aSopenharmony_ci ST_UB(out, dst); 878cabdff1aSopenharmony_ci dst += dst_stride; 879cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out2, out3); 880cabdff1aSopenharmony_ci ST_UB(out, dst); 881cabdff1aSopenharmony_ci dst += dst_stride; 882cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out4, out5); 883cabdff1aSopenharmony_ci ST_UB(out, dst); 884cabdff1aSopenharmony_ci dst += dst_stride; 885cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out6, out7); 886cabdff1aSopenharmony_ci ST_UB(out, dst); 887cabdff1aSopenharmony_ci dst += dst_stride; 888cabdff1aSopenharmony_ci } 889cabdff1aSopenharmony_ci} 890cabdff1aSopenharmony_ci 891cabdff1aSopenharmony_civoid ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 892cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 893cabdff1aSopenharmony_ci int height, int mx, int my) 894cabdff1aSopenharmony_ci{ 895cabdff1aSopenharmony_ci uint32_t loop_cnt; 896cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[my - 1]; 897cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5; 898cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 899cabdff1aSopenharmony_ci v16i8 src2110, src4332, filt0, filt1; 900cabdff1aSopenharmony_ci v8i16 filt, out10, out32; 901cabdff1aSopenharmony_ci v16u8 out; 902cabdff1aSopenharmony_ci 903cabdff1aSopenharmony_ci src -= src_stride; 904cabdff1aSopenharmony_ci 905cabdff1aSopenharmony_ci filt = LD_SH(filter); 906cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 907cabdff1aSopenharmony_ci 908cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 909cabdff1aSopenharmony_ci src += (3 * src_stride); 910cabdff1aSopenharmony_ci 911cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 912cabdff1aSopenharmony_ci 913cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 914cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 915cabdff1aSopenharmony_ci 916cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 917cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src3, src4, src5); 918cabdff1aSopenharmony_ci src += (3 * src_stride); 919cabdff1aSopenharmony_ci ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 920cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); 921cabdff1aSopenharmony_ci src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); 922cabdff1aSopenharmony_ci out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); 923cabdff1aSopenharmony_ci 924cabdff1aSopenharmony_ci src2 = LD_SB(src); 925cabdff1aSopenharmony_ci src += (src_stride); 926cabdff1aSopenharmony_ci ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r); 927cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); 928cabdff1aSopenharmony_ci src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 929cabdff1aSopenharmony_ci out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); 930cabdff1aSopenharmony_ci SRARI_H2_SH(out10, out32, 7); 931cabdff1aSopenharmony_ci SAT_SH2_SH(out10, out32, 7); 932cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(out10, out32); 933cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 934cabdff1aSopenharmony_ci dst += (4 * dst_stride); 935cabdff1aSopenharmony_ci } 936cabdff1aSopenharmony_ci} 937cabdff1aSopenharmony_ci 938cabdff1aSopenharmony_civoid ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 939cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 940cabdff1aSopenharmony_ci int height, int mx, int my) 941cabdff1aSopenharmony_ci{ 942cabdff1aSopenharmony_ci uint32_t loop_cnt; 943cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[my - 1]; 944cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src7, src8, src9, src10; 945cabdff1aSopenharmony_ci v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; 946cabdff1aSopenharmony_ci v16u8 tmp0, tmp1; 947cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r; 948cabdff1aSopenharmony_ci 949cabdff1aSopenharmony_ci src -= src_stride; 950cabdff1aSopenharmony_ci 951cabdff1aSopenharmony_ci filt = LD_SH(filter); 952cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 953cabdff1aSopenharmony_ci 954cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 955cabdff1aSopenharmony_ci src += (3 * src_stride); 956cabdff1aSopenharmony_ci 957cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 958cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 959cabdff1aSopenharmony_ci 960cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 961cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src7, src8, src9, src10); 962cabdff1aSopenharmony_ci src += (4 * src_stride); 963cabdff1aSopenharmony_ci 964cabdff1aSopenharmony_ci XORI_B4_128_SB(src7, src8, src9, src10); 965cabdff1aSopenharmony_ci ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, 966cabdff1aSopenharmony_ci src72_r, src87_r, src98_r, src109_r); 967cabdff1aSopenharmony_ci out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1); 968cabdff1aSopenharmony_ci out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1); 969cabdff1aSopenharmony_ci out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1); 970cabdff1aSopenharmony_ci out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); 971cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 972cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 973cabdff1aSopenharmony_ci tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 974cabdff1aSopenharmony_ci tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 975cabdff1aSopenharmony_ci ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 976cabdff1aSopenharmony_ci dst += (4 * dst_stride); 977cabdff1aSopenharmony_ci 978cabdff1aSopenharmony_ci src10_r = src98_r; 979cabdff1aSopenharmony_ci src21_r = src109_r; 980cabdff1aSopenharmony_ci src2 = src10; 981cabdff1aSopenharmony_ci } 982cabdff1aSopenharmony_ci} 983cabdff1aSopenharmony_ci 984cabdff1aSopenharmony_civoid ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 985cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 986cabdff1aSopenharmony_ci int height, int mx, int my) 987cabdff1aSopenharmony_ci{ 988cabdff1aSopenharmony_ci uint32_t loop_cnt; 989cabdff1aSopenharmony_ci const int8_t *filter = subpel_filters_msa[my - 1]; 990cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 991cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; 992cabdff1aSopenharmony_ci v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; 993cabdff1aSopenharmony_ci v16u8 tmp0, tmp1, tmp2, tmp3; 994cabdff1aSopenharmony_ci v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 995cabdff1aSopenharmony_ci 996cabdff1aSopenharmony_ci src -= src_stride; 997cabdff1aSopenharmony_ci 998cabdff1aSopenharmony_ci filt = LD_SH(filter); 999cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 1000cabdff1aSopenharmony_ci 1001cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 1002cabdff1aSopenharmony_ci src += (3 * src_stride); 1003cabdff1aSopenharmony_ci 1004cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1005cabdff1aSopenharmony_ci ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 1006cabdff1aSopenharmony_ci ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 1007cabdff1aSopenharmony_ci 1008cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1009cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 1010cabdff1aSopenharmony_ci src += (4 * src_stride); 1011cabdff1aSopenharmony_ci 1012cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 1013cabdff1aSopenharmony_ci ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 1014cabdff1aSopenharmony_ci src32_r, src43_r, src54_r, src65_r); 1015cabdff1aSopenharmony_ci ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 1016cabdff1aSopenharmony_ci src32_l, src43_l, src54_l, src65_l); 1017cabdff1aSopenharmony_ci out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); 1018cabdff1aSopenharmony_ci out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); 1019cabdff1aSopenharmony_ci out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1); 1020cabdff1aSopenharmony_ci out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1); 1021cabdff1aSopenharmony_ci out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); 1022cabdff1aSopenharmony_ci out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); 1023cabdff1aSopenharmony_ci out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); 1024cabdff1aSopenharmony_ci out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); 1025cabdff1aSopenharmony_ci SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1026cabdff1aSopenharmony_ci SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1027cabdff1aSopenharmony_ci SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 1028cabdff1aSopenharmony_ci SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 1029cabdff1aSopenharmony_ci PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, 1030cabdff1aSopenharmony_ci out3_r, tmp0, tmp1, tmp2, tmp3); 1031cabdff1aSopenharmony_ci XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 1032cabdff1aSopenharmony_ci ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 1033cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1034cabdff1aSopenharmony_ci 1035cabdff1aSopenharmony_ci src10_r = src54_r; 1036cabdff1aSopenharmony_ci src21_r = src65_r; 1037cabdff1aSopenharmony_ci src10_l = src54_l; 1038cabdff1aSopenharmony_ci src21_l = src65_l; 1039cabdff1aSopenharmony_ci src2 = src6; 1040cabdff1aSopenharmony_ci } 1041cabdff1aSopenharmony_ci} 1042cabdff1aSopenharmony_ci 1043cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1044cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1045cabdff1aSopenharmony_ci int height, int mx, int my) 1046cabdff1aSopenharmony_ci{ 1047cabdff1aSopenharmony_ci uint32_t loop_cnt; 1048cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1049cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1050cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; 1051cabdff1aSopenharmony_ci v16u8 mask0, mask1, out; 1052cabdff1aSopenharmony_ci v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; 1053cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; 1054cabdff1aSopenharmony_ci 1055cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[16]); 1056cabdff1aSopenharmony_ci src -= (1 + 1 * src_stride); 1057cabdff1aSopenharmony_ci 1058cabdff1aSopenharmony_ci /* rearranging filter */ 1059cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 1060cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1061cabdff1aSopenharmony_ci 1062cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1063cabdff1aSopenharmony_ci 1064cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 1065cabdff1aSopenharmony_ci src += (3 * src_stride); 1066cabdff1aSopenharmony_ci 1067cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1068cabdff1aSopenharmony_ci hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); 1069cabdff1aSopenharmony_ci hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); 1070cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1071cabdff1aSopenharmony_ci 1072cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 1073cabdff1aSopenharmony_ci SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1074cabdff1aSopenharmony_ci 1075cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1076cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 1077cabdff1aSopenharmony_ci src += (4 * src_stride); 1078cabdff1aSopenharmony_ci 1079cabdff1aSopenharmony_ci XORI_B2_128_SB(src3, src4); 1080cabdff1aSopenharmony_ci hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); 1081cabdff1aSopenharmony_ci hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); 1082cabdff1aSopenharmony_ci vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1083cabdff1aSopenharmony_ci tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1084cabdff1aSopenharmony_ci 1085cabdff1aSopenharmony_ci XORI_B2_128_SB(src5, src6); 1086cabdff1aSopenharmony_ci hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); 1087cabdff1aSopenharmony_ci hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 1088cabdff1aSopenharmony_ci vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1089cabdff1aSopenharmony_ci tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1090cabdff1aSopenharmony_ci 1091cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 7); 1092cabdff1aSopenharmony_ci SAT_SH2_SH(tmp0, tmp1, 7); 1093cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 1094cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1095cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1096cabdff1aSopenharmony_ci 1097cabdff1aSopenharmony_ci hz_out1 = hz_out5; 1098cabdff1aSopenharmony_ci vec0 = vec2; 1099cabdff1aSopenharmony_ci } 1100cabdff1aSopenharmony_ci} 1101cabdff1aSopenharmony_ci 1102cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1103cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1104cabdff1aSopenharmony_ci int height, int mx, int my) 1105cabdff1aSopenharmony_ci{ 1106cabdff1aSopenharmony_ci uint32_t loop_cnt; 1107cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1108cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1109cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; 1110cabdff1aSopenharmony_ci v16u8 mask0, mask1, out0, out1; 1111cabdff1aSopenharmony_ci v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; 1112cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3; 1113cabdff1aSopenharmony_ci v8i16 vec0, vec1, vec2, vec3, vec4; 1114cabdff1aSopenharmony_ci 1115cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[0]); 1116cabdff1aSopenharmony_ci src -= (1 + 1 * src_stride); 1117cabdff1aSopenharmony_ci 1118cabdff1aSopenharmony_ci /* rearranging filter */ 1119cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 1120cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1121cabdff1aSopenharmony_ci 1122cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1123cabdff1aSopenharmony_ci 1124cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 1125cabdff1aSopenharmony_ci src += (3 * src_stride); 1126cabdff1aSopenharmony_ci 1127cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1128cabdff1aSopenharmony_ci hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); 1129cabdff1aSopenharmony_ci hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); 1130cabdff1aSopenharmony_ci hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); 1131cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); 1132cabdff1aSopenharmony_ci 1133cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 1134cabdff1aSopenharmony_ci SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1135cabdff1aSopenharmony_ci 1136cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1137cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 1138cabdff1aSopenharmony_ci src += (4 * src_stride); 1139cabdff1aSopenharmony_ci 1140cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 1141cabdff1aSopenharmony_ci hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); 1142cabdff1aSopenharmony_ci vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1143cabdff1aSopenharmony_ci tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1144cabdff1aSopenharmony_ci 1145cabdff1aSopenharmony_ci hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); 1146cabdff1aSopenharmony_ci vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3); 1147cabdff1aSopenharmony_ci tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); 1148cabdff1aSopenharmony_ci 1149cabdff1aSopenharmony_ci hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); 1150cabdff1aSopenharmony_ci vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1151cabdff1aSopenharmony_ci tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1); 1152cabdff1aSopenharmony_ci 1153cabdff1aSopenharmony_ci hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); 1154cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); 1155cabdff1aSopenharmony_ci tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1156cabdff1aSopenharmony_ci 1157cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1158cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1159cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 1160cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 1161cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1162cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1163cabdff1aSopenharmony_ci 1164cabdff1aSopenharmony_ci vec0 = vec4; 1165cabdff1aSopenharmony_ci vec2 = vec1; 1166cabdff1aSopenharmony_ci } 1167cabdff1aSopenharmony_ci} 1168cabdff1aSopenharmony_ci 1169cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1170cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1171cabdff1aSopenharmony_ci int height, int mx, int my) 1172cabdff1aSopenharmony_ci{ 1173cabdff1aSopenharmony_ci int32_t multiple8_cnt; 1174cabdff1aSopenharmony_ci 1175cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 1176cabdff1aSopenharmony_ci ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height, 1177cabdff1aSopenharmony_ci mx, my); 1178cabdff1aSopenharmony_ci 1179cabdff1aSopenharmony_ci src += 8; 1180cabdff1aSopenharmony_ci dst += 8; 1181cabdff1aSopenharmony_ci } 1182cabdff1aSopenharmony_ci} 1183cabdff1aSopenharmony_ci 1184cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1185cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1186cabdff1aSopenharmony_ci int height, int mx, int my) 1187cabdff1aSopenharmony_ci{ 1188cabdff1aSopenharmony_ci uint32_t loop_cnt; 1189cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1190cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1191cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 1192cabdff1aSopenharmony_ci v16i8 filt_hz0, filt_hz1, filt_hz2; 1193cabdff1aSopenharmony_ci v16u8 res0, res1, mask0, mask1, mask2; 1194cabdff1aSopenharmony_ci v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; 1195cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; 1196cabdff1aSopenharmony_ci 1197cabdff1aSopenharmony_ci mask0 = LD_UB(&mc_filt_mask_arr[16]); 1198cabdff1aSopenharmony_ci src -= (2 + 1 * src_stride); 1199cabdff1aSopenharmony_ci 1200cabdff1aSopenharmony_ci /* rearranging filter */ 1201cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 1202cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 1203cabdff1aSopenharmony_ci 1204cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1205cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1206cabdff1aSopenharmony_ci 1207cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 1208cabdff1aSopenharmony_ci src += (3 * src_stride); 1209cabdff1aSopenharmony_ci 1210cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1211cabdff1aSopenharmony_ci hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, 1212cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1213cabdff1aSopenharmony_ci hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, 1214cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1215cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1216cabdff1aSopenharmony_ci 1217cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 1218cabdff1aSopenharmony_ci SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1219cabdff1aSopenharmony_ci 1220cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1221cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 1222cabdff1aSopenharmony_ci src += (4 * src_stride); 1223cabdff1aSopenharmony_ci 1224cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 1225cabdff1aSopenharmony_ci hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, 1226cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1227cabdff1aSopenharmony_ci hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); 1228cabdff1aSopenharmony_ci vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1229cabdff1aSopenharmony_ci tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, 1232cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1233cabdff1aSopenharmony_ci hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 1234cabdff1aSopenharmony_ci vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1235cabdff1aSopenharmony_ci tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1236cabdff1aSopenharmony_ci 1237cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 7); 1238cabdff1aSopenharmony_ci SAT_SH2_SH(tmp0, tmp1, 7); 1239cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); 1240cabdff1aSopenharmony_ci XORI_B2_128_UB(res0, res1); 1241cabdff1aSopenharmony_ci ST_W2(res0, 0, 1, dst, dst_stride); 1242cabdff1aSopenharmony_ci ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1243cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1244cabdff1aSopenharmony_ci 1245cabdff1aSopenharmony_ci hz_out1 = hz_out5; 1246cabdff1aSopenharmony_ci vec0 = vec2; 1247cabdff1aSopenharmony_ci } 1248cabdff1aSopenharmony_ci} 1249cabdff1aSopenharmony_ci 1250cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1251cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1252cabdff1aSopenharmony_ci int height, int mx, int my) 1253cabdff1aSopenharmony_ci{ 1254cabdff1aSopenharmony_ci uint32_t loop_cnt; 1255cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1256cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1257cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6; 1258cabdff1aSopenharmony_ci v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; 1259cabdff1aSopenharmony_ci v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; 1260cabdff1aSopenharmony_ci v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; 1261cabdff1aSopenharmony_ci v16u8 out0, out1; 1262cabdff1aSopenharmony_ci 1263cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[0]); 1264cabdff1aSopenharmony_ci src -= (2 + src_stride); 1265cabdff1aSopenharmony_ci 1266cabdff1aSopenharmony_ci /* rearranging filter */ 1267cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 1268cabdff1aSopenharmony_ci SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 1269cabdff1aSopenharmony_ci 1270cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1271cabdff1aSopenharmony_ci mask2 = mask0 + 4; 1272cabdff1aSopenharmony_ci 1273cabdff1aSopenharmony_ci LD_SB3(src, src_stride, src0, src1, src2); 1274cabdff1aSopenharmony_ci src += (3 * src_stride); 1275cabdff1aSopenharmony_ci 1276cabdff1aSopenharmony_ci XORI_B3_128_SB(src0, src1, src2); 1277cabdff1aSopenharmony_ci hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, 1278cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1279cabdff1aSopenharmony_ci hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, 1280cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1281cabdff1aSopenharmony_ci hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, 1282cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1283cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); 1284cabdff1aSopenharmony_ci 1285cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 1286cabdff1aSopenharmony_ci SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1287cabdff1aSopenharmony_ci 1288cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1289cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src3, src4, src5, src6); 1290cabdff1aSopenharmony_ci src += (4 * src_stride); 1291cabdff1aSopenharmony_ci 1292cabdff1aSopenharmony_ci XORI_B4_128_SB(src3, src4, src5, src6); 1293cabdff1aSopenharmony_ci 1294cabdff1aSopenharmony_ci hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, 1295cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1296cabdff1aSopenharmony_ci vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2); 1297cabdff1aSopenharmony_ci tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1298cabdff1aSopenharmony_ci 1299cabdff1aSopenharmony_ci hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, 1300cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1301cabdff1aSopenharmony_ci vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3); 1302cabdff1aSopenharmony_ci tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); 1303cabdff1aSopenharmony_ci 1304cabdff1aSopenharmony_ci hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 1305cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1306cabdff1aSopenharmony_ci vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 1307cabdff1aSopenharmony_ci tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1); 1308cabdff1aSopenharmony_ci 1309cabdff1aSopenharmony_ci hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 1310cabdff1aSopenharmony_ci filt_hz1, filt_hz2); 1311cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); 1312cabdff1aSopenharmony_ci tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1313cabdff1aSopenharmony_ci 1314cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1315cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1316cabdff1aSopenharmony_ci out0 = PCKEV_XORI128_UB(tmp0, tmp1); 1317cabdff1aSopenharmony_ci out1 = PCKEV_XORI128_UB(tmp2, tmp3); 1318cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1319cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1320cabdff1aSopenharmony_ci } 1321cabdff1aSopenharmony_ci} 1322cabdff1aSopenharmony_ci 1323cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, 1324cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1325cabdff1aSopenharmony_ci int height, int mx, int my) 1326cabdff1aSopenharmony_ci{ 1327cabdff1aSopenharmony_ci int32_t multiple8_cnt; 1328cabdff1aSopenharmony_ci 1329cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 1330cabdff1aSopenharmony_ci ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height, 1331cabdff1aSopenharmony_ci mx, my); 1332cabdff1aSopenharmony_ci 1333cabdff1aSopenharmony_ci src += 8; 1334cabdff1aSopenharmony_ci dst += 8; 1335cabdff1aSopenharmony_ci } 1336cabdff1aSopenharmony_ci} 1337cabdff1aSopenharmony_ci 1338cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 1339cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1340cabdff1aSopenharmony_ci int height, int mx, int my) 1341cabdff1aSopenharmony_ci{ 1342cabdff1aSopenharmony_ci uint32_t loop_cnt; 1343cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1344cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1345cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1346cabdff1aSopenharmony_ci v16i8 filt_hz0, filt_hz1, mask0, mask1; 1347cabdff1aSopenharmony_ci v16u8 out; 1348cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1349cabdff1aSopenharmony_ci v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3; 1350cabdff1aSopenharmony_ci v8i16 filt, filt_vt0, filt_vt1, filt_vt2; 1351cabdff1aSopenharmony_ci 1352cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[16]); 1353cabdff1aSopenharmony_ci 1354cabdff1aSopenharmony_ci src -= (1 + 2 * src_stride); 1355cabdff1aSopenharmony_ci 1356cabdff1aSopenharmony_ci /* rearranging filter */ 1357cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 1358cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1359cabdff1aSopenharmony_ci 1360cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1361cabdff1aSopenharmony_ci 1362cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1363cabdff1aSopenharmony_ci src += (5 * src_stride); 1364cabdff1aSopenharmony_ci 1365cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1366cabdff1aSopenharmony_ci hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); 1367cabdff1aSopenharmony_ci hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); 1368cabdff1aSopenharmony_ci hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); 1369cabdff1aSopenharmony_ci hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); 1370cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 1371cabdff1aSopenharmony_ci 1372cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 1373cabdff1aSopenharmony_ci SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 1374cabdff1aSopenharmony_ci 1375cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1376cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src5, src6, src7, src8); 1377cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1378cabdff1aSopenharmony_ci src += (4 * src_stride); 1379cabdff1aSopenharmony_ci 1380cabdff1aSopenharmony_ci hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); 1381cabdff1aSopenharmony_ci hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8); 1382cabdff1aSopenharmony_ci out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1383cabdff1aSopenharmony_ci tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 1384cabdff1aSopenharmony_ci 1385cabdff1aSopenharmony_ci hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); 1386cabdff1aSopenharmony_ci hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8); 1387cabdff1aSopenharmony_ci out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 1388cabdff1aSopenharmony_ci tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); 1389cabdff1aSopenharmony_ci 1390cabdff1aSopenharmony_ci SRARI_H2_SH(tmp0, tmp1, 7); 1391cabdff1aSopenharmony_ci SAT_SH2_SH(tmp0, tmp1, 7); 1392cabdff1aSopenharmony_ci out = PCKEV_XORI128_UB(tmp0, tmp1); 1393cabdff1aSopenharmony_ci ST_W4(out, 0, 1, 2, 3, dst, dst_stride); 1394cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1395cabdff1aSopenharmony_ci 1396cabdff1aSopenharmony_ci hz_out3 = hz_out7; 1397cabdff1aSopenharmony_ci out0 = out2; 1398cabdff1aSopenharmony_ci out1 = out3; 1399cabdff1aSopenharmony_ci } 1400cabdff1aSopenharmony_ci} 1401cabdff1aSopenharmony_ci 1402cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 1403cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1404cabdff1aSopenharmony_ci int height, int mx, int my) 1405cabdff1aSopenharmony_ci{ 1406cabdff1aSopenharmony_ci uint32_t loop_cnt; 1407cabdff1aSopenharmony_ci const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; 1408cabdff1aSopenharmony_ci const int8_t *filter_vert = subpel_filters_msa[my - 1]; 1409cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1410cabdff1aSopenharmony_ci v16i8 filt_hz0, filt_hz1, mask0, mask1; 1411cabdff1aSopenharmony_ci v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; 1412cabdff1aSopenharmony_ci v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1413cabdff1aSopenharmony_ci v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; 1414cabdff1aSopenharmony_ci v16u8 vec0, vec1; 1415cabdff1aSopenharmony_ci 1416cabdff1aSopenharmony_ci mask0 = LD_SB(&mc_filt_mask_arr[0]); 1417cabdff1aSopenharmony_ci src -= (1 + 2 * src_stride); 1418cabdff1aSopenharmony_ci 1419cabdff1aSopenharmony_ci /* rearranging filter */ 1420cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 1421cabdff1aSopenharmony_ci SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1422cabdff1aSopenharmony_ci 1423cabdff1aSopenharmony_ci mask1 = mask0 + 2; 1424cabdff1aSopenharmony_ci 1425cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1426cabdff1aSopenharmony_ci src += (5 * src_stride); 1427cabdff1aSopenharmony_ci 1428cabdff1aSopenharmony_ci XORI_B5_128_SB(src0, src1, src2, src3, src4); 1429cabdff1aSopenharmony_ci hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); 1430cabdff1aSopenharmony_ci hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); 1431cabdff1aSopenharmony_ci hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); 1432cabdff1aSopenharmony_ci hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); 1433cabdff1aSopenharmony_ci hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); 1434cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 1435cabdff1aSopenharmony_ci ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); 1436cabdff1aSopenharmony_ci 1437cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 1438cabdff1aSopenharmony_ci SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 1439cabdff1aSopenharmony_ci 1440cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1441cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src5, src6, src7, src8); 1442cabdff1aSopenharmony_ci src += (4 * src_stride); 1443cabdff1aSopenharmony_ci 1444cabdff1aSopenharmony_ci XORI_B4_128_SB(src5, src6, src7, src8); 1445cabdff1aSopenharmony_ci 1446cabdff1aSopenharmony_ci hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); 1447cabdff1aSopenharmony_ci out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4); 1448cabdff1aSopenharmony_ci tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 1449cabdff1aSopenharmony_ci 1450cabdff1aSopenharmony_ci hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); 1451cabdff1aSopenharmony_ci out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5); 1452cabdff1aSopenharmony_ci tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 1453cabdff1aSopenharmony_ci 1454cabdff1aSopenharmony_ci hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); 1455cabdff1aSopenharmony_ci out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6); 1456cabdff1aSopenharmony_ci tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); 1457cabdff1aSopenharmony_ci 1458cabdff1aSopenharmony_ci hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); 1459cabdff1aSopenharmony_ci out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7); 1460cabdff1aSopenharmony_ci tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); 1461cabdff1aSopenharmony_ci 1462cabdff1aSopenharmony_ci SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1463cabdff1aSopenharmony_ci SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1464cabdff1aSopenharmony_ci vec0 = PCKEV_XORI128_UB(tmp0, tmp1); 1465cabdff1aSopenharmony_ci vec1 = PCKEV_XORI128_UB(tmp2, tmp3); 1466cabdff1aSopenharmony_ci ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride); 1467cabdff1aSopenharmony_ci dst += (4 * dst_stride); 1468cabdff1aSopenharmony_ci 1469cabdff1aSopenharmony_ci hz_out4 = hz_out8; 1470cabdff1aSopenharmony_ci out0 = out2; 1471cabdff1aSopenharmony_ci out1 = out6; 1472cabdff1aSopenharmony_ci out3 = out5; 1473cabdff1aSopenharmony_ci out4 = out7; 1474cabdff1aSopenharmony_ci } 1475cabdff1aSopenharmony_ci} 1476cabdff1aSopenharmony_ci 1477cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, 1478cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1479cabdff1aSopenharmony_ci int height, int mx, int my) 1480cabdff1aSopenharmony_ci{ 1481cabdff1aSopenharmony_ci int32_t multiple8_cnt; 1482cabdff1aSopenharmony_ci 1483cabdff1aSopenharmony_ci for (multiple8_cnt = 2; multiple8_cnt--;) { 1484cabdff1aSopenharmony_ci ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height, 1485cabdff1aSopenharmony_ci mx, my); 1486cabdff1aSopenharmony_ci 1487cabdff1aSopenharmony_ci src += 8; 1488cabdff1aSopenharmony_ci dst += 8; 1489cabdff1aSopenharmony_ci } 1490cabdff1aSopenharmony_ci} 1491cabdff1aSopenharmony_ci 1492cabdff1aSopenharmony_cistatic void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, 1493cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1494cabdff1aSopenharmony_ci const int8_t *filter) 1495cabdff1aSopenharmony_ci{ 1496cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, mask; 1497cabdff1aSopenharmony_ci v16u8 filt0, vec0, vec1, res0, res1; 1498cabdff1aSopenharmony_ci v8u16 vec2, vec3, filt; 1499cabdff1aSopenharmony_ci 1500cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[16]); 1501cabdff1aSopenharmony_ci 1502cabdff1aSopenharmony_ci /* rearranging filter */ 1503cabdff1aSopenharmony_ci filt = LD_UH(filter); 1504cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1505cabdff1aSopenharmony_ci 1506cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1507cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); 1508cabdff1aSopenharmony_ci DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); 1509cabdff1aSopenharmony_ci SRARI_H2_UH(vec2, vec3, 7); 1510cabdff1aSopenharmony_ci PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); 1511cabdff1aSopenharmony_ci ST_W2(res0, 0, 1, dst, dst_stride); 1512cabdff1aSopenharmony_ci ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1513cabdff1aSopenharmony_ci} 1514cabdff1aSopenharmony_ci 1515cabdff1aSopenharmony_cistatic void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, 1516cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1517cabdff1aSopenharmony_ci const int8_t *filter) 1518cabdff1aSopenharmony_ci{ 1519cabdff1aSopenharmony_ci v16u8 vec0, vec1, vec2, vec3, filt0; 1520cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 1521cabdff1aSopenharmony_ci v16i8 res0, res1, res2, res3; 1522cabdff1aSopenharmony_ci v8u16 vec4, vec5, vec6, vec7, filt; 1523cabdff1aSopenharmony_ci 1524cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[16]); 1525cabdff1aSopenharmony_ci 1526cabdff1aSopenharmony_ci /* rearranging filter */ 1527cabdff1aSopenharmony_ci filt = LD_UH(filter); 1528cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1529cabdff1aSopenharmony_ci 1530cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1531cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); 1532cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); 1533cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1534cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1535cabdff1aSopenharmony_ci SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); 1536cabdff1aSopenharmony_ci PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, 1537cabdff1aSopenharmony_ci res0, res1, res2, res3); 1538cabdff1aSopenharmony_ci ST_W2(res0, 0, 1, dst, dst_stride); 1539cabdff1aSopenharmony_ci ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1540cabdff1aSopenharmony_ci ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride); 1541cabdff1aSopenharmony_ci ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride); 1542cabdff1aSopenharmony_ci} 1543cabdff1aSopenharmony_ci 1544cabdff1aSopenharmony_civoid ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1545cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1546cabdff1aSopenharmony_ci int height, int mx, int my) 1547cabdff1aSopenharmony_ci{ 1548cabdff1aSopenharmony_ci const int8_t *filter = bilinear_filters_msa[mx - 1]; 1549cabdff1aSopenharmony_ci 1550cabdff1aSopenharmony_ci if (4 == height) { 1551cabdff1aSopenharmony_ci common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); 1552cabdff1aSopenharmony_ci } else if (8 == height) { 1553cabdff1aSopenharmony_ci common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); 1554cabdff1aSopenharmony_ci } 1555cabdff1aSopenharmony_ci} 1556cabdff1aSopenharmony_ci 1557cabdff1aSopenharmony_cistatic void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, 1558cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1559cabdff1aSopenharmony_ci const int8_t *filter) 1560cabdff1aSopenharmony_ci{ 1561cabdff1aSopenharmony_ci v16u8 filt0; 1562cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, mask; 1563cabdff1aSopenharmony_ci v8u16 vec0, vec1, vec2, vec3, filt; 1564cabdff1aSopenharmony_ci 1565cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[0]); 1566cabdff1aSopenharmony_ci 1567cabdff1aSopenharmony_ci /* rearranging filter */ 1568cabdff1aSopenharmony_ci filt = LD_UH(filter); 1569cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1570cabdff1aSopenharmony_ci 1571cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1572cabdff1aSopenharmony_ci VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1573cabdff1aSopenharmony_ci VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1574cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1575cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1576cabdff1aSopenharmony_ci SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1577cabdff1aSopenharmony_ci PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); 1578cabdff1aSopenharmony_ci ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride); 1579cabdff1aSopenharmony_ci} 1580cabdff1aSopenharmony_ci 1581cabdff1aSopenharmony_cistatic void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, 1582cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1583cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1584cabdff1aSopenharmony_ci{ 1585cabdff1aSopenharmony_ci v16u8 filt0; 1586cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, mask, out0, out1; 1587cabdff1aSopenharmony_ci v8u16 vec0, vec1, vec2, vec3, filt; 1588cabdff1aSopenharmony_ci 1589cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[0]); 1590cabdff1aSopenharmony_ci 1591cabdff1aSopenharmony_ci /* rearranging filter */ 1592cabdff1aSopenharmony_ci filt = LD_UH(filter); 1593cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1594cabdff1aSopenharmony_ci 1595cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1596cabdff1aSopenharmony_ci src += (4 * src_stride); 1597cabdff1aSopenharmony_ci 1598cabdff1aSopenharmony_ci VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1599cabdff1aSopenharmony_ci VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1600cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1601cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1602cabdff1aSopenharmony_ci SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1603cabdff1aSopenharmony_ci 1604cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1605cabdff1aSopenharmony_ci src += (4 * src_stride); 1606cabdff1aSopenharmony_ci 1607cabdff1aSopenharmony_ci PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1608cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1609cabdff1aSopenharmony_ci 1610cabdff1aSopenharmony_ci VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1611cabdff1aSopenharmony_ci VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1612cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1613cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1614cabdff1aSopenharmony_ci SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1615cabdff1aSopenharmony_ci PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1616cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1617cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1618cabdff1aSopenharmony_ci 1619cabdff1aSopenharmony_ci if (16 == height) { 1620cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1621cabdff1aSopenharmony_ci src += (4 * src_stride); 1622cabdff1aSopenharmony_ci 1623cabdff1aSopenharmony_ci VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1624cabdff1aSopenharmony_ci VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1625cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1626cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1627cabdff1aSopenharmony_ci SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1628cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src1, src2, src3); 1629cabdff1aSopenharmony_ci src += (4 * src_stride); 1630cabdff1aSopenharmony_ci 1631cabdff1aSopenharmony_ci PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1632cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1633cabdff1aSopenharmony_ci 1634cabdff1aSopenharmony_ci VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1635cabdff1aSopenharmony_ci VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1636cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1637cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1638cabdff1aSopenharmony_ci SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); 1639cabdff1aSopenharmony_ci PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); 1640cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1641cabdff1aSopenharmony_ci } 1642cabdff1aSopenharmony_ci} 1643cabdff1aSopenharmony_ci 1644cabdff1aSopenharmony_civoid ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1645cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1646cabdff1aSopenharmony_ci int height, int mx, int my) 1647cabdff1aSopenharmony_ci{ 1648cabdff1aSopenharmony_ci const int8_t *filter = bilinear_filters_msa[mx - 1]; 1649cabdff1aSopenharmony_ci 1650cabdff1aSopenharmony_ci if (4 == height) { 1651cabdff1aSopenharmony_ci common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); 1652cabdff1aSopenharmony_ci } else { 1653cabdff1aSopenharmony_ci common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, 1654cabdff1aSopenharmony_ci height); 1655cabdff1aSopenharmony_ci } 1656cabdff1aSopenharmony_ci} 1657cabdff1aSopenharmony_ci 1658cabdff1aSopenharmony_civoid ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, 1659cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1660cabdff1aSopenharmony_ci int height, int mx, int my) 1661cabdff1aSopenharmony_ci{ 1662cabdff1aSopenharmony_ci uint32_t loop_cnt; 1663cabdff1aSopenharmony_ci const int8_t *filter = bilinear_filters_msa[mx - 1]; 1664cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 1665cabdff1aSopenharmony_ci v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1666cabdff1aSopenharmony_ci v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; 1667cabdff1aSopenharmony_ci 1668cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[0]); 1669cabdff1aSopenharmony_ci 1670cabdff1aSopenharmony_ci loop_cnt = (height >> 2) - 1; 1671cabdff1aSopenharmony_ci 1672cabdff1aSopenharmony_ci /* rearranging filter */ 1673cabdff1aSopenharmony_ci filt = LD_UH(filter); 1674cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); 1675cabdff1aSopenharmony_ci 1676cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 1677cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 1678cabdff1aSopenharmony_ci src += (4 * src_stride); 1679cabdff1aSopenharmony_ci 1680cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 1681cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 1682cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 1683cabdff1aSopenharmony_ci VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 1684cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1685cabdff1aSopenharmony_ci out0, out1, out2, out3); 1686cabdff1aSopenharmony_ci DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 1687cabdff1aSopenharmony_ci out4, out5, out6, out7); 1688cabdff1aSopenharmony_ci SRARI_H4_UH(out0, out1, out2, out3, 7); 1689cabdff1aSopenharmony_ci SRARI_H4_UH(out4, out5, out6, out7, 7); 1690cabdff1aSopenharmony_ci PCKEV_ST_SB(out0, out1, dst); 1691cabdff1aSopenharmony_ci dst += dst_stride; 1692cabdff1aSopenharmony_ci PCKEV_ST_SB(out2, out3, dst); 1693cabdff1aSopenharmony_ci dst += dst_stride; 1694cabdff1aSopenharmony_ci PCKEV_ST_SB(out4, out5, dst); 1695cabdff1aSopenharmony_ci dst += dst_stride; 1696cabdff1aSopenharmony_ci PCKEV_ST_SB(out6, out7, dst); 1697cabdff1aSopenharmony_ci dst += dst_stride; 1698cabdff1aSopenharmony_ci 1699cabdff1aSopenharmony_ci for (; loop_cnt--;) { 1700cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 1701cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 1702cabdff1aSopenharmony_ci src += (4 * src_stride); 1703cabdff1aSopenharmony_ci 1704cabdff1aSopenharmony_ci VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 1705cabdff1aSopenharmony_ci VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 1706cabdff1aSopenharmony_ci VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 1707cabdff1aSopenharmony_ci VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 1708cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1709cabdff1aSopenharmony_ci out0, out1, out2, out3); 1710cabdff1aSopenharmony_ci DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 1711cabdff1aSopenharmony_ci out4, out5, out6, out7); 1712cabdff1aSopenharmony_ci SRARI_H4_UH(out0, out1, out2, out3, 7); 1713cabdff1aSopenharmony_ci SRARI_H4_UH(out4, out5, out6, out7, 7); 1714cabdff1aSopenharmony_ci PCKEV_ST_SB(out0, out1, dst); 1715cabdff1aSopenharmony_ci dst += dst_stride; 1716cabdff1aSopenharmony_ci PCKEV_ST_SB(out2, out3, dst); 1717cabdff1aSopenharmony_ci dst += dst_stride; 1718cabdff1aSopenharmony_ci PCKEV_ST_SB(out4, out5, dst); 1719cabdff1aSopenharmony_ci dst += dst_stride; 1720cabdff1aSopenharmony_ci PCKEV_ST_SB(out6, out7, dst); 1721cabdff1aSopenharmony_ci dst += dst_stride; 1722cabdff1aSopenharmony_ci } 1723cabdff1aSopenharmony_ci} 1724cabdff1aSopenharmony_ci 1725cabdff1aSopenharmony_cistatic void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, 1726cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1727cabdff1aSopenharmony_ci const int8_t *filter) 1728cabdff1aSopenharmony_ci{ 1729cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4; 1730cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; 1731cabdff1aSopenharmony_ci v16u8 filt0; 1732cabdff1aSopenharmony_ci v8i16 filt; 1733cabdff1aSopenharmony_ci v8u16 tmp0, tmp1; 1734cabdff1aSopenharmony_ci 1735cabdff1aSopenharmony_ci filt = LD_SH(filter); 1736cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h(filt, 0); 1737cabdff1aSopenharmony_ci 1738cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1739cabdff1aSopenharmony_ci src += (5 * src_stride); 1740cabdff1aSopenharmony_ci 1741cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, 1742cabdff1aSopenharmony_ci src10_r, src21_r, src32_r, src43_r); 1743cabdff1aSopenharmony_ci ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 1744cabdff1aSopenharmony_ci DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); 1745cabdff1aSopenharmony_ci SRARI_H2_UH(tmp0, tmp1, 7); 1746cabdff1aSopenharmony_ci SAT_UH2_UH(tmp0, tmp1, 7); 1747cabdff1aSopenharmony_ci src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 1748cabdff1aSopenharmony_ci ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride); 1749cabdff1aSopenharmony_ci} 1750cabdff1aSopenharmony_ci 1751cabdff1aSopenharmony_cistatic void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, 1752cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1753cabdff1aSopenharmony_ci const int8_t *filter) 1754cabdff1aSopenharmony_ci{ 1755cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1756cabdff1aSopenharmony_ci v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; 1757cabdff1aSopenharmony_ci v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; 1758cabdff1aSopenharmony_ci v8u16 tmp0, tmp1, tmp2, tmp3; 1759cabdff1aSopenharmony_ci v16u8 filt0; 1760cabdff1aSopenharmony_ci v8i16 filt; 1761cabdff1aSopenharmony_ci 1762cabdff1aSopenharmony_ci filt = LD_SH(filter); 1763cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h(filt, 0); 1764cabdff1aSopenharmony_ci 1765cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1766cabdff1aSopenharmony_ci src += (8 * src_stride); 1767cabdff1aSopenharmony_ci 1768cabdff1aSopenharmony_ci src8 = LD_SB(src); 1769cabdff1aSopenharmony_ci src += src_stride; 1770cabdff1aSopenharmony_ci 1771cabdff1aSopenharmony_ci ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 1772cabdff1aSopenharmony_ci src32_r, src43_r); 1773cabdff1aSopenharmony_ci ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 1774cabdff1aSopenharmony_ci src76_r, src87_r); 1775cabdff1aSopenharmony_ci ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 1776cabdff1aSopenharmony_ci src87_r, src76_r, src2110, src4332, src6554, src8776); 1777cabdff1aSopenharmony_ci DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, 1778cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1779cabdff1aSopenharmony_ci SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1780cabdff1aSopenharmony_ci SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1781cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); 1782cabdff1aSopenharmony_ci ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); 1783cabdff1aSopenharmony_ci} 1784cabdff1aSopenharmony_ci 1785cabdff1aSopenharmony_civoid ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, 1786cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1787cabdff1aSopenharmony_ci int height, int mx, int my) 1788cabdff1aSopenharmony_ci{ 1789cabdff1aSopenharmony_ci const int8_t *filter = bilinear_filters_msa[my - 1]; 1790cabdff1aSopenharmony_ci 1791cabdff1aSopenharmony_ci if (4 == height) { 1792cabdff1aSopenharmony_ci common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); 1793cabdff1aSopenharmony_ci } else if (8 == height) { 1794cabdff1aSopenharmony_ci common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); 1795cabdff1aSopenharmony_ci } 1796cabdff1aSopenharmony_ci} 1797cabdff1aSopenharmony_ci 1798cabdff1aSopenharmony_cistatic void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, 1799cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1800cabdff1aSopenharmony_ci const int8_t *filter) 1801cabdff1aSopenharmony_ci{ 1802cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; 1803cabdff1aSopenharmony_ci v16i8 out0, out1; 1804cabdff1aSopenharmony_ci v8u16 tmp0, tmp1, tmp2, tmp3; 1805cabdff1aSopenharmony_ci v8i16 filt; 1806cabdff1aSopenharmony_ci 1807cabdff1aSopenharmony_ci /* rearranging filter_y */ 1808cabdff1aSopenharmony_ci filt = LD_SH(filter); 1809cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h(filt, 0); 1810cabdff1aSopenharmony_ci 1811cabdff1aSopenharmony_ci LD_UB5(src, src_stride, src0, src1, src2, src3, src4); 1812cabdff1aSopenharmony_ci ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); 1813cabdff1aSopenharmony_ci ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); 1814cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1815cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1816cabdff1aSopenharmony_ci SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1817cabdff1aSopenharmony_ci SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1818cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 1819cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1820cabdff1aSopenharmony_ci} 1821cabdff1aSopenharmony_ci 1822cabdff1aSopenharmony_cistatic void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, 1823cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1824cabdff1aSopenharmony_ci const int8_t *filter, int32_t height) 1825cabdff1aSopenharmony_ci{ 1826cabdff1aSopenharmony_ci uint32_t loop_cnt; 1827cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1828cabdff1aSopenharmony_ci v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; 1829cabdff1aSopenharmony_ci v16i8 out0, out1; 1830cabdff1aSopenharmony_ci v8u16 tmp0, tmp1, tmp2, tmp3; 1831cabdff1aSopenharmony_ci v8i16 filt; 1832cabdff1aSopenharmony_ci 1833cabdff1aSopenharmony_ci /* rearranging filter_y */ 1834cabdff1aSopenharmony_ci filt = LD_SH(filter); 1835cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h(filt, 0); 1836cabdff1aSopenharmony_ci 1837cabdff1aSopenharmony_ci src0 = LD_UB(src); 1838cabdff1aSopenharmony_ci src += src_stride; 1839cabdff1aSopenharmony_ci 1840cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 1841cabdff1aSopenharmony_ci LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); 1842cabdff1aSopenharmony_ci src += (8 * src_stride); 1843cabdff1aSopenharmony_ci 1844cabdff1aSopenharmony_ci ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, 1845cabdff1aSopenharmony_ci vec0, vec1, vec2, vec3); 1846cabdff1aSopenharmony_ci ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, 1847cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 1848cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, 1849cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1850cabdff1aSopenharmony_ci SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1851cabdff1aSopenharmony_ci SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1852cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 1853cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 1854cabdff1aSopenharmony_ci 1855cabdff1aSopenharmony_ci DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 1856cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1857cabdff1aSopenharmony_ci SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1858cabdff1aSopenharmony_ci SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 1859cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 1860cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 1861cabdff1aSopenharmony_ci dst += (8 * dst_stride); 1862cabdff1aSopenharmony_ci 1863cabdff1aSopenharmony_ci src0 = src8; 1864cabdff1aSopenharmony_ci } 1865cabdff1aSopenharmony_ci} 1866cabdff1aSopenharmony_ci 1867cabdff1aSopenharmony_civoid ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, 1868cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1869cabdff1aSopenharmony_ci int height, int mx, int my) 1870cabdff1aSopenharmony_ci{ 1871cabdff1aSopenharmony_ci const int8_t *filter = bilinear_filters_msa[my - 1]; 1872cabdff1aSopenharmony_ci 1873cabdff1aSopenharmony_ci if (4 == height) { 1874cabdff1aSopenharmony_ci common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); 1875cabdff1aSopenharmony_ci } else { 1876cabdff1aSopenharmony_ci common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, 1877cabdff1aSopenharmony_ci height); 1878cabdff1aSopenharmony_ci } 1879cabdff1aSopenharmony_ci} 1880cabdff1aSopenharmony_ci 1881cabdff1aSopenharmony_civoid ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, 1882cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 1883cabdff1aSopenharmony_ci int height, int mx, int my) 1884cabdff1aSopenharmony_ci{ 1885cabdff1aSopenharmony_ci uint32_t loop_cnt; 1886cabdff1aSopenharmony_ci const int8_t *filter = bilinear_filters_msa[my - 1]; 1887cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4; 1888cabdff1aSopenharmony_ci v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; 1889cabdff1aSopenharmony_ci v8u16 tmp0, tmp1, tmp2, tmp3; 1890cabdff1aSopenharmony_ci v8i16 filt; 1891cabdff1aSopenharmony_ci 1892cabdff1aSopenharmony_ci /* rearranging filter_y */ 1893cabdff1aSopenharmony_ci filt = LD_SH(filter); 1894cabdff1aSopenharmony_ci filt0 = (v16u8) __msa_splati_h(filt, 0); 1895cabdff1aSopenharmony_ci 1896cabdff1aSopenharmony_ci src0 = LD_UB(src); 1897cabdff1aSopenharmony_ci src += src_stride; 1898cabdff1aSopenharmony_ci 1899cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 1900cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src1, src2, src3, src4); 1901cabdff1aSopenharmony_ci src += (4 * src_stride); 1902cabdff1aSopenharmony_ci 1903cabdff1aSopenharmony_ci ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); 1904cabdff1aSopenharmony_ci ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); 1905cabdff1aSopenharmony_ci DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 1906cabdff1aSopenharmony_ci SRARI_H2_UH(tmp0, tmp1, 7); 1907cabdff1aSopenharmony_ci SAT_UH2_UH(tmp0, tmp1, 7); 1908cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp0, tmp1, dst); 1909cabdff1aSopenharmony_ci dst += dst_stride; 1910cabdff1aSopenharmony_ci 1911cabdff1aSopenharmony_ci ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); 1912cabdff1aSopenharmony_ci ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); 1913cabdff1aSopenharmony_ci DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 1914cabdff1aSopenharmony_ci SRARI_H2_UH(tmp2, tmp3, 7); 1915cabdff1aSopenharmony_ci SAT_UH2_UH(tmp2, tmp3, 7); 1916cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp2, tmp3, dst); 1917cabdff1aSopenharmony_ci dst += dst_stride; 1918cabdff1aSopenharmony_ci 1919cabdff1aSopenharmony_ci DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); 1920cabdff1aSopenharmony_ci SRARI_H2_UH(tmp0, tmp1, 7); 1921cabdff1aSopenharmony_ci SAT_UH2_UH(tmp0, tmp1, 7); 1922cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp0, tmp1, dst); 1923cabdff1aSopenharmony_ci dst += dst_stride; 1924cabdff1aSopenharmony_ci 1925cabdff1aSopenharmony_ci DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); 1926cabdff1aSopenharmony_ci SRARI_H2_UH(tmp2, tmp3, 7); 1927cabdff1aSopenharmony_ci SAT_UH2_UH(tmp2, tmp3, 7); 1928cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp2, tmp3, dst); 1929cabdff1aSopenharmony_ci dst += dst_stride; 1930cabdff1aSopenharmony_ci 1931cabdff1aSopenharmony_ci src0 = src4; 1932cabdff1aSopenharmony_ci } 1933cabdff1aSopenharmony_ci} 1934cabdff1aSopenharmony_ci 1935cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, 1936cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1937cabdff1aSopenharmony_ci const int8_t *filter_horiz, 1938cabdff1aSopenharmony_ci const int8_t *filter_vert) 1939cabdff1aSopenharmony_ci{ 1940cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, mask; 1941cabdff1aSopenharmony_ci v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; 1942cabdff1aSopenharmony_ci v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; 1943cabdff1aSopenharmony_ci 1944cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[16]); 1945cabdff1aSopenharmony_ci 1946cabdff1aSopenharmony_ci /* rearranging filter */ 1947cabdff1aSopenharmony_ci filt = LD_UH(filter_horiz); 1948cabdff1aSopenharmony_ci filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); 1949cabdff1aSopenharmony_ci 1950cabdff1aSopenharmony_ci filt = LD_UH(filter_vert); 1951cabdff1aSopenharmony_ci filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); 1952cabdff1aSopenharmony_ci 1953cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1954cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); 1955cabdff1aSopenharmony_ci hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); 1956cabdff1aSopenharmony_ci hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 1957cabdff1aSopenharmony_ci hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); 1958cabdff1aSopenharmony_ci hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2); 1959cabdff1aSopenharmony_ci 1960cabdff1aSopenharmony_ci ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 1961cabdff1aSopenharmony_ci DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 1962cabdff1aSopenharmony_ci SRARI_H2_UH(tmp0, tmp1, 7); 1963cabdff1aSopenharmony_ci SAT_UH2_UH(tmp0, tmp1, 7); 1964cabdff1aSopenharmony_ci PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); 1965cabdff1aSopenharmony_ci ST_W2(res0, 0, 1, dst, dst_stride); 1966cabdff1aSopenharmony_ci ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 1967cabdff1aSopenharmony_ci} 1968cabdff1aSopenharmony_ci 1969cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, 1970cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 1971cabdff1aSopenharmony_ci const int8_t *filter_horiz, 1972cabdff1aSopenharmony_ci const int8_t *filter_vert) 1973cabdff1aSopenharmony_ci{ 1974cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; 1975cabdff1aSopenharmony_ci v16i8 res0, res1, res2, res3; 1976cabdff1aSopenharmony_ci v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; 1977cabdff1aSopenharmony_ci v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1978cabdff1aSopenharmony_ci v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; 1979cabdff1aSopenharmony_ci 1980cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[16]); 1981cabdff1aSopenharmony_ci 1982cabdff1aSopenharmony_ci /* rearranging filter */ 1983cabdff1aSopenharmony_ci filt = LD_UH(filter_horiz); 1984cabdff1aSopenharmony_ci filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0); 1985cabdff1aSopenharmony_ci 1986cabdff1aSopenharmony_ci filt = LD_UH(filter_vert); 1987cabdff1aSopenharmony_ci filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0); 1988cabdff1aSopenharmony_ci 1989cabdff1aSopenharmony_ci LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1990cabdff1aSopenharmony_ci src += (8 * src_stride); 1991cabdff1aSopenharmony_ci src8 = LD_SB(src); 1992cabdff1aSopenharmony_ci 1993cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7); 1994cabdff1aSopenharmony_ci hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7); 1995cabdff1aSopenharmony_ci hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); 1996cabdff1aSopenharmony_ci hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); 1997cabdff1aSopenharmony_ci hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); 1998cabdff1aSopenharmony_ci SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1, 1999cabdff1aSopenharmony_ci hz_out3, hz_out5); 2000cabdff1aSopenharmony_ci hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); 2001cabdff1aSopenharmony_ci 2002cabdff1aSopenharmony_ci ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 2003cabdff1aSopenharmony_ci ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); 2004cabdff1aSopenharmony_ci DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, 2005cabdff1aSopenharmony_ci vec4, vec5, vec6, vec7); 2006cabdff1aSopenharmony_ci SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); 2007cabdff1aSopenharmony_ci SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); 2008cabdff1aSopenharmony_ci PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, 2009cabdff1aSopenharmony_ci res0, res1, res2, res3); 2010cabdff1aSopenharmony_ci ST_W2(res0, 0, 1, dst, dst_stride); 2011cabdff1aSopenharmony_ci ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); 2012cabdff1aSopenharmony_ci ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride); 2013cabdff1aSopenharmony_ci ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride); 2014cabdff1aSopenharmony_ci} 2015cabdff1aSopenharmony_ci 2016cabdff1aSopenharmony_civoid ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 2017cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 2018cabdff1aSopenharmony_ci int height, int mx, int my) 2019cabdff1aSopenharmony_ci{ 2020cabdff1aSopenharmony_ci const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; 2021cabdff1aSopenharmony_ci const int8_t *filter_vert = bilinear_filters_msa[my - 1]; 2022cabdff1aSopenharmony_ci 2023cabdff1aSopenharmony_ci if (4 == height) { 2024cabdff1aSopenharmony_ci common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, 2025cabdff1aSopenharmony_ci filter_horiz, filter_vert); 2026cabdff1aSopenharmony_ci } else if (8 == height) { 2027cabdff1aSopenharmony_ci common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, 2028cabdff1aSopenharmony_ci filter_horiz, filter_vert); 2029cabdff1aSopenharmony_ci } 2030cabdff1aSopenharmony_ci} 2031cabdff1aSopenharmony_ci 2032cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, 2033cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2034cabdff1aSopenharmony_ci const int8_t *filter_horiz, 2035cabdff1aSopenharmony_ci const int8_t *filter_vert) 2036cabdff1aSopenharmony_ci{ 2037cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, mask, out0, out1; 2038cabdff1aSopenharmony_ci v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; 2039cabdff1aSopenharmony_ci v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 2040cabdff1aSopenharmony_ci v8i16 filt; 2041cabdff1aSopenharmony_ci 2042cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[0]); 2043cabdff1aSopenharmony_ci 2044cabdff1aSopenharmony_ci /* rearranging filter */ 2045cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 2046cabdff1aSopenharmony_ci filt_hz = (v16u8) __msa_splati_h(filt, 0); 2047cabdff1aSopenharmony_ci 2048cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 2049cabdff1aSopenharmony_ci filt_vt = (v16u8) __msa_splati_h(filt, 0); 2050cabdff1aSopenharmony_ci 2051cabdff1aSopenharmony_ci LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 2052cabdff1aSopenharmony_ci 2053cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2054cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2055cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2056cabdff1aSopenharmony_ci tmp0 = __msa_dotp_u_h(vec0, filt_vt); 2057cabdff1aSopenharmony_ci 2058cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2059cabdff1aSopenharmony_ci vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2060cabdff1aSopenharmony_ci tmp1 = __msa_dotp_u_h(vec1, filt_vt); 2061cabdff1aSopenharmony_ci 2062cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2063cabdff1aSopenharmony_ci vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2064cabdff1aSopenharmony_ci tmp2 = __msa_dotp_u_h(vec2, filt_vt); 2065cabdff1aSopenharmony_ci 2066cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2067cabdff1aSopenharmony_ci vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2068cabdff1aSopenharmony_ci tmp3 = __msa_dotp_u_h(vec3, filt_vt); 2069cabdff1aSopenharmony_ci 2070cabdff1aSopenharmony_ci SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); 2071cabdff1aSopenharmony_ci SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); 2072cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 2073cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2074cabdff1aSopenharmony_ci} 2075cabdff1aSopenharmony_ci 2076cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, 2077cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2078cabdff1aSopenharmony_ci const int8_t *filter_horiz, 2079cabdff1aSopenharmony_ci const int8_t *filter_vert, 2080cabdff1aSopenharmony_ci int32_t height) 2081cabdff1aSopenharmony_ci{ 2082cabdff1aSopenharmony_ci uint32_t loop_cnt; 2083cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, mask, out0, out1; 2084cabdff1aSopenharmony_ci v16u8 filt_hz, filt_vt, vec0; 2085cabdff1aSopenharmony_ci v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; 2086cabdff1aSopenharmony_ci v8i16 filt; 2087cabdff1aSopenharmony_ci 2088cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[0]); 2089cabdff1aSopenharmony_ci 2090cabdff1aSopenharmony_ci /* rearranging filter */ 2091cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 2092cabdff1aSopenharmony_ci filt_hz = (v16u8) __msa_splati_h(filt, 0); 2093cabdff1aSopenharmony_ci 2094cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 2095cabdff1aSopenharmony_ci filt_vt = (v16u8) __msa_splati_h(filt, 0); 2096cabdff1aSopenharmony_ci 2097cabdff1aSopenharmony_ci src0 = LD_SB(src); 2098cabdff1aSopenharmony_ci src += src_stride; 2099cabdff1aSopenharmony_ci 2100cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2101cabdff1aSopenharmony_ci 2102cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 2103cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src1, src2, src3, src4); 2104cabdff1aSopenharmony_ci src += (4 * src_stride); 2105cabdff1aSopenharmony_ci 2106cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2107cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2108cabdff1aSopenharmony_ci tmp1 = __msa_dotp_u_h(vec0, filt_vt); 2109cabdff1aSopenharmony_ci 2110cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2111cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2112cabdff1aSopenharmony_ci tmp2 = __msa_dotp_u_h(vec0, filt_vt); 2113cabdff1aSopenharmony_ci 2114cabdff1aSopenharmony_ci SRARI_H2_UH(tmp1, tmp2, 7); 2115cabdff1aSopenharmony_ci SAT_UH2_UH(tmp1, tmp2, 7); 2116cabdff1aSopenharmony_ci 2117cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2118cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2119cabdff1aSopenharmony_ci tmp3 = __msa_dotp_u_h(vec0, filt_vt); 2120cabdff1aSopenharmony_ci 2121cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2122cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src1, src2, src3, src4); 2123cabdff1aSopenharmony_ci src += (4 * src_stride); 2124cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2125cabdff1aSopenharmony_ci tmp4 = __msa_dotp_u_h(vec0, filt_vt); 2126cabdff1aSopenharmony_ci 2127cabdff1aSopenharmony_ci SRARI_H2_UH(tmp3, tmp4, 7); 2128cabdff1aSopenharmony_ci SAT_UH2_UH(tmp3, tmp4, 7); 2129cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); 2130cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); 2131cabdff1aSopenharmony_ci 2132cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2133cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2134cabdff1aSopenharmony_ci tmp5 = __msa_dotp_u_h(vec0, filt_vt); 2135cabdff1aSopenharmony_ci 2136cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2137cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2138cabdff1aSopenharmony_ci tmp6 = __msa_dotp_u_h(vec0, filt_vt); 2139cabdff1aSopenharmony_ci 2140cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2141cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); 2142cabdff1aSopenharmony_ci tmp7 = __msa_dotp_u_h(vec0, filt_vt); 2143cabdff1aSopenharmony_ci 2144cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2145cabdff1aSopenharmony_ci vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1); 2146cabdff1aSopenharmony_ci tmp8 = __msa_dotp_u_h(vec0, filt_vt); 2147cabdff1aSopenharmony_ci 2148cabdff1aSopenharmony_ci SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7); 2149cabdff1aSopenharmony_ci SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); 2150cabdff1aSopenharmony_ci PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); 2151cabdff1aSopenharmony_ci ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); 2152cabdff1aSopenharmony_ci dst += (8 * dst_stride); 2153cabdff1aSopenharmony_ci } 2154cabdff1aSopenharmony_ci} 2155cabdff1aSopenharmony_ci 2156cabdff1aSopenharmony_civoid ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 2157cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 2158cabdff1aSopenharmony_ci int height, int mx, int my) 2159cabdff1aSopenharmony_ci{ 2160cabdff1aSopenharmony_ci const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; 2161cabdff1aSopenharmony_ci const int8_t *filter_vert = bilinear_filters_msa[my - 1]; 2162cabdff1aSopenharmony_ci 2163cabdff1aSopenharmony_ci if (4 == height) { 2164cabdff1aSopenharmony_ci common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, 2165cabdff1aSopenharmony_ci filter_horiz, filter_vert); 2166cabdff1aSopenharmony_ci } else { 2167cabdff1aSopenharmony_ci common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, 2168cabdff1aSopenharmony_ci filter_horiz, filter_vert, height); 2169cabdff1aSopenharmony_ci } 2170cabdff1aSopenharmony_ci} 2171cabdff1aSopenharmony_ci 2172cabdff1aSopenharmony_civoid ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, 2173cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 2174cabdff1aSopenharmony_ci int height, int mx, int my) 2175cabdff1aSopenharmony_ci{ 2176cabdff1aSopenharmony_ci uint32_t loop_cnt; 2177cabdff1aSopenharmony_ci const int8_t *filter_horiz = bilinear_filters_msa[mx - 1]; 2178cabdff1aSopenharmony_ci const int8_t *filter_vert = bilinear_filters_msa[my - 1]; 2179cabdff1aSopenharmony_ci v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 2180cabdff1aSopenharmony_ci v16u8 filt_hz, filt_vt, vec0, vec1; 2181cabdff1aSopenharmony_ci v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; 2182cabdff1aSopenharmony_ci v8i16 filt; 2183cabdff1aSopenharmony_ci 2184cabdff1aSopenharmony_ci mask = LD_SB(&mc_filt_mask_arr[0]); 2185cabdff1aSopenharmony_ci 2186cabdff1aSopenharmony_ci /* rearranging filter */ 2187cabdff1aSopenharmony_ci filt = LD_SH(filter_horiz); 2188cabdff1aSopenharmony_ci filt_hz = (v16u8) __msa_splati_h(filt, 0); 2189cabdff1aSopenharmony_ci 2190cabdff1aSopenharmony_ci filt = LD_SH(filter_vert); 2191cabdff1aSopenharmony_ci filt_vt = (v16u8) __msa_splati_h(filt, 0); 2192cabdff1aSopenharmony_ci 2193cabdff1aSopenharmony_ci LD_SB2(src, 8, src0, src1); 2194cabdff1aSopenharmony_ci src += src_stride; 2195cabdff1aSopenharmony_ci 2196cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2197cabdff1aSopenharmony_ci hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2198cabdff1aSopenharmony_ci 2199cabdff1aSopenharmony_ci 2200cabdff1aSopenharmony_ci for (loop_cnt = (height >> 2); loop_cnt--;) { 2201cabdff1aSopenharmony_ci LD_SB4(src, src_stride, src0, src2, src4, src6); 2202cabdff1aSopenharmony_ci LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 2203cabdff1aSopenharmony_ci src += (4 * src_stride); 2204cabdff1aSopenharmony_ci 2205cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); 2206cabdff1aSopenharmony_ci hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); 2207cabdff1aSopenharmony_ci ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 2208cabdff1aSopenharmony_ci DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2209cabdff1aSopenharmony_ci SRARI_H2_UH(tmp1, tmp2, 7); 2210cabdff1aSopenharmony_ci SAT_UH2_UH(tmp1, tmp2, 7); 2211cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp1, tmp2, dst); 2212cabdff1aSopenharmony_ci dst += dst_stride; 2213cabdff1aSopenharmony_ci 2214cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7); 2215cabdff1aSopenharmony_ci hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7); 2216cabdff1aSopenharmony_ci ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 2217cabdff1aSopenharmony_ci DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2218cabdff1aSopenharmony_ci SRARI_H2_UH(tmp1, tmp2, 7); 2219cabdff1aSopenharmony_ci SAT_UH2_UH(tmp1, tmp2, 7); 2220cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp1, tmp2, dst); 2221cabdff1aSopenharmony_ci dst += dst_stride; 2222cabdff1aSopenharmony_ci 2223cabdff1aSopenharmony_ci hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); 2224cabdff1aSopenharmony_ci hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7); 2225cabdff1aSopenharmony_ci ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 2226cabdff1aSopenharmony_ci DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2227cabdff1aSopenharmony_ci SRARI_H2_UH(tmp1, tmp2, 7); 2228cabdff1aSopenharmony_ci SAT_UH2_UH(tmp1, tmp2, 7); 2229cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp1, tmp2, dst); 2230cabdff1aSopenharmony_ci dst += dst_stride; 2231cabdff1aSopenharmony_ci 2232cabdff1aSopenharmony_ci hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7); 2233cabdff1aSopenharmony_ci hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7); 2234cabdff1aSopenharmony_ci ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 2235cabdff1aSopenharmony_ci DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 2236cabdff1aSopenharmony_ci SRARI_H2_UH(tmp1, tmp2, 7); 2237cabdff1aSopenharmony_ci SAT_UH2_UH(tmp1, tmp2, 7); 2238cabdff1aSopenharmony_ci PCKEV_ST_SB(tmp1, tmp2, dst); 2239cabdff1aSopenharmony_ci dst += dst_stride; 2240cabdff1aSopenharmony_ci } 2241cabdff1aSopenharmony_ci} 2242cabdff1aSopenharmony_ci 2243cabdff1aSopenharmony_civoid ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, 2244cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 2245cabdff1aSopenharmony_ci int height, int mx, int my) 2246cabdff1aSopenharmony_ci{ 2247cabdff1aSopenharmony_ci int32_t cnt; 2248cabdff1aSopenharmony_ci uint64_t out0, out1, out2, out3, out4, out5, out6, out7; 2249cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2250cabdff1aSopenharmony_ci 2251cabdff1aSopenharmony_ci if (0 == height % 8) { 2252cabdff1aSopenharmony_ci for (cnt = height >> 3; cnt--;) { 2253cabdff1aSopenharmony_ci LD_UB8(src, src_stride, 2254cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6, src7); 2255cabdff1aSopenharmony_ci src += (8 * src_stride); 2256cabdff1aSopenharmony_ci 2257cabdff1aSopenharmony_ci out0 = __msa_copy_u_d((v2i64) src0, 0); 2258cabdff1aSopenharmony_ci out1 = __msa_copy_u_d((v2i64) src1, 0); 2259cabdff1aSopenharmony_ci out2 = __msa_copy_u_d((v2i64) src2, 0); 2260cabdff1aSopenharmony_ci out3 = __msa_copy_u_d((v2i64) src3, 0); 2261cabdff1aSopenharmony_ci out4 = __msa_copy_u_d((v2i64) src4, 0); 2262cabdff1aSopenharmony_ci out5 = __msa_copy_u_d((v2i64) src5, 0); 2263cabdff1aSopenharmony_ci out6 = __msa_copy_u_d((v2i64) src6, 0); 2264cabdff1aSopenharmony_ci out7 = __msa_copy_u_d((v2i64) src7, 0); 2265cabdff1aSopenharmony_ci 2266cabdff1aSopenharmony_ci SD4(out0, out1, out2, out3, dst, dst_stride); 2267cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2268cabdff1aSopenharmony_ci SD4(out4, out5, out6, out7, dst, dst_stride); 2269cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2270cabdff1aSopenharmony_ci } 2271cabdff1aSopenharmony_ci } else if (0 == height % 4) { 2272cabdff1aSopenharmony_ci for (cnt = (height / 4); cnt--;) { 2273cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src0, src1, src2, src3); 2274cabdff1aSopenharmony_ci src += (4 * src_stride); 2275cabdff1aSopenharmony_ci out0 = __msa_copy_u_d((v2i64) src0, 0); 2276cabdff1aSopenharmony_ci out1 = __msa_copy_u_d((v2i64) src1, 0); 2277cabdff1aSopenharmony_ci out2 = __msa_copy_u_d((v2i64) src2, 0); 2278cabdff1aSopenharmony_ci out3 = __msa_copy_u_d((v2i64) src3, 0); 2279cabdff1aSopenharmony_ci 2280cabdff1aSopenharmony_ci SD4(out0, out1, out2, out3, dst, dst_stride); 2281cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2282cabdff1aSopenharmony_ci } 2283cabdff1aSopenharmony_ci } 2284cabdff1aSopenharmony_ci} 2285cabdff1aSopenharmony_ci 2286cabdff1aSopenharmony_cistatic void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, 2287cabdff1aSopenharmony_ci uint8_t *dst, int32_t dst_stride, 2288cabdff1aSopenharmony_ci int32_t height, int32_t width) 2289cabdff1aSopenharmony_ci{ 2290cabdff1aSopenharmony_ci int32_t cnt, loop_cnt; 2291cabdff1aSopenharmony_ci uint8_t *src_tmp, *dst_tmp; 2292cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2293cabdff1aSopenharmony_ci 2294cabdff1aSopenharmony_ci for (cnt = (width >> 4); cnt--;) { 2295cabdff1aSopenharmony_ci src_tmp = src; 2296cabdff1aSopenharmony_ci dst_tmp = dst; 2297cabdff1aSopenharmony_ci 2298cabdff1aSopenharmony_ci for (loop_cnt = (height >> 3); loop_cnt--;) { 2299cabdff1aSopenharmony_ci LD_UB8(src_tmp, src_stride, 2300cabdff1aSopenharmony_ci src0, src1, src2, src3, src4, src5, src6, src7); 2301cabdff1aSopenharmony_ci src_tmp += (8 * src_stride); 2302cabdff1aSopenharmony_ci 2303cabdff1aSopenharmony_ci ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, 2304cabdff1aSopenharmony_ci dst_tmp, dst_stride); 2305cabdff1aSopenharmony_ci dst_tmp += (8 * dst_stride); 2306cabdff1aSopenharmony_ci } 2307cabdff1aSopenharmony_ci 2308cabdff1aSopenharmony_ci src += 16; 2309cabdff1aSopenharmony_ci dst += 16; 2310cabdff1aSopenharmony_ci } 2311cabdff1aSopenharmony_ci} 2312cabdff1aSopenharmony_ci 2313cabdff1aSopenharmony_civoid ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, 2314cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 2315cabdff1aSopenharmony_ci int height, int mx, int my) 2316cabdff1aSopenharmony_ci{ 2317cabdff1aSopenharmony_ci int32_t cnt; 2318cabdff1aSopenharmony_ci v16u8 src0, src1, src2, src3; 2319cabdff1aSopenharmony_ci 2320cabdff1aSopenharmony_ci if (0 == height % 8) { 2321cabdff1aSopenharmony_ci copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); 2322cabdff1aSopenharmony_ci } else if (0 == height % 4) { 2323cabdff1aSopenharmony_ci for (cnt = (height >> 2); cnt--;) { 2324cabdff1aSopenharmony_ci LD_UB4(src, src_stride, src0, src1, src2, src3); 2325cabdff1aSopenharmony_ci src += (4 * src_stride); 2326cabdff1aSopenharmony_ci 2327cabdff1aSopenharmony_ci ST_UB4(src0, src1, src2, src3, dst, dst_stride); 2328cabdff1aSopenharmony_ci dst += (4 * dst_stride); 2329cabdff1aSopenharmony_ci } 2330cabdff1aSopenharmony_ci } 2331cabdff1aSopenharmony_ci} 2332