1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h"
22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
23cabdff1aSopenharmony_ci#include "vp9dsp_mips.h"
24cabdff1aSopenharmony_ci
/*
 * Byte-shuffle index patterns used by the horizontal filters: three rows of
 * 16 indices.  Each row lists overlapping pairs (i, i + 1), so a byte shuffle
 * driven by it places neighbouring pixels side by side for a pairwise dot
 * product with two filter taps.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* row 0 (offset 0): loaded by the paths that are 8 or more pixels wide */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,

    /* row 1 (offset 16): loaded by the 4-pixel-wide paths; indices 16..20
     * presumably select from the second vector handed to the shuffle
     * (confirm against the VSHF.B definition) */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,

    /* row 2 (offset 32): the row-1 pattern with every index raised by 8 */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33cabdff1aSopenharmony_ci
/*
 * Two-tap coefficient pairs.  Entry i holds {128 - 8 * (i + 1), 8 * (i + 1)},
 * so both taps of every pair add up to 128.
 */
static const int8_t vp9_bilinear_filters_msa[15][2] = {
    {120,   8}, {112,  16}, {104,  24},
    { 96,  32}, { 88,  40}, { 80,  48},
    { 72,  56}, { 64,  64}, { 56,  72},
    { 48,  80}, { 40,  88}, { 32,  96},
    { 24, 104}, { 16, 112}, {  8, 120}
};
51cabdff1aSopenharmony_ci
/*
 * 8-tap accumulation helper; a GCC statement expression that yields a v8i16.
 *
 * vec0..vec3   byte vectors of shuffled/interleaved pixels
 * filt0..filt3 matching coefficient vectors (all arguments are cast to v16i8)
 *
 * Two independent partial sums are built with signed byte dot products,
 * (vec0 . filt0 + vec1 . filt1) and (vec2 . filt2 + vec3 . filt3), and then
 * joined with __msa_adds_s_h (the signed saturating halfword add of the MSA
 * ISA).
 *
 * The temporaries use the "_m" suffix shared by the other macros in this file
 * so that an argument spelled tmp0 or tmp1 at a call site cannot be shadowed
 * by the macro's own locals.
 */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                     \
                            filt0, filt1, filt2, filt3)                 \
( {                                                                     \
    v8i16 tmp0_m, tmp1_m;                                               \
                                                                        \
    tmp0_m = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);               \
    tmp0_m = __msa_dpadd_s_h(tmp0_m, (v16i8) vec1, (v16i8) filt1);      \
    tmp1_m = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);               \
    tmp1_m = __msa_dpadd_s_h(tmp1_m, (v16i8) vec3, (v16i8) filt3);      \
    tmp0_m = __msa_adds_s_h(tmp0_m, tmp1_m);                            \
                                                                        \
    tmp0_m;                                                             \
} )
65cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter over one pair of source vectors; a GCC statement
 * expression that yields a v8i16.
 *
 * src0/src1 are shuffled with mask0..mask3 into four byte vectors, those are
 * combined with filt_h0..filt_h3 by FILT_8TAP_DPADD_S_H, and the sum is passed
 * through __msa_srari_h(.., 7) followed by __msa_sat_s_h(.., 7).
 */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,             \
                        filt_h0, filt_h1, filt_h2, filt_h3)                 \
( {                                                                         \
    v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                   \
    v8i16 sum_m;                                                            \
                                                                            \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                      \
               shf0_m, shf1_m, shf2_m, shf3_m);                             \
    sum_m = FILT_8TAP_DPADD_S_H(shf0_m, shf1_m, shf2_m, shf3_m,             \
                                filt_h0, filt_h1, filt_h2, filt_h3);        \
                                                                            \
    __msa_sat_s_h(__msa_srari_h(sum_m, 7), 7);                              \
} )
82cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap accumulation for the 4-wide paths.
 *
 * Each shuffle combines the vector pairs (src0, src1) and (src2, src3) under
 * one mask.  mask0/filt0 and mask1/filt1 feed the first pair of accumulators,
 * mask2/filt2 and mask3/filt3 the second pair; ADDS_SH2_SH then merges them
 * into out0 and out1.  No rounding or saturation to pixel range happens here.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 shf_a_m, shf_b_m;                                                 \
    v8i16 lo_a_m, lo_b_m, hi_a_m, hi_b_m;                                   \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, shf_a_m, shf_b_m);     \
    DOTP_SB2_SH(shf_a_m, shf_b_m, filt0, filt0, lo_a_m, lo_b_m);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, shf_a_m, shf_b_m);     \
    DPADD_SB2_SH(shf_a_m, shf_b_m, filt1, filt1, lo_a_m, lo_b_m);           \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, shf_a_m, shf_b_m);     \
    DOTP_SB2_SH(shf_a_m, shf_b_m, filt2, filt2, hi_a_m, hi_b_m);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, shf_a_m, shf_b_m);     \
    DPADD_SB2_SH(shf_a_m, shf_b_m, filt3, filt3, hi_a_m, hi_b_m);           \
                                                                            \
    ADDS_SH2_SH(lo_a_m, hi_a_m, lo_b_m, hi_b_m, out0, out1);                \
}
101cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap accumulation for four independent source vectors.
 *
 * Every source vector is shuffled against itself with each of the four masks.
 * The mask0/filt0 and mask1/filt1 products are accumulated in one set of four
 * sums, the mask2/filt2 and mask3/filt3 products in a second set, and
 * ADDS_SH4_SH merges the two sets into out0..out3 (one result per source
 * vector).  No rounding or saturation to pixel range happens here.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2, mask3,                \
                                   filt0, filt1, filt2, filt3,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                     \
    v8i16 lo0_m, lo1_m, lo2_m, lo3_m;                                         \
    v8i16 hi0_m, hi1_m, hi2_m, hi3_m;                                         \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, shf2_m, shf3_m);         \
    DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt0, filt0, filt0, filt0,   \
                lo0_m, lo1_m, lo2_m, lo3_m);                                  \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, shf2_m, shf3_m);         \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt1, filt1, filt1, filt1,  \
                 lo0_m, lo1_m, lo2_m, lo3_m);                                 \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, shf2_m, shf3_m);         \
    DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt2, filt2, filt2, filt2,   \
                hi0_m, hi1_m, hi2_m, hi3_m);                                  \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, shf0_m, shf1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, shf2_m, shf3_m);         \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt3, filt3, filt3, filt3,  \
                 hi0_m, hi1_m, hi2_m, hi3_m);                                 \
                                                                              \
    ADDS_SH4_SH(lo0_m, hi0_m, lo1_m, hi1_m, lo2_m, hi2_m, lo3_m,              \
                hi3_m, out0, out1, out2, out3);                               \
}
129cabdff1aSopenharmony_ci
/*
 * Pack in1/in0 to bytes with PCKEV_XORI128_UB(in1, in0), average the packed
 * bytes with dst via __msa_aver_u_b and store the 16-byte result at pdst.
 */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)            \
{                                                               \
    v16u8 avg_m = __msa_aver_u_b(PCKEV_XORI128_UB(in1, in0),    \
                                 (v16u8) dst);                  \
                                                                \
    ST_UB(avg_m, (pdst));                                       \
}
138cabdff1aSopenharmony_ci
/*
 * Pack in0/in1 to bytes with __msa_pckev_b (no xor step), average the packed
 * bytes with dst via __msa_aver_u_b and store the 16-byte result at pdst.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                        \
{                                                                   \
    v16u8 avg_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
                                                                    \
    avg_m = __msa_aver_u_b(avg_m, (v16u8) dst);                     \
    ST_UB(avg_m, (pdst));                                           \
}
147cabdff1aSopenharmony_ci
/*
 * Pack four halfword vectors into two byte vectors (in1/in0 and in3/in2),
 * average them with dst0 and dst1 respectively, and write the result with
 * ST_D4 (doubleword lanes 0 and 1 of each averaged vector) starting at pdst
 * using the given stride.
 */
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3,  dst0, dst1,           \
                           pdst, stride)                              \
{                                                                     \
    uint8_t *store_ptr_m = (uint8_t *) (pdst);                        \
    v16u8 avg01_m, avg23_m;                                           \
                                                                      \
    PCKEV_B2_UB(in1, in0, in3, in2, avg01_m, avg23_m);                \
    AVER_UB2_UB(avg01_m, dst0, avg23_m, dst1, avg01_m, avg23_m);      \
    ST_D4(avg01_m, avg23_m, 0, 1, 0, 1, store_ptr_m, stride);         \
}
158cabdff1aSopenharmony_ci
159cabdff1aSopenharmony_cistatic void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
160cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
161cabdff1aSopenharmony_ci                                 const int8_t *filter)
162cabdff1aSopenharmony_ci{
163cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
164cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
165cabdff1aSopenharmony_ci    v8i16 filt, out0, out1;
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
168cabdff1aSopenharmony_ci    src -= 3;
169cabdff1aSopenharmony_ci
170cabdff1aSopenharmony_ci    /* rearranging filter */
171cabdff1aSopenharmony_ci    filt = LD_SH(filter);
172cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
173cabdff1aSopenharmony_ci
174cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
175cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
176cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
177cabdff1aSopenharmony_ci
178cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
179cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
180cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
181cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out0, out1);
182cabdff1aSopenharmony_ci    SRARI_H2_SH(out0, out1, 7);
183cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
184cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
185cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
186cabdff1aSopenharmony_ci}
187cabdff1aSopenharmony_ci
188cabdff1aSopenharmony_cistatic void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
189cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
190cabdff1aSopenharmony_ci                                 const int8_t *filter)
191cabdff1aSopenharmony_ci{
192cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
193cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3;
194cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
195cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
196cabdff1aSopenharmony_ci
197cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
198cabdff1aSopenharmony_ci    src -= 3;
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci    /* rearranging filter */
201cabdff1aSopenharmony_ci    filt = LD_SH(filter);
202cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
203cabdff1aSopenharmony_ci
204cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
205cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
206cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
207cabdff1aSopenharmony_ci
208cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
209cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
210cabdff1aSopenharmony_ci    src += (4 * src_stride);
211cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
212cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out0, out1);
213cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
214cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
215cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
216cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out2, out3);
217cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 7);
218cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
219cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
220cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
221cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
222cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
223cabdff1aSopenharmony_ci}
224cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter for 4-pixel-wide blocks of a given height.
 *
 * src, src_stride  source pixels and source row pitch
 * dst, dst_stride  destination pixels and destination row pitch
 * filter           filter taps, forwarded unchanged to the block kernels
 * height           number of rows to produce
 *
 * The rows are handed to the fixed-size kernels in chunks: eight rows at a
 * time through common_hz_8t_4x8_msa while at least eight remain, then one
 * trailing group of four through common_hz_8t_4x4_msa.  Heights 4 and 8
 * behave exactly as before; previously every other height returned without
 * writing anything.  Rows past the last complete group of four are still
 * left untouched, so no more than height rows are ever written.
 * NOTE(review): whether callers ever pass heights other than 4 or 8 is not
 * visible from this file - confirm against the dispatch code.
 */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    /* eight-row chunks */
    for (; height >= 8; height -= 8) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        src += (8 * src_stride);
        dst += (8 * dst_stride);
    }

    /* at most one remaining four-row chunk */
    if (height >= 4) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    }
}
235cabdff1aSopenharmony_ci
236cabdff1aSopenharmony_cistatic void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
237cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
238cabdff1aSopenharmony_ci                                 const int8_t *filter)
239cabdff1aSopenharmony_ci{
240cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
241cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
242cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
243cabdff1aSopenharmony_ci
244cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
245cabdff1aSopenharmony_ci    src -= 3;
246cabdff1aSopenharmony_ci
247cabdff1aSopenharmony_ci    /* rearranging filter */
248cabdff1aSopenharmony_ci    filt = LD_SH(filter);
249cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
250cabdff1aSopenharmony_ci
251cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
252cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
253cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
256cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
257cabdff1aSopenharmony_ci    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
258cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, out0, out1,
259cabdff1aSopenharmony_ci                               out2, out3);
260cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 7);
261cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
262cabdff1aSopenharmony_ci    tmp0 = PCKEV_XORI128_UB(out0, out1);
263cabdff1aSopenharmony_ci    tmp1 = PCKEV_XORI128_UB(out2, out3);
264cabdff1aSopenharmony_ci    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
265cabdff1aSopenharmony_ci}
266cabdff1aSopenharmony_ci
267cabdff1aSopenharmony_cistatic void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
268cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
269cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
270cabdff1aSopenharmony_ci{
271cabdff1aSopenharmony_ci    uint32_t loop_cnt;
272cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
273cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
274cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
275cabdff1aSopenharmony_ci
276cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
277cabdff1aSopenharmony_ci    src -= 3;
278cabdff1aSopenharmony_ci
279cabdff1aSopenharmony_ci    /* rearranging filter */
280cabdff1aSopenharmony_ci    filt = LD_SH(filter);
281cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
282cabdff1aSopenharmony_ci
283cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
284cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
285cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
288cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
289cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
290cabdff1aSopenharmony_ci        src += (4 * src_stride);
291cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
292cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
293cabdff1aSopenharmony_ci                                   out1, out2, out3);
294cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
295cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
296cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
297cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2, out3);
298cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
299cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
300cabdff1aSopenharmony_ci    }
301cabdff1aSopenharmony_ci}
302cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter for 8-pixel-wide blocks: a height of exactly 4 goes
 * to the single-shot 8x4 kernel, every other height to the looping kernel
 * that works in groups of four rows.
 */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height != 4) {
        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
        return;
    }

    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
}
314cabdff1aSopenharmony_ci
315cabdff1aSopenharmony_cistatic void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
316cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
317cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
318cabdff1aSopenharmony_ci{
319cabdff1aSopenharmony_ci    uint32_t loop_cnt;
320cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
321cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
322cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
323cabdff1aSopenharmony_ci
324cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
325cabdff1aSopenharmony_ci    src -= 3;
326cabdff1aSopenharmony_ci
327cabdff1aSopenharmony_ci    /* rearranging filter */
328cabdff1aSopenharmony_ci    filt = LD_SH(filter);
329cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
330cabdff1aSopenharmony_ci
331cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
332cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
333cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
334cabdff1aSopenharmony_ci
335cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
336cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src2);
337cabdff1aSopenharmony_ci        LD_SB2(src + 8, src_stride, src1, src3);
338cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
339cabdff1aSopenharmony_ci        src += (2 * src_stride);
340cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
341cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
342cabdff1aSopenharmony_ci                                   out1, out2, out3);
343cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
344cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
345cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
346cabdff1aSopenharmony_ci        ST_UB(out, dst);
347cabdff1aSopenharmony_ci        dst += dst_stride;
348cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
349cabdff1aSopenharmony_ci        ST_UB(out, dst);
350cabdff1aSopenharmony_ci        dst += dst_stride;
351cabdff1aSopenharmony_ci    }
352cabdff1aSopenharmony_ci}
353cabdff1aSopenharmony_ci
354cabdff1aSopenharmony_cistatic void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
355cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
356cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
357cabdff1aSopenharmony_ci{
358cabdff1aSopenharmony_ci    uint32_t loop_cnt;
359cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
360cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
361cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
362cabdff1aSopenharmony_ci
363cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
364cabdff1aSopenharmony_ci    src -= 3;
365cabdff1aSopenharmony_ci
366cabdff1aSopenharmony_ci    /* rearranging filter */
367cabdff1aSopenharmony_ci    filt = LD_SH(filter);
368cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
369cabdff1aSopenharmony_ci
370cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
371cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
372cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
373cabdff1aSopenharmony_ci
374cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
375cabdff1aSopenharmony_ci        src0 = LD_SB(src);
376cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
377cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
378cabdff1aSopenharmony_ci        src1 = __msa_sldi_b(src2, src0, 8);
379cabdff1aSopenharmony_ci        src += src_stride;
380cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
381cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
382cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
383cabdff1aSopenharmony_ci                                   out1, out2, out3);
384cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
385cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
386cabdff1aSopenharmony_ci
387cabdff1aSopenharmony_ci        src0 = LD_SB(src);
388cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
389cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
390cabdff1aSopenharmony_ci        src1 = __msa_sldi_b(src2, src0, 8);
391cabdff1aSopenharmony_ci        src += src_stride;
392cabdff1aSopenharmony_ci
393cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
394cabdff1aSopenharmony_ci        ST_UB(out, dst);
395cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
396cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
397cabdff1aSopenharmony_ci        dst += dst_stride;
398cabdff1aSopenharmony_ci
399cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
400cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
401cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
402cabdff1aSopenharmony_ci                                   out1, out2, out3);
403cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
404cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
405cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
406cabdff1aSopenharmony_ci        ST_UB(out, dst);
407cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
408cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
409cabdff1aSopenharmony_ci        dst += dst_stride;
410cabdff1aSopenharmony_ci    }
411cabdff1aSopenharmony_ci}
412cabdff1aSopenharmony_ci
413cabdff1aSopenharmony_cistatic void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
414cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
415cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
416cabdff1aSopenharmony_ci{
417cabdff1aSopenharmony_ci    int32_t loop_cnt;
418cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
419cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
420cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
421cabdff1aSopenharmony_ci
422cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
423cabdff1aSopenharmony_ci    src -= 3;
424cabdff1aSopenharmony_ci
425cabdff1aSopenharmony_ci    /* rearranging filter */
426cabdff1aSopenharmony_ci    filt = LD_SH(filter);
427cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
428cabdff1aSopenharmony_ci
429cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
430cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
431cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
432cabdff1aSopenharmony_ci
433cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
434cabdff1aSopenharmony_ci        src0 = LD_SB(src);
435cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
436cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
437cabdff1aSopenharmony_ci        src1 = __msa_sldi_b(src2, src0, 8);
438cabdff1aSopenharmony_ci
439cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
440cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
441cabdff1aSopenharmony_ci                                   mask2, mask3, filt0, filt1, filt2, filt3,
442cabdff1aSopenharmony_ci                                   out0, out1, out2, out3);
443cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
444cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
445cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
446cabdff1aSopenharmony_ci        ST_UB(out, dst);
447cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
448cabdff1aSopenharmony_ci        ST_UB(out, dst + 16);
449cabdff1aSopenharmony_ci
450cabdff1aSopenharmony_ci        src0 = LD_SB(src + 32);
451cabdff1aSopenharmony_ci        src2 = LD_SB(src + 48);
452cabdff1aSopenharmony_ci        src3 = LD_SB(src + 56);
453cabdff1aSopenharmony_ci        src1 = __msa_sldi_b(src2, src0, 8);
454cabdff1aSopenharmony_ci        src += src_stride;
455cabdff1aSopenharmony_ci
456cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
457cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
458cabdff1aSopenharmony_ci                                   mask2, mask3, filt0, filt1, filt2, filt3,
459cabdff1aSopenharmony_ci                                   out0, out1, out2, out3);
460cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
461cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
462cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
463cabdff1aSopenharmony_ci        ST_UB(out, dst + 32);
464cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
465cabdff1aSopenharmony_ci        ST_UB(out, dst + 48);
466cabdff1aSopenharmony_ci        dst += dst_stride;
467cabdff1aSopenharmony_ci    }
468cabdff1aSopenharmony_ci}
469cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 4-wide column (per the function name),
 * emitting four output rows per loop iteration.
 *
 * src, src_stride: source pointer and row pitch. Loading starts three rows
 *                  above src (see the pointer adjustment below).
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, loaded with LD_SH and splatted from indices
 *                  0..3 into filt0..filt3 (presumably eight int8 taps viewed
 *                  as four tap pairs - confirm against SPLATI_H4_SB).
 * height:          output row count; the loop runs height >> 2 times, so
 *                  only whole groups of four rows are written.
 */
470cabdff1aSopenharmony_cistatic void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
471cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
472cabdff1aSopenharmony_ci                                const int8_t *filter, int32_t height)
473cabdff1aSopenharmony_ci{
474cabdff1aSopenharmony_ci    uint32_t loop_cnt;
475cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
476cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
477cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
478cabdff1aSopenharmony_ci    v16i8 src10998, filt0, filt1, filt2, filt3;
479cabdff1aSopenharmony_ci    v16u8 out;
480cabdff1aSopenharmony_ci    v8i16 filt, out10, out32;
481cabdff1aSopenharmony_ci
    /* Step back three rows so the first loaded row is three above src. */
482cabdff1aSopenharmony_ci    src -= (3 * src_stride);
483cabdff1aSopenharmony_ci
484cabdff1aSopenharmony_ci    filt = LD_SH(filter);
485cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
486cabdff1aSopenharmony_ci
    /* Prime the window with the first seven rows (src0..src6). */
487cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
488cabdff1aSopenharmony_ci    src += (7 * src_stride);
489cabdff1aSopenharmony_ci
    /*
     * Presumably interleaves adjacent row pairs (1|0, 3|2, 5|4, 2|1, 4|3,
     * 6|5) and then packs two pairs per vector, as the names src2110,
     * src4332 and src6554 suggest - confirm against the ILVR_* macros.
     */
490cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
491cabdff1aSopenharmony_ci               src54_r, src21_r);
492cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
493cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
494cabdff1aSopenharmony_ci               src4332, src6554);
    /* Presumably XORs each byte with 128 to move pixels into signed range. */
495cabdff1aSopenharmony_ci    XORI_B3_128_SB(src2110, src4332, src6554);
496cabdff1aSopenharmony_ci
    /* Each iteration consumes four new rows and stores four output rows. */
497cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
498cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
499cabdff1aSopenharmony_ci        src += (4 * src_stride);
500cabdff1aSopenharmony_ci
501cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
502cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
503cabdff1aSopenharmony_ci        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
504cabdff1aSopenharmony_ci        XORI_B2_128_SB(src8776, src10998);
        /* out10 uses the window starting at src2110, out32 the one at src4332. */
505cabdff1aSopenharmony_ci        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
506cabdff1aSopenharmony_ci                                    filt1, filt2, filt3);
507cabdff1aSopenharmony_ci        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
508cabdff1aSopenharmony_ci                                    filt1, filt2, filt3);
        /* Presumably round-shift by 7 bits, then saturate before packing. */
509cabdff1aSopenharmony_ci        SRARI_H2_SH(out10, out32, 7);
510cabdff1aSopenharmony_ci        SAT_SH2_SH(out10, out32, 7);
511cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out10, out32);
512cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
513cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
514cabdff1aSopenharmony_ci
        /* Slide the window: the newest vectors become the oldest history. */
515cabdff1aSopenharmony_ci        src2110 = src6554;
516cabdff1aSopenharmony_ci        src4332 = src8776;
517cabdff1aSopenharmony_ci        src6554 = src10998;
518cabdff1aSopenharmony_ci        src6 = src10;
519cabdff1aSopenharmony_ci    }
520cabdff1aSopenharmony_ci}
521cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for an 8-wide column (per the function name),
 * emitting four output rows per loop iteration.
 *
 * src, src_stride: source pointer and row pitch. Loading starts three rows
 *                  above src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, loaded with LD_SH and splatted from indices
 *                  0..3 into filt0..filt3.
 * height:          output row count; the loop runs height >> 2 times, so
 *                  only whole groups of four rows are written.
 */
522cabdff1aSopenharmony_cistatic void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
523cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
524cabdff1aSopenharmony_ci                                const int8_t *filter, int32_t height)
525cabdff1aSopenharmony_ci{
526cabdff1aSopenharmony_ci    uint32_t loop_cnt;
527cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
528cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
529cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
530cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
531cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
532cabdff1aSopenharmony_ci
    /* Step back three rows so the first loaded row is three above src. */
533cabdff1aSopenharmony_ci    src -= (3 * src_stride);
534cabdff1aSopenharmony_ci
535cabdff1aSopenharmony_ci    filt = LD_SH(filter);
536cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
537cabdff1aSopenharmony_ci
    /*
     * Prime the window with seven rows; unlike the 4-wide variant, the
     * 128-XOR is applied to the raw rows before they are interleaved.
     */
538cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
539cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
540cabdff1aSopenharmony_ci    src += (7 * src_stride);
    /* Presumably interleaves adjacent row pairs (1|0, 3|2, 5|4, 2|1, 4|3, 6|5). */
541cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
542cabdff1aSopenharmony_ci               src54_r, src21_r);
543cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
544cabdff1aSopenharmony_ci
    /* Each iteration consumes four new rows and stores four output rows. */
545cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
546cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
547cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
548cabdff1aSopenharmony_ci        src += (4 * src_stride);
549cabdff1aSopenharmony_ci
550cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
551cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
        /* One filter call per output row, each window shifted down one row. */
552cabdff1aSopenharmony_ci        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
553cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
554cabdff1aSopenharmony_ci        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
555cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
556cabdff1aSopenharmony_ci        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
557cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
558cabdff1aSopenharmony_ci        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
559cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
        /* Presumably round-shift by 7 bits, then saturate before packing. */
560cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
561cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
562cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
563cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
564cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
565cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
566cabdff1aSopenharmony_ci
        /* Slide the window down four rows for the next iteration. */
567cabdff1aSopenharmony_ci        src10_r = src54_r;
568cabdff1aSopenharmony_ci        src32_r = src76_r;
569cabdff1aSopenharmony_ci        src54_r = src98_r;
570cabdff1aSopenharmony_ci        src21_r = src65_r;
571cabdff1aSopenharmony_ci        src43_r = src87_r;
572cabdff1aSopenharmony_ci        src65_r = src109_r;
573cabdff1aSopenharmony_ci        src6 = src10;
574cabdff1aSopenharmony_ci    }
575cabdff1aSopenharmony_ci}
576cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 16-wide column (per the function name),
 * emitting four output rows per loop iteration. The same steps are run on
 * two sets of interleaved vectors, suffixed _r and _l (presumably the right
 * and left halves of each 16-byte row - confirm against ILVR_x/ILVL_x).
 *
 * src, src_stride: source pointer and row pitch. Loading starts three rows
 *                  above src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, loaded with LD_SH and splatted from indices
 *                  0..3 into filt0..filt3.
 * height:          output row count; the loop runs height >> 2 times, so
 *                  only whole groups of four rows are written.
 */
577cabdff1aSopenharmony_cistatic void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
578cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
579cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
580cabdff1aSopenharmony_ci{
581cabdff1aSopenharmony_ci    uint32_t loop_cnt;
582cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
583cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
584cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
585cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
586cabdff1aSopenharmony_ci    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
587cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
588cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
589cabdff1aSopenharmony_ci
    /* Step back three rows so the first loaded row is three above src. */
590cabdff1aSopenharmony_ci    src -= (3 * src_stride);
591cabdff1aSopenharmony_ci
592cabdff1aSopenharmony_ci    filt = LD_SH(filter);
593cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
594cabdff1aSopenharmony_ci
    /* Prime the window with seven rows and build both interleaved sets. */
595cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
596cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
597cabdff1aSopenharmony_ci    src += (7 * src_stride);
598cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
599cabdff1aSopenharmony_ci               src54_r, src21_r);
600cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
601cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
602cabdff1aSopenharmony_ci               src54_l, src21_l);
603cabdff1aSopenharmony_ci    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
604cabdff1aSopenharmony_ci
    /* Each iteration consumes four new rows and stores four output rows. */
605cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
606cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
607cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
608cabdff1aSopenharmony_ci        src += (4 * src_stride);
609cabdff1aSopenharmony_ci
610cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
611cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
612cabdff1aSopenharmony_ci        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
613cabdff1aSopenharmony_ci                   src87_l, src98_l, src109_l);
        /* Four output rows from the _r set, then the same four from _l. */
614cabdff1aSopenharmony_ci        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
615cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
616cabdff1aSopenharmony_ci        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
617cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
618cabdff1aSopenharmony_ci        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
619cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
620cabdff1aSopenharmony_ci        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
621cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
622cabdff1aSopenharmony_ci        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
623cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
624cabdff1aSopenharmony_ci        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
625cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
626cabdff1aSopenharmony_ci        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
627cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
628cabdff1aSopenharmony_ci        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
629cabdff1aSopenharmony_ci                                     filt1, filt2, filt3);
        /* Presumably round-shift by 7 bits, then saturate before packing. */
630cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
631cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
632cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
633cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        /* Combine each row's _l and _r results into one vector per row. */
634cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
635cabdff1aSopenharmony_ci                    out3_r, tmp0, tmp1, tmp2, tmp3);
636cabdff1aSopenharmony_ci        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
637cabdff1aSopenharmony_ci        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
638cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
639cabdff1aSopenharmony_ci
        /* Slide both windows down four rows for the next iteration. */
640cabdff1aSopenharmony_ci        src10_r = src54_r;
641cabdff1aSopenharmony_ci        src32_r = src76_r;
642cabdff1aSopenharmony_ci        src54_r = src98_r;
643cabdff1aSopenharmony_ci        src21_r = src65_r;
644cabdff1aSopenharmony_ci        src43_r = src87_r;
645cabdff1aSopenharmony_ci        src65_r = src109_r;
646cabdff1aSopenharmony_ci        src10_l = src54_l;
647cabdff1aSopenharmony_ci        src32_l = src76_l;
648cabdff1aSopenharmony_ci        src54_l = src98_l;
649cabdff1aSopenharmony_ci        src21_l = src65_l;
650cabdff1aSopenharmony_ci        src43_l = src87_l;
651cabdff1aSopenharmony_ci        src65_l = src109_l;
652cabdff1aSopenharmony_ci        src6 = src10;
653cabdff1aSopenharmony_ci    }
654cabdff1aSopenharmony_ci}
655cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for widths that are a multiple of 16. The block is
 * processed as width >> 4 independent 16-byte-wide columns; within each
 * column the work mirrors the single 16-wide kernel (seven-row prologue,
 * then four output rows per inner iteration).
 *
 * src, src_stride: source pointer and row pitch. Loading starts three rows
 *                  above src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, loaded with LD_SH and splatted from indices
 *                  0..3 into filt0..filt3.
 * height:          output row count; the inner loop runs height >> 2 times,
 *                  so only whole groups of four rows are written.
 * width:           block width in bytes; only whole 16-byte columns are
 *                  processed (width >> 4).
 */
656cabdff1aSopenharmony_cistatic void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
657cabdff1aSopenharmony_ci                                      uint8_t *dst, int32_t dst_stride,
658cabdff1aSopenharmony_ci                                      const int8_t *filter, int32_t height,
659cabdff1aSopenharmony_ci                                      int32_t width)
660cabdff1aSopenharmony_ci{
661cabdff1aSopenharmony_ci    const uint8_t *src_tmp;
662cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
663cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
664cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
665cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
666cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
667cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
668cabdff1aSopenharmony_ci    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
669cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
670cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
671cabdff1aSopenharmony_ci
    /* Step back three rows so the first loaded row is three above src. */
672cabdff1aSopenharmony_ci    src -= (3 * src_stride);
673cabdff1aSopenharmony_ci
674cabdff1aSopenharmony_ci    filt = LD_SH(filter);
675cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
676cabdff1aSopenharmony_ci
    /* Outer loop: one pass per 16-byte-wide column. */
677cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
        /* Work on copies so src/dst keep pointing at the column's top. */
678cabdff1aSopenharmony_ci        src_tmp = src;
679cabdff1aSopenharmony_ci        dst_tmp = dst;
680cabdff1aSopenharmony_ci
        /* The seven-row prologue is redone for every column. */
681cabdff1aSopenharmony_ci        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
682cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
683cabdff1aSopenharmony_ci        src_tmp += (7 * src_stride);
684cabdff1aSopenharmony_ci        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
685cabdff1aSopenharmony_ci                   src32_r, src54_r, src21_r);
686cabdff1aSopenharmony_ci        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
687cabdff1aSopenharmony_ci        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
688cabdff1aSopenharmony_ci                   src32_l, src54_l, src21_l);
689cabdff1aSopenharmony_ci        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
690cabdff1aSopenharmony_ci
        /* Inner loop: four new rows in, four output rows out. */
691cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
692cabdff1aSopenharmony_ci            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
693cabdff1aSopenharmony_ci            XORI_B4_128_SB(src7, src8, src9, src10);
694cabdff1aSopenharmony_ci            src_tmp += (4 * src_stride);
695cabdff1aSopenharmony_ci            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
696cabdff1aSopenharmony_ci                       src87_r, src98_r, src109_r);
697cabdff1aSopenharmony_ci            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
698cabdff1aSopenharmony_ci                       src87_l, src98_l, src109_l);
699cabdff1aSopenharmony_ci            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
700cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
701cabdff1aSopenharmony_ci            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
702cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
703cabdff1aSopenharmony_ci            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
704cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
705cabdff1aSopenharmony_ci            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
706cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
707cabdff1aSopenharmony_ci            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
708cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
709cabdff1aSopenharmony_ci            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
710cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
711cabdff1aSopenharmony_ci            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
712cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
713cabdff1aSopenharmony_ci            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
714cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
            /* Presumably round-shift by 7 bits, then saturate before packing. */
715cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
716cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
717cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
718cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
719cabdff1aSopenharmony_ci            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
720cabdff1aSopenharmony_ci                        out3_r, tmp0, tmp1, tmp2, tmp3);
721cabdff1aSopenharmony_ci            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
722cabdff1aSopenharmony_ci            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
723cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
724cabdff1aSopenharmony_ci
            /* Slide both windows down four rows for the next iteration. */
725cabdff1aSopenharmony_ci            src10_r = src54_r;
726cabdff1aSopenharmony_ci            src32_r = src76_r;
727cabdff1aSopenharmony_ci            src54_r = src98_r;
728cabdff1aSopenharmony_ci            src21_r = src65_r;
729cabdff1aSopenharmony_ci            src43_r = src87_r;
730cabdff1aSopenharmony_ci            src65_r = src109_r;
731cabdff1aSopenharmony_ci            src10_l = src54_l;
732cabdff1aSopenharmony_ci            src32_l = src76_l;
733cabdff1aSopenharmony_ci            src54_l = src98_l;
734cabdff1aSopenharmony_ci            src21_l = src65_l;
735cabdff1aSopenharmony_ci            src43_l = src87_l;
736cabdff1aSopenharmony_ci            src65_l = src109_l;
737cabdff1aSopenharmony_ci            src6 = src10;
738cabdff1aSopenharmony_ci        }
739cabdff1aSopenharmony_ci
        /* Move on to the next 16-byte column. */
740cabdff1aSopenharmony_ci        src += 16;
741cabdff1aSopenharmony_ci        dst += 16;
742cabdff1aSopenharmony_ci    }
743cabdff1aSopenharmony_ci}
744cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 32-wide block: forwards all arguments to
 * common_vt_8t_16w_mult_msa() with width fixed at 32.
 */
745cabdff1aSopenharmony_cistatic void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
746cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
747cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
748cabdff1aSopenharmony_ci{
749cabdff1aSopenharmony_ci    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
750cabdff1aSopenharmony_ci                              32);
751cabdff1aSopenharmony_ci}
752cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 64-wide block: forwards all arguments to
 * common_vt_8t_16w_mult_msa() with width fixed at 64.
 */
753cabdff1aSopenharmony_cistatic void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
754cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
755cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height)
756cabdff1aSopenharmony_ci{
757cabdff1aSopenharmony_ci    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
758cabdff1aSopenharmony_ci                              64);
759cabdff1aSopenharmony_ci}
760cabdff1aSopenharmony_ci
/*
 * Combined horizontal + vertical 8-tap filter for a 4-wide column (per the
 * function name), emitting four output rows per loop iteration. Rows are
 * filtered horizontally first, and the vertical filter is then applied to
 * those intermediate results.
 *
 * src, src_stride: source pointer and row pitch. Loading starts three rows
 *                  above and three bytes left of src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter_horiz:    horizontal coefficients, splatted into filt_hz0..3.
 * filter_vert:     vertical coefficients, splatted into filt_vt0..3.
 * height:          output row count; the loop runs height >> 2 times, so
 *                  only whole groups of four rows are written.
 */
761cabdff1aSopenharmony_cistatic void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
762cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
763cabdff1aSopenharmony_ci                                     const int8_t *filter_horiz,
764cabdff1aSopenharmony_ci                                     const int8_t *filter_vert,
765cabdff1aSopenharmony_ci                                     int32_t height)
766cabdff1aSopenharmony_ci{
767cabdff1aSopenharmony_ci    uint32_t loop_cnt;
768cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
769cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
770cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, out;
771cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
772cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
773cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
774cabdff1aSopenharmony_ci
    /* Second row of mc_filt_mask_arr, labelled "4 width cases" in the table. */
775cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* Step back three bytes and three rows to the top-left of the support. */
776cabdff1aSopenharmony_ci    src -= (3 + 3 * src_stride);
777cabdff1aSopenharmony_ci
778cabdff1aSopenharmony_ci    /* rearranging filter */
779cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
780cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
781cabdff1aSopenharmony_ci
    /* The other masks are mask0 with every element raised by 2, 4 and 6. */
782cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
783cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
784cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
785cabdff1aSopenharmony_ci
    /* Prime the window with the first seven rows. */
786cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
787cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
788cabdff1aSopenharmony_ci    src += (7 * src_stride);
789cabdff1aSopenharmony_ci
    /*
     * Each call takes two source rows, so hz_out0/2/4/5 cover row pairs
     * (0,1), (2,3), (4,5) and (5,6); presumably one row per 8-byte half.
     */
790cabdff1aSopenharmony_ci    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
791cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
792cabdff1aSopenharmony_ci    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
793cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
794cabdff1aSopenharmony_ci    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
795cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
796cabdff1aSopenharmony_ci    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
797cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
    /*
     * Presumably derives the (1,2) and (3,4) pairs by sliding 8 bytes
     * across neighbouring results - confirm against SLDI_B2_SH.
     */
798cabdff1aSopenharmony_ci    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
799cabdff1aSopenharmony_ci
800cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
801cabdff1aSopenharmony_ci    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
802cabdff1aSopenharmony_ci
    /* Pair up horizontal results as the initial vertical-filter history. */
803cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
804cabdff1aSopenharmony_ci    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
805cabdff1aSopenharmony_ci
    /* Each iteration consumes four new rows and stores four output rows. */
806cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
807cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
808cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
809cabdff1aSopenharmony_ci        src += (4 * src_stride);
810cabdff1aSopenharmony_ci
        /* New rows 7 and 8; hz_out6 is built from hz_out5 and hz_out7. */
811cabdff1aSopenharmony_ci        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
812cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
813cabdff1aSopenharmony_ci        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
814cabdff1aSopenharmony_ci        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
815cabdff1aSopenharmony_ci        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
816cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
817cabdff1aSopenharmony_ci
        /* New rows 9 and 10; hz_out8 is built from hz_out7 and hz_out9. */
818cabdff1aSopenharmony_ci        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
819cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
820cabdff1aSopenharmony_ci        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
821cabdff1aSopenharmony_ci        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
822cabdff1aSopenharmony_ci        tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
823cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
        /* Presumably round-shift by 7 bits, then saturate before packing. */
824cabdff1aSopenharmony_ci        SRARI_H2_SH(tmp0, tmp1, 7);
825cabdff1aSopenharmony_ci        SAT_SH2_SH(tmp0, tmp1, 7);
826cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
827cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
828cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
829cabdff1aSopenharmony_ci
        /* Slide the history: the latest results seed the next iteration. */
830cabdff1aSopenharmony_ci        hz_out5 = hz_out9;
831cabdff1aSopenharmony_ci        out0 = out2;
832cabdff1aSopenharmony_ci        out1 = out3;
833cabdff1aSopenharmony_ci        out2 = out4;
834cabdff1aSopenharmony_ci    }
835cabdff1aSopenharmony_ci}
836cabdff1aSopenharmony_ci
/*
 * Combined horizontal + vertical 8-tap filter for an 8-wide column (per the
 * function name), emitting four output rows per loop iteration. Each source
 * row is filtered horizontally on its own, and the vertical filter is then
 * applied to pairs of those intermediate rows.
 *
 * Two chains of paired horizontal results are kept:
 *   out0..out3, out8 pair rows (0,1) (2,3) (4,5) (6,7) (8,9)
 *   out4..out7, out9 pair rows (1,2) (3,4) (5,6) (7,8) (9,10)
 * Output rows 0 and 2 read the first chain; rows 1 and 3 read the second.
 *
 * src, src_stride: source pointer and row pitch. Loading starts three rows
 *                  above and three bytes left of src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter_horiz:    horizontal coefficients, splatted into filt_hz0..3.
 * filter_vert:     vertical coefficients, splatted into filt_vt0..3.
 * height:          output row count; the loop runs height >> 2 times, so
 *                  only whole groups of four rows are written.
 */
837cabdff1aSopenharmony_cistatic void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
838cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
839cabdff1aSopenharmony_ci                                     const int8_t *filter_horiz,
840cabdff1aSopenharmony_ci                                     const int8_t *filter_vert,
841cabdff1aSopenharmony_ci                                     int32_t height)
842cabdff1aSopenharmony_ci{
843cabdff1aSopenharmony_ci    uint32_t loop_cnt;
844cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
845cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
846cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
847cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
848cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
849cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
850cabdff1aSopenharmony_ci    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
851cabdff1aSopenharmony_ci
    /* First row of mc_filt_mask_arr, labelled "8 width cases" in the table. */
852cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* Step back three bytes and three rows to the top-left of the support. */
853cabdff1aSopenharmony_ci    src -= (3 + 3 * src_stride);
854cabdff1aSopenharmony_ci
855cabdff1aSopenharmony_ci    /* rearranging filter */
856cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
857cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
858cabdff1aSopenharmony_ci
    /* The other masks are mask0 with every element raised by 2, 4 and 6. */
859cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
860cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
861cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
862cabdff1aSopenharmony_ci
    /* Prime the window with the first seven rows. */
863cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
864cabdff1aSopenharmony_ci    src += (7 * src_stride);
865cabdff1aSopenharmony_ci
866cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    /* One horizontal pass per row; the same row is passed as both inputs. */
867cabdff1aSopenharmony_ci    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
868cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
869cabdff1aSopenharmony_ci    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
870cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
871cabdff1aSopenharmony_ci    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
872cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
873cabdff1aSopenharmony_ci    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
874cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
875cabdff1aSopenharmony_ci    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
876cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
877cabdff1aSopenharmony_ci    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
878cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
879cabdff1aSopenharmony_ci    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
880cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
881cabdff1aSopenharmony_ci
882cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
883cabdff1aSopenharmony_ci    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
884cabdff1aSopenharmony_ci
    /* Seed both chains: out0..out2 from (0,1)..(4,5), out4..out6 from (1,2)..(5,6). */
885cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
886cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
887cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
888cabdff1aSopenharmony_ci
    /* Each iteration consumes four new rows and stores four output rows. */
889cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
890cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
891cabdff1aSopenharmony_ci        src += (4 * src_stride);
892cabdff1aSopenharmony_ci
893cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
894cabdff1aSopenharmony_ci
        /* Output row 0: first chain, extended with the (6,7) pair. */
895cabdff1aSopenharmony_ci        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
896cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
897cabdff1aSopenharmony_ci        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
898cabdff1aSopenharmony_ci        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
899cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
900cabdff1aSopenharmony_ci
        /* Output row 1: second chain, extended with the (7,8) pair. */
901cabdff1aSopenharmony_ci        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
902cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
903cabdff1aSopenharmony_ci        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
904cabdff1aSopenharmony_ci        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
905cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
906cabdff1aSopenharmony_ci
        /* Output row 2: first chain shifted by one pair, plus (8,9). */
907cabdff1aSopenharmony_ci        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
908cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
909cabdff1aSopenharmony_ci        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
910cabdff1aSopenharmony_ci        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
911cabdff1aSopenharmony_ci                                   filt_vt1, filt_vt2, filt_vt3);
912cabdff1aSopenharmony_ci
        /* Output row 3: second chain shifted by one pair, plus (9,10). */
913cabdff1aSopenharmony_ci        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
914cabdff1aSopenharmony_ci                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
915cabdff1aSopenharmony_ci        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
916cabdff1aSopenharmony_ci        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
917cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
        /* Presumably round-shift by 7 bits, then saturate before packing. */
918cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
919cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
920cabdff1aSopenharmony_ci        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
921cabdff1aSopenharmony_ci        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
922cabdff1aSopenharmony_ci        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
923cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
924cabdff1aSopenharmony_ci
        /*
         * Slide both chains down four rows. out4 is assigned from the old
         * out6 before out6 itself is overwritten, so the order matters.
         */
925cabdff1aSopenharmony_ci        hz_out6 = hz_out10;
926cabdff1aSopenharmony_ci        out0 = out2;
927cabdff1aSopenharmony_ci        out1 = out3;
928cabdff1aSopenharmony_ci        out2 = out8;
929cabdff1aSopenharmony_ci        out4 = out6;
930cabdff1aSopenharmony_ci        out5 = out7;
931cabdff1aSopenharmony_ci        out6 = out9;
932cabdff1aSopenharmony_ci    }
933cabdff1aSopenharmony_ci}
934cabdff1aSopenharmony_ci
/*
 * 16-pixel-wide variant of the combined horizontal/vertical 8-tap filter.
 *
 * The block is handled as two side-by-side 8-pixel-wide columns; each one is
 * passed to common_hv_8ht_8vt_8w_msa() with the same strides, filters and
 * height, at column offsets 0 and 8.
 */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col_off;

    for (col_off = 0; col_off < 16; col_off += 8) {
        common_hv_8ht_8vt_8w_msa(src + col_off, src_stride,
                                 dst + col_off, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
951cabdff1aSopenharmony_ci
/*
 * 32-pixel-wide variant of the combined horizontal/vertical 8-tap filter.
 *
 * The block is handled as four side-by-side 8-pixel-wide columns; each one is
 * passed to common_hv_8ht_8vt_8w_msa() with the same strides, filters and
 * height, at column offsets 0, 8, 16 and 24.
 */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col_off;

    for (col_off = 0; col_off < 32; col_off += 8) {
        common_hv_8ht_8vt_8w_msa(src + col_off, src_stride,
                                 dst + col_off, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
968cabdff1aSopenharmony_ci
/*
 * 64-pixel-wide variant of the combined horizontal/vertical 8-tap filter.
 *
 * The block is handled as eight side-by-side 8-pixel-wide columns; each one
 * is passed to common_hv_8ht_8vt_8w_msa() with the same strides, filters and
 * height, at column offsets 0, 8, ..., 56.
 */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col_off;

    for (col_off = 0; col_off < 64; col_off += 8) {
        common_hv_8ht_8vt_8w_msa(src + col_off, src_stride,
                                 dst + col_off, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
985cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter of a 4x4 block, averaged into dst.
 *
 * Four source rows are read starting 3 bytes to the left of src, filtered
 * with HORIZ_8TAP_4WID_4VECS_FILT, rounded by a shift of 7, saturated,
 * packed to bytes and averaged (__msa_aver_u_b) with the four 32-bit words
 * already present in dst before being stored back.
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 *
 * NOTE(review): LD_SH presumably loads a full vector from filter; confirm
 * the filter table is readable that far.
 */
986cabdff1aSopenharmony_cistatic void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
987cabdff1aSopenharmony_ci                                              int32_t src_stride,
988cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
989cabdff1aSopenharmony_ci                                              const int8_t *filter)
990cabdff1aSopenharmony_ci{
991cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
992cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
993cabdff1aSopenharmony_ci    v16u8 dst0, res;
994cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3;
995cabdff1aSopenharmony_ci    v8i16 filt, res0, res1;
996cabdff1aSopenharmony_ci
    /* second row of mc_filt_mask_arr: the "4 width cases" shuffle pattern */
997cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back 3 pixels so the 8-tap window covers src[-3] .. src[+4] */
998cabdff1aSopenharmony_ci    src -= 3;
999cabdff1aSopenharmony_ci
1000cabdff1aSopenharmony_ci    /* rearranging filter */
1001cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1002cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1003cabdff1aSopenharmony_ci
    /* masks for the following tap pairs: same pattern shifted by 2, 4, 6 */
1004cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1005cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1006cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1007cabdff1aSopenharmony_ci
1008cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* XOR with 128 - presumably maps unsigned pixels to the signed range */
1009cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1010cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1011cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, res0, res1);
    /* gather the current 4x4 destination block into one vector */
1012cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1013cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1014cabdff1aSopenharmony_ci    SRARI_H2_SH(res0, res1, 7);
1015cabdff1aSopenharmony_ci    SAT_SH2_SH(res0, res1, 7);
1016cabdff1aSopenharmony_ci    res = PCKEV_XORI128_UB(res0, res1);
    /* average the filtered pixels with the existing destination pixels */
1017cabdff1aSopenharmony_ci    res = (v16u8) __msa_aver_u_b(res, dst0);
1018cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
1019cabdff1aSopenharmony_ci}
1020cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter of a 4x8 block, averaged into dst.
 *
 * Eight source rows are read in two groups of four, starting 3 bytes to the
 * left of src. Each group goes through HORIZ_8TAP_4WID_4VECS_FILT; the four
 * result vectors are rounded by a shift of 7, saturated, packed to bytes,
 * averaged with the eight 32-bit words already in dst and stored back.
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 */
1021cabdff1aSopenharmony_cistatic void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
1022cabdff1aSopenharmony_ci                                              int32_t src_stride,
1023cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
1024cabdff1aSopenharmony_ci                                              const int8_t *filter)
1025cabdff1aSopenharmony_ci{
1026cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1027cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1028cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
1029cabdff1aSopenharmony_ci    v16u8 dst0, dst1;
1030cabdff1aSopenharmony_ci    v8i16 filt, vec0, vec1, vec2, vec3;
1031cabdff1aSopenharmony_ci
    /* second row of mc_filt_mask_arr: the "4 width cases" shuffle pattern */
1032cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back 3 pixels so the 8-tap window covers src[-3] .. src[+4] */
1033cabdff1aSopenharmony_ci    src -= 3;
1034cabdff1aSopenharmony_ci
1035cabdff1aSopenharmony_ci    /* rearranging filter */
1036cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1037cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1038cabdff1aSopenharmony_ci
    /* masks for the following tap pairs: same pattern shifted by 2, 4, 6 */
1039cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1040cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1041cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1042cabdff1aSopenharmony_ci
    /* rows 0..3 of the source */
1043cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1044cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1045cabdff1aSopenharmony_ci    src += (4 * src_stride);
    /* current destination: rows 0..3 into dst0, rows 4..7 into dst1 */
1046cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1047cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1048cabdff1aSopenharmony_ci    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
1049cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1050cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1051cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
    /* rows 4..7 of the source */
1052cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1053cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
1054cabdff1aSopenharmony_ci    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1055cabdff1aSopenharmony_ci                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
1056cabdff1aSopenharmony_ci    SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
1057cabdff1aSopenharmony_ci    SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
1058cabdff1aSopenharmony_ci    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1059cabdff1aSopenharmony_ci                res0, res1, res2, res3);
    /* merge the packed halves so res0 / res2 each hold four output rows */
1060cabdff1aSopenharmony_ci    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
1061cabdff1aSopenharmony_ci    XORI_B2_128_UB(res0, res2);
    /* average with the existing destination pixels, then store 8 rows */
1062cabdff1aSopenharmony_ci    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
1063cabdff1aSopenharmony_ci    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1064cabdff1aSopenharmony_ci}
1065cabdff1aSopenharmony_ci
/*
 * Entry point for the 4-pixel-wide horizontal 8-tap filter with averaging
 * into dst.
 *
 * Selects the fixed-size kernel that matches height: 4 rows go to the 4x4
 * routine, 8 rows to the 4x8 routine. Any other height performs no work and
 * leaves dst untouched.
 */
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_and_aver_dst_4x4_msa(src, src_stride,
                                          dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_and_aver_dst_4x8_msa(src, src_stride,
                                          dst, dst_stride, filter);
        break;
    default:
        /* no kernel for this height: nothing is read or written */
        break;
    }
}
1080cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter of an 8-pixel-wide block, averaged into dst.
 *
 * Rows are processed four at a time for (height >> 2) iterations, so any
 * remainder of height modulo 4 is not processed. Each group is filtered
 * with HORIZ_8TAP_8WID_4VECS_FILT, rounded by a shift of 7, saturated and
 * handed to CONVERT_UB_AVG_ST8x4_UB together with the existing destination
 * rows (per the macro name: convert to bytes, average, store 8x4).
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 * height:          number of rows.
 */
1081cabdff1aSopenharmony_cistatic void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
1082cabdff1aSopenharmony_ci                                             int32_t src_stride,
1083cabdff1aSopenharmony_ci                                             uint8_t *dst, int32_t dst_stride,
1084cabdff1aSopenharmony_ci                                             const int8_t *filter,
1085cabdff1aSopenharmony_ci                                             int32_t height)
1086cabdff1aSopenharmony_ci{
1087cabdff1aSopenharmony_ci    int32_t loop_cnt;
1088cabdff1aSopenharmony_ci    int64_t tp0, tp1, tp2, tp3;
1089cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1090cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1091cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
1092cabdff1aSopenharmony_ci
    /* first row of mc_filt_mask_arr: the "8 width cases" shuffle pattern */
1093cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back 3 pixels so the 8-tap window covers src[-3] .. src[+4] */
1094cabdff1aSopenharmony_ci    src -= 3;
1095cabdff1aSopenharmony_ci
1096cabdff1aSopenharmony_ci    /* rearranging filter */
1097cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1098cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1099cabdff1aSopenharmony_ci
    /* masks for the following tap pairs: same pattern shifted by 2, 4, 6 */
1100cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1101cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1102cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1103cabdff1aSopenharmony_ci
1104cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1105cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
1106cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
1107cabdff1aSopenharmony_ci        src += (4 * src_stride);
1108cabdff1aSopenharmony_ci        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1109cabdff1aSopenharmony_ci                                   mask3, filt0, filt1, filt2, filt3, out0,
1110cabdff1aSopenharmony_ci                                   out1, out2, out3);
        /* current destination: two 8-byte rows per vector */
1111cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1112cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
1113cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
1114cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
1115cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
1116cabdff1aSopenharmony_ci        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1117cabdff1aSopenharmony_ci                                dst, dst_stride);
1118cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1119cabdff1aSopenharmony_ci    }
1120cabdff1aSopenharmony_ci}
1121cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter of a 16-pixel-wide block, averaged into dst.
 *
 * Two rows are processed per iteration for (height >> 1) iterations, so an
 * odd trailing row is not processed. Each row is loaded as two vectors
 * (at src and src + 8), shuffled into tap-pair order with the four masks,
 * multiplied with the four filter splats, rounded by a shift of 7,
 * saturated and passed to PCKEV_XORI128_AVG_ST_UB together with the
 * existing destination row (per the macro name: pack, XOR 128, average,
 * store).
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 * height:          number of rows.
 */
1122cabdff1aSopenharmony_cistatic void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
1123cabdff1aSopenharmony_ci                                              int32_t src_stride,
1124cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
1125cabdff1aSopenharmony_ci                                              const int8_t *filter,
1126cabdff1aSopenharmony_ci                                              int32_t height)
1127cabdff1aSopenharmony_ci{
1128cabdff1aSopenharmony_ci    int32_t loop_cnt;
1129cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1130cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1131cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
1132cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1133cabdff1aSopenharmony_ci    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1134cabdff1aSopenharmony_ci
    /* first row of mc_filt_mask_arr: the "8 width cases" shuffle pattern */
1135cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back 3 pixels so the 8-tap window covers src[-3] .. src[+4] */
1136cabdff1aSopenharmony_ci    src -= 3;
1137cabdff1aSopenharmony_ci
1138cabdff1aSopenharmony_ci    /* rearranging filter */
1139cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1140cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1141cabdff1aSopenharmony_ci
    /* masks for the following tap pairs: same pattern shifted by 2, 4, 6 */
1142cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1143cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1144cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1145cabdff1aSopenharmony_ci
1146cabdff1aSopenharmony_ci    for (loop_cnt = height >> 1; loop_cnt--;) {
        /* src0/src1: first row at offsets 0 and 8; src2/src3: second row */
1147cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src0, src2);
1148cabdff1aSopenharmony_ci        LD_SB2(src + 8, src_stride, src1, src3);
1149cabdff1aSopenharmony_ci        src += (2 * src_stride);
1150cabdff1aSopenharmony_ci
1151cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
1152cabdff1aSopenharmony_ci        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1153cabdff1aSopenharmony_ci                   vec12);
1154cabdff1aSopenharmony_ci        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1155cabdff1aSopenharmony_ci                   vec13);
1156cabdff1aSopenharmony_ci        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1157cabdff1aSopenharmony_ci                   vec14);
1158cabdff1aSopenharmony_ci        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1159cabdff1aSopenharmony_ci                   vec15);
        /*
         * Two partial sums per vector: filt0 then filt1 accumulate into
         * vec0..3, filt2 then filt3 into vec8..11; ADDS_SH4_SH combines
         * the two halves into out0..3.
         */
1160cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1161cabdff1aSopenharmony_ci                    vec1, vec2, vec3);
1162cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1163cabdff1aSopenharmony_ci                    vec9, vec10, vec11);
1164cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1165cabdff1aSopenharmony_ci                     vec1, vec2, vec3);
1166cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1167cabdff1aSopenharmony_ci                     vec8, vec9, vec10, vec11);
1168cabdff1aSopenharmony_ci        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1169cabdff1aSopenharmony_ci                    out1, out2, out3);
        /* existing destination pixels for the two rows */
1170cabdff1aSopenharmony_ci        LD_UB2(dst, dst_stride, dst0, dst1);
1171cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
1172cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
1173cabdff1aSopenharmony_ci        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
1174cabdff1aSopenharmony_ci        dst += dst_stride;
1175cabdff1aSopenharmony_ci        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
1176cabdff1aSopenharmony_ci        dst += dst_stride;
1177cabdff1aSopenharmony_ci    }
1178cabdff1aSopenharmony_ci}
1179cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter of a 32-pixel-wide block, averaged into dst.
 *
 * One row is processed per iteration, height iterations in total. The row
 * is covered by four vectors: loads at src, src + 16 and src + 24, plus one
 * derived from the first two with __msa_sldi_b. They are shuffled into
 * tap-pair order, multiplied with the four filter splats, rounded by a
 * shift of 7, saturated and passed to PCKEV_XORI128_AVG_ST_UB together with
 * the existing destination pixels at dst and dst + 16.
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 * height:          number of rows.
 */
1180cabdff1aSopenharmony_cistatic void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
1181cabdff1aSopenharmony_ci                                              int32_t src_stride,
1182cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
1183cabdff1aSopenharmony_ci                                              const int8_t *filter,
1184cabdff1aSopenharmony_ci                                              int32_t height)
1185cabdff1aSopenharmony_ci{
1186cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1187cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1188cabdff1aSopenharmony_ci    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1189cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
1190cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1191cabdff1aSopenharmony_ci    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1192cabdff1aSopenharmony_ci
    /* first row of mc_filt_mask_arr: the "8 width cases" shuffle pattern */
1193cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back 3 pixels so the 8-tap window covers src[-3] .. src[+4] */
1194cabdff1aSopenharmony_ci    src -= 3;
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci    /* rearranging filter */
1197cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1198cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1199cabdff1aSopenharmony_ci
    /* masks for the following tap pairs: same pattern shifted by 2, 4, 6 */
1200cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1201cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1202cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1203cabdff1aSopenharmony_ci
1204cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
1205cabdff1aSopenharmony_ci        src0 = LD_SB(src);
1206cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
1207cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
        /* presumably the bytes at offset 8, built from src0/src2 - confirm */
1208cabdff1aSopenharmony_ci        src1 = __msa_sldi_b(src2, src0, 8);
1209cabdff1aSopenharmony_ci        src += src_stride;
1210cabdff1aSopenharmony_ci
1211cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
1212cabdff1aSopenharmony_ci        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1213cabdff1aSopenharmony_ci                   vec12);
1214cabdff1aSopenharmony_ci        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1215cabdff1aSopenharmony_ci                   vec13);
1216cabdff1aSopenharmony_ci        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1217cabdff1aSopenharmony_ci                   vec14);
1218cabdff1aSopenharmony_ci        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1219cabdff1aSopenharmony_ci                   vec15);
        /*
         * Two partial sums per vector: filt0 then filt1 accumulate into
         * vec0..3, filt2 then filt3 into vec8..11; ADDS_SH4_SH combines
         * the two halves into out0..3.
         */
1220cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1221cabdff1aSopenharmony_ci                    vec1, vec2, vec3);
1222cabdff1aSopenharmony_ci        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1223cabdff1aSopenharmony_ci                    vec9, vec10, vec11);
1224cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1225cabdff1aSopenharmony_ci                     vec1, vec2, vec3);
1226cabdff1aSopenharmony_ci        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1227cabdff1aSopenharmony_ci                     vec8, vec9, vec10, vec11);
1228cabdff1aSopenharmony_ci        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1229cabdff1aSopenharmony_ci                    out1, out2, out3);
1230cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
1231cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
        /* existing destination row: two vectors 16 bytes apart */
1232cabdff1aSopenharmony_ci        LD_UB2(dst, 16, dst1, dst2);
1233cabdff1aSopenharmony_ci        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
1234cabdff1aSopenharmony_ci        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
1235cabdff1aSopenharmony_ci        dst += dst_stride;
1236cabdff1aSopenharmony_ci    }
1237cabdff1aSopenharmony_ci}
1238cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter of a 64-pixel-wide block, averaged into dst.
 *
 * One row is processed per outer iteration, height iterations in total.
 * Each row is split into two 32-pixel halves (inner loop, byte offset
 * cnt << 5); every half is loaded as four vectors, shuffled into tap-pair
 * order, multiplied with the four filter splats, rounded by a shift of 7,
 * saturated and passed to PCKEV_XORI128_AVG_ST_UB together with the
 * existing destination pixels of that half.
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 * height:          number of rows.
 */
1239cabdff1aSopenharmony_cistatic void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
1240cabdff1aSopenharmony_ci                                              int32_t src_stride,
1241cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
1242cabdff1aSopenharmony_ci                                              const int8_t *filter,
1243cabdff1aSopenharmony_ci                                              int32_t height)
1244cabdff1aSopenharmony_ci{
1245cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1246cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1247cabdff1aSopenharmony_ci    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1248cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
1249cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1250cabdff1aSopenharmony_ci    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1251cabdff1aSopenharmony_ci
    /* first row of mc_filt_mask_arr: the "8 width cases" shuffle pattern */
1252cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back 3 pixels so the 8-tap window covers src[-3] .. src[+4] */
1253cabdff1aSopenharmony_ci    src -= 3;
1254cabdff1aSopenharmony_ci
1255cabdff1aSopenharmony_ci    /* rearranging filter */
1256cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1257cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1258cabdff1aSopenharmony_ci
    /* masks for the following tap pairs: same pattern shifted by 2, 4, 6 */
1259cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1260cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1261cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1262cabdff1aSopenharmony_ci
1263cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
        /* cnt selects the left (0) or right (1) 32-pixel half of the row */
1264cabdff1aSopenharmony_ci        for (cnt = 0; cnt < 2; ++cnt) {
1265cabdff1aSopenharmony_ci            src0 = LD_SB(&src[cnt << 5]);
1266cabdff1aSopenharmony_ci            src2 = LD_SB(&src[16 + (cnt << 5)]);
1267cabdff1aSopenharmony_ci            src3 = LD_SB(&src[24 + (cnt << 5)]);
            /* presumably the bytes at offset 8 of this half - confirm */
1268cabdff1aSopenharmony_ci            src1 = __msa_sldi_b(src2, src0, 8);
1269cabdff1aSopenharmony_ci
1270cabdff1aSopenharmony_ci            XORI_B4_128_SB(src0, src1, src2, src3);
1271cabdff1aSopenharmony_ci            VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1272cabdff1aSopenharmony_ci                       vec12);
1273cabdff1aSopenharmony_ci            VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1274cabdff1aSopenharmony_ci                       vec13);
1275cabdff1aSopenharmony_ci            VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
1276cabdff1aSopenharmony_ci                       vec10, vec14);
1277cabdff1aSopenharmony_ci            VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
1278cabdff1aSopenharmony_ci                       vec11, vec15);
            /*
             * Two partial sums per vector: filt0 then filt1 accumulate into
             * vec0..3, filt2 then filt3 into vec8..11; ADDS_SH4_SH combines
             * the two halves into out0..3.
             */
1279cabdff1aSopenharmony_ci            DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1280cabdff1aSopenharmony_ci                        vec0, vec1, vec2, vec3);
1281cabdff1aSopenharmony_ci            DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
1282cabdff1aSopenharmony_ci                        vec8, vec9, vec10, vec11);
1283cabdff1aSopenharmony_ci            DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
1284cabdff1aSopenharmony_ci                         vec0, vec1, vec2, vec3);
1285cabdff1aSopenharmony_ci            DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1286cabdff1aSopenharmony_ci                         vec8, vec9, vec10, vec11);
1287cabdff1aSopenharmony_ci            ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1288cabdff1aSopenharmony_ci                        out1, out2, out3);
1289cabdff1aSopenharmony_ci            SRARI_H4_SH(out0, out1, out2, out3, 7);
1290cabdff1aSopenharmony_ci            SAT_SH4_SH(out0, out1, out2, out3, 7);
            /* existing destination pixels of this half */
1291cabdff1aSopenharmony_ci            LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
1292cabdff1aSopenharmony_ci            PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
1293cabdff1aSopenharmony_ci            PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
1294cabdff1aSopenharmony_ci        }
1295cabdff1aSopenharmony_ci
1296cabdff1aSopenharmony_ci        src += src_stride;
1297cabdff1aSopenharmony_ci        dst += dst_stride;
1298cabdff1aSopenharmony_ci    }
1299cabdff1aSopenharmony_ci}
1300cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter of a 4-pixel-wide block, averaged into dst.
 *
 * Reading starts 3 rows above src. Seven rows are loaded up front and
 * interleaved pairwise; the loop then runs (height >> 2) times, loading
 * four new rows and producing four output rows per iteration, so any
 * remainder of height modulo 4 is not processed. Results are rounded by a
 * shift of 7, saturated, packed to bytes, averaged (__msa_aver_u_b) with
 * the four 32-bit words already in dst and stored back.
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 * height:          number of rows.
 */
1301cabdff1aSopenharmony_cistatic void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
1302cabdff1aSopenharmony_ci                                             int32_t src_stride,
1303cabdff1aSopenharmony_ci                                             uint8_t *dst, int32_t dst_stride,
1304cabdff1aSopenharmony_ci                                             const int8_t *filter,
1305cabdff1aSopenharmony_ci                                             int32_t height)
1306cabdff1aSopenharmony_ci{
1307cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1308cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1309cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1310cabdff1aSopenharmony_ci    v16u8 dst0, out;
1311cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1312cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
1313cabdff1aSopenharmony_ci    v16i8 src10998, filt0, filt1, filt2, filt3;
1314cabdff1aSopenharmony_ci    v8i16 filt, out10, out32;
1315cabdff1aSopenharmony_ci
    /* step back 3 rows so the 8-tap window covers rows -3 .. +4 */
1316cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1317cabdff1aSopenharmony_ci
1318cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1319cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1320cabdff1aSopenharmony_ci
    /* prime the window with the first 7 rows */
1321cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1322cabdff1aSopenharmony_ci    src += (7 * src_stride);
1323cabdff1aSopenharmony_ci
    /* interleave adjacent rows, then pair two interleaves per vector */
1324cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1325cabdff1aSopenharmony_ci               src54_r, src21_r);
1326cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1327cabdff1aSopenharmony_ci    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1328cabdff1aSopenharmony_ci               src4332, src6554);
1329cabdff1aSopenharmony_ci    XORI_B3_128_SB(src2110, src4332, src6554);
1330cabdff1aSopenharmony_ci
1331cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1332cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1333cabdff1aSopenharmony_ci        src += (4 * src_stride);
1334cabdff1aSopenharmony_ci
        /* gather the current 4x4 destination block into one vector */
1335cabdff1aSopenharmony_ci        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1336cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1337cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1338cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
1339cabdff1aSopenharmony_ci        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1340cabdff1aSopenharmony_ci        XORI_B2_128_SB(src8776, src10998);
1341cabdff1aSopenharmony_ci        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
1342cabdff1aSopenharmony_ci                                    filt1, filt2, filt3);
1343cabdff1aSopenharmony_ci        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
1344cabdff1aSopenharmony_ci                                    filt1, filt2, filt3);
1345cabdff1aSopenharmony_ci        SRARI_H2_SH(out10, out32, 7);
1346cabdff1aSopenharmony_ci        SAT_SH2_SH(out10, out32, 7);
1347cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out10, out32);
        /* average the filtered pixels with the existing destination pixels */
1348cabdff1aSopenharmony_ci        out = __msa_aver_u_b(out, dst0);
1349cabdff1aSopenharmony_ci
1350cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1351cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1352cabdff1aSopenharmony_ci
        /* slide the window down by four rows for the next iteration */
1353cabdff1aSopenharmony_ci        src2110 = src6554;
1354cabdff1aSopenharmony_ci        src4332 = src8776;
1355cabdff1aSopenharmony_ci        src6554 = src10998;
1356cabdff1aSopenharmony_ci        src6 = src10;
1357cabdff1aSopenharmony_ci    }
1358cabdff1aSopenharmony_ci}
1359cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter of an 8-pixel-wide block, averaged into dst.
 *
 * Reading starts 3 rows above src. Seven rows are loaded up front and
 * interleaved pairwise; the loop then runs (height >> 2) times, loading
 * four new rows and producing four output rows per iteration, so any
 * remainder of height modulo 4 is not processed. Results are rounded by a
 * shift of 7, saturated and handed to CONVERT_UB_AVG_ST8x4_UB together with
 * the existing destination rows (per the macro name: convert to bytes,
 * average, store 8x4).
 *
 * src, src_stride: source pixels and distance between source rows.
 * dst, dst_stride: destination block (read and written) and its row distance.
 * filter:          filter taps; only halfword lanes 0..3 of the loaded
 *                  vector (the first 8 int8 values) are used.
 * height:          number of rows.
 */
1360cabdff1aSopenharmony_cistatic void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
1361cabdff1aSopenharmony_ci                                             int32_t src_stride,
1362cabdff1aSopenharmony_ci                                             uint8_t *dst, int32_t dst_stride,
1363cabdff1aSopenharmony_ci                                             const int8_t *filter,
1364cabdff1aSopenharmony_ci                                             int32_t height)
1365cabdff1aSopenharmony_ci{
1366cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1367cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1368cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1369cabdff1aSopenharmony_ci    v16u8 dst0, dst1;
1370cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1371cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1372cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
1373cabdff1aSopenharmony_ci
    /* step back 3 rows so the 8-tap window covers rows -3 .. +4 */
1374cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1375cabdff1aSopenharmony_ci
1376cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1377cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1378cabdff1aSopenharmony_ci
    /* prime the window with the first 7 rows */
1379cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1380cabdff1aSopenharmony_ci    src += (7 * src_stride);
1381cabdff1aSopenharmony_ci
1382cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    /* interleave adjacent rows so each vector holds one tap pair per pixel */
1383cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1384cabdff1aSopenharmony_ci               src54_r, src21_r);
1385cabdff1aSopenharmony_ci    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1386cabdff1aSopenharmony_ci
1387cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1388cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1389cabdff1aSopenharmony_ci        src += (4 * src_stride);
1390cabdff1aSopenharmony_ci
        /* current destination: two 8-byte rows per vector */
1391cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1392cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
1393cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
1394cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1395cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1396cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
        /* one FILT_8TAP_DPADD_S_H per output row, window shifted by one row */
1397cabdff1aSopenharmony_ci        out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1398cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1399cabdff1aSopenharmony_ci        out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1400cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1401cabdff1aSopenharmony_ci        out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1402cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1403cabdff1aSopenharmony_ci        out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1404cabdff1aSopenharmony_ci                                   filt1, filt2, filt3);
1405cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
1406cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
1407cabdff1aSopenharmony_ci        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1408cabdff1aSopenharmony_ci                                dst, dst_stride);
1409cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1410cabdff1aSopenharmony_ci
        /* slide the window down by four rows for the next iteration */
1411cabdff1aSopenharmony_ci        src10_r = src54_r;
1412cabdff1aSopenharmony_ci        src32_r = src76_r;
1413cabdff1aSopenharmony_ci        src54_r = src98_r;
1414cabdff1aSopenharmony_ci        src21_r = src65_r;
1415cabdff1aSopenharmony_ci        src43_r = src87_r;
1416cabdff1aSopenharmony_ci        src65_r = src109_r;
1417cabdff1aSopenharmony_ci        src6 = src10;
1418cabdff1aSopenharmony_ci    }
1419cabdff1aSopenharmony_ci}
1420cabdff1aSopenharmony_ci
1421cabdff1aSopenharmony_cistatic void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
1422cabdff1aSopenharmony_ci                                                   int32_t src_stride,
1423cabdff1aSopenharmony_ci                                                   uint8_t *dst,
1424cabdff1aSopenharmony_ci                                                   int32_t dst_stride,
1425cabdff1aSopenharmony_ci                                                   const int8_t *filter,
1426cabdff1aSopenharmony_ci                                                   int32_t height,
1427cabdff1aSopenharmony_ci                                                   int32_t width)
1428cabdff1aSopenharmony_ci{
1429cabdff1aSopenharmony_ci    const uint8_t *src_tmp;
1430cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1431cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1432cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1433cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1434cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1435cabdff1aSopenharmony_ci    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1436cabdff1aSopenharmony_ci    v16i8 filt0, filt1, filt2, filt3;
1437cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
1438cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
1439cabdff1aSopenharmony_ci
1440cabdff1aSopenharmony_ci    src -= (3 * src_stride);
1441cabdff1aSopenharmony_ci
1442cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1443cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1444cabdff1aSopenharmony_ci
1445cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
1446cabdff1aSopenharmony_ci        src_tmp = src;
1447cabdff1aSopenharmony_ci        dst_tmp = dst;
1448cabdff1aSopenharmony_ci
1449cabdff1aSopenharmony_ci        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1450cabdff1aSopenharmony_ci        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1451cabdff1aSopenharmony_ci        src_tmp += (7 * src_stride);
1452cabdff1aSopenharmony_ci
1453cabdff1aSopenharmony_ci        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1454cabdff1aSopenharmony_ci                   src32_r, src54_r, src21_r);
1455cabdff1aSopenharmony_ci        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1456cabdff1aSopenharmony_ci        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1457cabdff1aSopenharmony_ci                   src32_l, src54_l, src21_l);
1458cabdff1aSopenharmony_ci        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1459cabdff1aSopenharmony_ci
1460cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
1461cabdff1aSopenharmony_ci            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1462cabdff1aSopenharmony_ci            src_tmp += (4 * src_stride);
1463cabdff1aSopenharmony_ci
1464cabdff1aSopenharmony_ci            LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
1465cabdff1aSopenharmony_ci            XORI_B4_128_SB(src7, src8, src9, src10);
1466cabdff1aSopenharmony_ci            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1467cabdff1aSopenharmony_ci                       src87_r, src98_r, src109_r);
1468cabdff1aSopenharmony_ci            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1469cabdff1aSopenharmony_ci                       src87_l, src98_l, src109_l);
1470cabdff1aSopenharmony_ci            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
1471cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1472cabdff1aSopenharmony_ci            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
1473cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1474cabdff1aSopenharmony_ci            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
1475cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1476cabdff1aSopenharmony_ci            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
1477cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1478cabdff1aSopenharmony_ci            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
1479cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1480cabdff1aSopenharmony_ci            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
1481cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1482cabdff1aSopenharmony_ci            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
1483cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1484cabdff1aSopenharmony_ci            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
1485cabdff1aSopenharmony_ci                                         filt0, filt1, filt2, filt3);
1486cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1487cabdff1aSopenharmony_ci            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1488cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1489cabdff1aSopenharmony_ci            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1490cabdff1aSopenharmony_ci            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1491cabdff1aSopenharmony_ci                        out3_r, tmp0, tmp1, tmp2, tmp3);
1492cabdff1aSopenharmony_ci            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1493cabdff1aSopenharmony_ci            AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
1494cabdff1aSopenharmony_ci                        dst0, dst1, dst2, dst3);
1495cabdff1aSopenharmony_ci            ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
1496cabdff1aSopenharmony_ci            dst_tmp += (4 * dst_stride);
1497cabdff1aSopenharmony_ci
1498cabdff1aSopenharmony_ci            src10_r = src54_r;
1499cabdff1aSopenharmony_ci            src32_r = src76_r;
1500cabdff1aSopenharmony_ci            src54_r = src98_r;
1501cabdff1aSopenharmony_ci            src21_r = src65_r;
1502cabdff1aSopenharmony_ci            src43_r = src87_r;
1503cabdff1aSopenharmony_ci            src65_r = src109_r;
1504cabdff1aSopenharmony_ci            src10_l = src54_l;
1505cabdff1aSopenharmony_ci            src32_l = src76_l;
1506cabdff1aSopenharmony_ci            src54_l = src98_l;
1507cabdff1aSopenharmony_ci            src21_l = src65_l;
1508cabdff1aSopenharmony_ci            src43_l = src87_l;
1509cabdff1aSopenharmony_ci            src65_l = src109_l;
1510cabdff1aSopenharmony_ci            src6 = src10;
1511cabdff1aSopenharmony_ci        }
1512cabdff1aSopenharmony_ci
1513cabdff1aSopenharmony_ci        src += 16;
1514cabdff1aSopenharmony_ci        dst += 16;
1515cabdff1aSopenharmony_ci    }
1516cabdff1aSopenharmony_ci}
1517cabdff1aSopenharmony_ci
/* 16-wide vertical 8-tap + average: a single 16-column strip. */
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    const int32_t block_width = 16;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride,
                                           dst, dst_stride,
                                           filter, height, block_width);
}
1527cabdff1aSopenharmony_ci
/* 32-wide vertical 8-tap + average: two 16-column strips. */
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    const int32_t block_width = 32;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride,
                                           dst, dst_stride,
                                           filter, height, block_width);
}
1537cabdff1aSopenharmony_ci
/* 64-wide vertical 8-tap + average: four 16-column strips. */
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    const int32_t block_width = 64;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride,
                                           dst, dst_stride,
                                           filter, height, block_width);
}
1547cabdff1aSopenharmony_ci
1548cabdff1aSopenharmony_cistatic void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
1549cabdff1aSopenharmony_ci                                                  int32_t src_stride,
1550cabdff1aSopenharmony_ci                                                  uint8_t *dst,
1551cabdff1aSopenharmony_ci                                                  int32_t dst_stride,
1552cabdff1aSopenharmony_ci                                                  const int8_t *filter_horiz,
1553cabdff1aSopenharmony_ci                                                  const int8_t *filter_vert,
1554cabdff1aSopenharmony_ci                                                  int32_t height)
1555cabdff1aSopenharmony_ci{
1556cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1557cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
1558cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1559cabdff1aSopenharmony_ci    v16u8 dst0, res, mask0, mask1, mask2, mask3;
1560cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1561cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1562cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
1563cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1564cabdff1aSopenharmony_ci
1565cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
1566cabdff1aSopenharmony_ci    src -= (3 + 3 * src_stride);
1567cabdff1aSopenharmony_ci
1568cabdff1aSopenharmony_ci    /* rearranging filter */
1569cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1570cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1571cabdff1aSopenharmony_ci
1572cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1573cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1574cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1575cabdff1aSopenharmony_ci
1576cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1577cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1578cabdff1aSopenharmony_ci    src += (7 * src_stride);
1579cabdff1aSopenharmony_ci
1580cabdff1aSopenharmony_ci    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
1581cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1582cabdff1aSopenharmony_ci    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
1583cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1584cabdff1aSopenharmony_ci    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
1585cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1586cabdff1aSopenharmony_ci    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
1587cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1588cabdff1aSopenharmony_ci    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
1589cabdff1aSopenharmony_ci
1590cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1591cabdff1aSopenharmony_ci    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1592cabdff1aSopenharmony_ci
1593cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1594cabdff1aSopenharmony_ci    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1595cabdff1aSopenharmony_ci
1596cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1597cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1598cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1599cabdff1aSopenharmony_ci        src += (4 * src_stride);
1600cabdff1aSopenharmony_ci
1601cabdff1aSopenharmony_ci        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1602cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1603cabdff1aSopenharmony_ci        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
1604cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1605cabdff1aSopenharmony_ci        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1606cabdff1aSopenharmony_ci        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1607cabdff1aSopenharmony_ci        res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
1608cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
1609cabdff1aSopenharmony_ci
1610cabdff1aSopenharmony_ci        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
1611cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1612cabdff1aSopenharmony_ci        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
1613cabdff1aSopenharmony_ci        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1614cabdff1aSopenharmony_ci        res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
1615cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
1616cabdff1aSopenharmony_ci
1617cabdff1aSopenharmony_ci        SRARI_H2_SH(res0, res1, 7);
1618cabdff1aSopenharmony_ci        SAT_SH2_SH(res0, res1, 7);
1619cabdff1aSopenharmony_ci        res = PCKEV_XORI128_UB(res0, res1);
1620cabdff1aSopenharmony_ci        res = (v16u8) __msa_aver_u_b(res, dst0);
1621cabdff1aSopenharmony_ci        ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
1622cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1623cabdff1aSopenharmony_ci
1624cabdff1aSopenharmony_ci        hz_out5 = hz_out9;
1625cabdff1aSopenharmony_ci        vec0 = vec2;
1626cabdff1aSopenharmony_ci        vec1 = vec3;
1627cabdff1aSopenharmony_ci        vec2 = vec4;
1628cabdff1aSopenharmony_ci    }
1629cabdff1aSopenharmony_ci}
1630cabdff1aSopenharmony_ci
1631cabdff1aSopenharmony_cistatic void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
1632cabdff1aSopenharmony_ci                                                  int32_t src_stride,
1633cabdff1aSopenharmony_ci                                                  uint8_t *dst,
1634cabdff1aSopenharmony_ci                                                  int32_t dst_stride,
1635cabdff1aSopenharmony_ci                                                  const int8_t *filter_horiz,
1636cabdff1aSopenharmony_ci                                                  const int8_t *filter_vert,
1637cabdff1aSopenharmony_ci                                                  int32_t height)
1638cabdff1aSopenharmony_ci{
1639cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1640cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
1641cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1642cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1643cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1644cabdff1aSopenharmony_ci    v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
1645cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1646cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
1647cabdff1aSopenharmony_ci    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
1648cabdff1aSopenharmony_ci
1649cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
1650cabdff1aSopenharmony_ci    src -= (3 + 3 * src_stride);
1651cabdff1aSopenharmony_ci
1652cabdff1aSopenharmony_ci    /* rearranging filter */
1653cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1654cabdff1aSopenharmony_ci    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1655cabdff1aSopenharmony_ci
1656cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1657cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1658cabdff1aSopenharmony_ci    mask3 = mask0 + 6;
1659cabdff1aSopenharmony_ci
1660cabdff1aSopenharmony_ci    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1661cabdff1aSopenharmony_ci    src += (7 * src_stride);
1662cabdff1aSopenharmony_ci
1663cabdff1aSopenharmony_ci    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1664cabdff1aSopenharmony_ci    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
1665cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1666cabdff1aSopenharmony_ci    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
1667cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1668cabdff1aSopenharmony_ci    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
1669cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1670cabdff1aSopenharmony_ci    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
1671cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1672cabdff1aSopenharmony_ci    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
1673cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1674cabdff1aSopenharmony_ci    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
1675cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1676cabdff1aSopenharmony_ci    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
1677cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2, filt_hz3);
1678cabdff1aSopenharmony_ci
1679cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1680cabdff1aSopenharmony_ci    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1681cabdff1aSopenharmony_ci
1682cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1683cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
1684cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
1685cabdff1aSopenharmony_ci
1686cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1687cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
1688cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
1689cabdff1aSopenharmony_ci        src += (4 * src_stride);
1690cabdff1aSopenharmony_ci
1691cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1692cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
1693cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
1694cabdff1aSopenharmony_ci
1695cabdff1aSopenharmony_ci        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
1696cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1697cabdff1aSopenharmony_ci        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1698cabdff1aSopenharmony_ci        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
1699cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
1700cabdff1aSopenharmony_ci
1701cabdff1aSopenharmony_ci        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
1702cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1703cabdff1aSopenharmony_ci        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1704cabdff1aSopenharmony_ci        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
1705cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
1706cabdff1aSopenharmony_ci
1707cabdff1aSopenharmony_ci        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
1708cabdff1aSopenharmony_ci                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1709cabdff1aSopenharmony_ci        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1710cabdff1aSopenharmony_ci        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
1711cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
1712cabdff1aSopenharmony_ci
1713cabdff1aSopenharmony_ci        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
1714cabdff1aSopenharmony_ci                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1715cabdff1aSopenharmony_ci        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
1716cabdff1aSopenharmony_ci        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
1717cabdff1aSopenharmony_ci                                   filt_vt2, filt_vt3);
1718cabdff1aSopenharmony_ci
1719cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1720cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1721cabdff1aSopenharmony_ci        CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1,
1722cabdff1aSopenharmony_ci                                dst, dst_stride);
1723cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1724cabdff1aSopenharmony_ci
1725cabdff1aSopenharmony_ci        hz_out6 = hz_out10;
1726cabdff1aSopenharmony_ci        out0 = out2;
1727cabdff1aSopenharmony_ci        out1 = out3;
1728cabdff1aSopenharmony_ci        out2 = out8;
1729cabdff1aSopenharmony_ci        out4 = out6;
1730cabdff1aSopenharmony_ci        out5 = out7;
1731cabdff1aSopenharmony_ci        out6 = out9;
1732cabdff1aSopenharmony_ci    }
1733cabdff1aSopenharmony_ci}
1734cabdff1aSopenharmony_ci
/*
 * 16-wide horizontal+vertical 8-tap filter with destination averaging,
 * built from two side-by-side runs of the 8-wide kernel.
 */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    /* left to right, one 8-column tile at a time */
    for (col = 0; col < 16; col += 8) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                              dst + col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1754cabdff1aSopenharmony_ci
/*
 * 32-wide horizontal+vertical 8-tap filter with destination averaging,
 * built from four side-by-side runs of the 8-wide kernel.
 */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    /* left to right, one 8-column tile at a time */
    for (col = 0; col < 32; col += 8) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                              dst + col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1774cabdff1aSopenharmony_ci
/*
 * 64-wide horizontal+vertical 8-tap filter with destination averaging,
 * built from eight side-by-side runs of the 8-wide kernel.
 */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    /* left to right, one 8-column tile at a time */
    for (col = 0; col < 64; col += 8) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                              dst + col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1794cabdff1aSopenharmony_ci
1795cabdff1aSopenharmony_cistatic void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
1796cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1797cabdff1aSopenharmony_ci                                 const int8_t *filter)
1798cabdff1aSopenharmony_ci{
1799cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask;
1800cabdff1aSopenharmony_ci    v16u8 filt0, vec0, vec1, res0, res1;
1801cabdff1aSopenharmony_ci    v8u16 vec2, vec3, filt;
1802cabdff1aSopenharmony_ci
1803cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
1804cabdff1aSopenharmony_ci
1805cabdff1aSopenharmony_ci    /* rearranging filter */
1806cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1807cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1808cabdff1aSopenharmony_ci
1809cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1810cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1811cabdff1aSopenharmony_ci    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1812cabdff1aSopenharmony_ci    SRARI_H2_UH(vec2, vec3, 7);
1813cabdff1aSopenharmony_ci    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1814cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
1815cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1816cabdff1aSopenharmony_ci}
1817cabdff1aSopenharmony_ci
1818cabdff1aSopenharmony_cistatic void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
1819cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1820cabdff1aSopenharmony_ci                                 const int8_t *filter)
1821cabdff1aSopenharmony_ci{
1822cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, filt0;
1823cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1824cabdff1aSopenharmony_ci    v16i8 res0, res1, res2, res3;
1825cabdff1aSopenharmony_ci    v8u16 vec4, vec5, vec6, vec7, filt;
1826cabdff1aSopenharmony_ci
1827cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
1828cabdff1aSopenharmony_ci
1829cabdff1aSopenharmony_ci    /* rearranging filter */
1830cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1831cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1832cabdff1aSopenharmony_ci
1833cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1834cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1835cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1836cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1837cabdff1aSopenharmony_ci                vec4, vec5, vec6, vec7);
1838cabdff1aSopenharmony_ci    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1839cabdff1aSopenharmony_ci    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1840cabdff1aSopenharmony_ci                res0, res1, res2, res3);
1841cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
1842cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1843cabdff1aSopenharmony_ci    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1844cabdff1aSopenharmony_ci    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1845cabdff1aSopenharmony_ci}
1846cabdff1aSopenharmony_ci
1847cabdff1aSopenharmony_civoid ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1848cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
1849cabdff1aSopenharmony_ci                         int height, int mx, int my)
1850cabdff1aSopenharmony_ci{
1851cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1852cabdff1aSopenharmony_ci
1853cabdff1aSopenharmony_ci    if (4 == height) {
1854cabdff1aSopenharmony_ci        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1855cabdff1aSopenharmony_ci    } else if (8 == height) {
1856cabdff1aSopenharmony_ci        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1857cabdff1aSopenharmony_ci    }
1858cabdff1aSopenharmony_ci}
1859cabdff1aSopenharmony_ci
1860cabdff1aSopenharmony_cistatic void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
1861cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1862cabdff1aSopenharmony_ci                                 const int8_t *filter)
1863cabdff1aSopenharmony_ci{
1864cabdff1aSopenharmony_ci    v16u8 filt0;
1865cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask;
1866cabdff1aSopenharmony_ci    v8u16 vec0, vec1, vec2, vec3, filt;
1867cabdff1aSopenharmony_ci
1868cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
1869cabdff1aSopenharmony_ci
1870cabdff1aSopenharmony_ci    /* rearranging filter */
1871cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1872cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1873cabdff1aSopenharmony_ci
1874cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1875cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1876cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1877cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1878cabdff1aSopenharmony_ci                vec0, vec1, vec2, vec3);
1879cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1880cabdff1aSopenharmony_ci    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1881cabdff1aSopenharmony_ci    ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1882cabdff1aSopenharmony_ci}
1883cabdff1aSopenharmony_ci
1884cabdff1aSopenharmony_cistatic void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
1885cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
1886cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
1887cabdff1aSopenharmony_ci{
1888cabdff1aSopenharmony_ci    v16u8 filt0;
1889cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask, out0, out1;
1890cabdff1aSopenharmony_ci    v8u16 vec0, vec1, vec2, vec3, filt;
1891cabdff1aSopenharmony_ci
1892cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
1893cabdff1aSopenharmony_ci
1894cabdff1aSopenharmony_ci    /* rearranging filter */
1895cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1896cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1897cabdff1aSopenharmony_ci
1898cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1899cabdff1aSopenharmony_ci    src += (4 * src_stride);
1900cabdff1aSopenharmony_ci
1901cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1902cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1903cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1904cabdff1aSopenharmony_ci                vec0, vec1, vec2, vec3);
1905cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1906cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1907cabdff1aSopenharmony_ci    src += (4 * src_stride);
1908cabdff1aSopenharmony_ci
1909cabdff1aSopenharmony_ci    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1910cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1911cabdff1aSopenharmony_ci
1912cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1913cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1914cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1915cabdff1aSopenharmony_ci                vec0, vec1, vec2, vec3);
1916cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1917cabdff1aSopenharmony_ci    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1918cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1919cabdff1aSopenharmony_ci    dst += (8 * dst_stride);
1920cabdff1aSopenharmony_ci
1921cabdff1aSopenharmony_ci    if (16 == height) {
1922cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
1923cabdff1aSopenharmony_ci        src += (4 * src_stride);
1924cabdff1aSopenharmony_ci
1925cabdff1aSopenharmony_ci        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1926cabdff1aSopenharmony_ci        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1927cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1928cabdff1aSopenharmony_ci                    vec0, vec1, vec2, vec3);
1929cabdff1aSopenharmony_ci        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1930cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
1931cabdff1aSopenharmony_ci        src += (4 * src_stride);
1932cabdff1aSopenharmony_ci
1933cabdff1aSopenharmony_ci        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1934cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1935cabdff1aSopenharmony_ci
1936cabdff1aSopenharmony_ci        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1937cabdff1aSopenharmony_ci        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1938cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1939cabdff1aSopenharmony_ci                    vec0, vec1, vec2, vec3);
1940cabdff1aSopenharmony_ci        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1941cabdff1aSopenharmony_ci        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1942cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1943cabdff1aSopenharmony_ci    }
1944cabdff1aSopenharmony_ci}
1945cabdff1aSopenharmony_ci
1946cabdff1aSopenharmony_civoid ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1947cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
1948cabdff1aSopenharmony_ci                         int height, int mx, int my)
1949cabdff1aSopenharmony_ci{
1950cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1951cabdff1aSopenharmony_ci
1952cabdff1aSopenharmony_ci    if (4 == height) {
1953cabdff1aSopenharmony_ci        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1954cabdff1aSopenharmony_ci    } else {
1955cabdff1aSopenharmony_ci        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1956cabdff1aSopenharmony_ci                                 height);
1957cabdff1aSopenharmony_ci    }
1958cabdff1aSopenharmony_ci}
1959cabdff1aSopenharmony_ci
1960cabdff1aSopenharmony_civoid ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1961cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
1962cabdff1aSopenharmony_ci                          int height, int mx, int my)
1963cabdff1aSopenharmony_ci{
1964cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1965cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1966cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1967cabdff1aSopenharmony_ci    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1968cabdff1aSopenharmony_ci    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1969cabdff1aSopenharmony_ci
1970cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
1971cabdff1aSopenharmony_ci
1972cabdff1aSopenharmony_ci    loop_cnt = (height >> 2) - 1;
1973cabdff1aSopenharmony_ci
1974cabdff1aSopenharmony_ci    /* rearranging filter */
1975cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1976cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1977cabdff1aSopenharmony_ci
1978cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src2, src4, src6);
1979cabdff1aSopenharmony_ci    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1980cabdff1aSopenharmony_ci    src += (4 * src_stride);
1981cabdff1aSopenharmony_ci
1982cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1983cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1984cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1985cabdff1aSopenharmony_ci    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1986cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1987cabdff1aSopenharmony_ci                out0, out1, out2, out3);
1988cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1989cabdff1aSopenharmony_ci                out4, out5, out6, out7);
1990cabdff1aSopenharmony_ci    SRARI_H4_UH(out0, out1, out2, out3, 7);
1991cabdff1aSopenharmony_ci    SRARI_H4_UH(out4, out5, out6, out7, 7);
1992cabdff1aSopenharmony_ci    PCKEV_ST_SB(out0, out1, dst);
1993cabdff1aSopenharmony_ci    dst += dst_stride;
1994cabdff1aSopenharmony_ci    PCKEV_ST_SB(out2, out3, dst);
1995cabdff1aSopenharmony_ci    dst += dst_stride;
1996cabdff1aSopenharmony_ci    PCKEV_ST_SB(out4, out5, dst);
1997cabdff1aSopenharmony_ci    dst += dst_stride;
1998cabdff1aSopenharmony_ci    PCKEV_ST_SB(out6, out7, dst);
1999cabdff1aSopenharmony_ci    dst += dst_stride;
2000cabdff1aSopenharmony_ci
2001cabdff1aSopenharmony_ci    for (; loop_cnt--;) {
2002cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
2003cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2004cabdff1aSopenharmony_ci        src += (4 * src_stride);
2005cabdff1aSopenharmony_ci
2006cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2007cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2008cabdff1aSopenharmony_ci        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2009cabdff1aSopenharmony_ci        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2010cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2011cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2012cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2013cabdff1aSopenharmony_ci                    out4, out5, out6, out7);
2014cabdff1aSopenharmony_ci        SRARI_H4_UH(out0, out1, out2, out3, 7);
2015cabdff1aSopenharmony_ci        SRARI_H4_UH(out4, out5, out6, out7, 7);
2016cabdff1aSopenharmony_ci        PCKEV_ST_SB(out0, out1, dst);
2017cabdff1aSopenharmony_ci        dst += dst_stride;
2018cabdff1aSopenharmony_ci        PCKEV_ST_SB(out2, out3, dst);
2019cabdff1aSopenharmony_ci        dst += dst_stride;
2020cabdff1aSopenharmony_ci        PCKEV_ST_SB(out4, out5, dst);
2021cabdff1aSopenharmony_ci        dst += dst_stride;
2022cabdff1aSopenharmony_ci        PCKEV_ST_SB(out6, out7, dst);
2023cabdff1aSopenharmony_ci        dst += dst_stride;
2024cabdff1aSopenharmony_ci    }
2025cabdff1aSopenharmony_ci}
2026cabdff1aSopenharmony_ci
2027cabdff1aSopenharmony_civoid ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2028cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
2029cabdff1aSopenharmony_ci                          int height, int mx, int my)
2030cabdff1aSopenharmony_ci{
2031cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2032cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2033cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2034cabdff1aSopenharmony_ci    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2035cabdff1aSopenharmony_ci    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2036cabdff1aSopenharmony_ci
2037cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2038cabdff1aSopenharmony_ci
2039cabdff1aSopenharmony_ci    /* rearranging filter */
2040cabdff1aSopenharmony_ci    filt = LD_UH(filter);
2041cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2042cabdff1aSopenharmony_ci
2043cabdff1aSopenharmony_ci    for (loop_cnt = height >> 1; loop_cnt--;) {
2044cabdff1aSopenharmony_ci        src0 = LD_SB(src);
2045cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
2046cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
2047cabdff1aSopenharmony_ci        src1 = __msa_sldi_b(src2, src0, 8);
2048cabdff1aSopenharmony_ci        src += src_stride;
2049cabdff1aSopenharmony_ci        src4 = LD_SB(src);
2050cabdff1aSopenharmony_ci        src6 = LD_SB(src + 16);
2051cabdff1aSopenharmony_ci        src7 = LD_SB(src + 24);
2052cabdff1aSopenharmony_ci        src5 = __msa_sldi_b(src6, src4, 8);
2053cabdff1aSopenharmony_ci        src += src_stride;
2054cabdff1aSopenharmony_ci
2055cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2056cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2057cabdff1aSopenharmony_ci        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2058cabdff1aSopenharmony_ci        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2059cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2060cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2061cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2062cabdff1aSopenharmony_ci                    out4, out5, out6, out7);
2063cabdff1aSopenharmony_ci        SRARI_H4_UH(out0, out1, out2, out3, 7);
2064cabdff1aSopenharmony_ci        SRARI_H4_UH(out4, out5, out6, out7, 7);
2065cabdff1aSopenharmony_ci        PCKEV_ST_SB(out0, out1, dst);
2066cabdff1aSopenharmony_ci        PCKEV_ST_SB(out2, out3, dst + 16);
2067cabdff1aSopenharmony_ci        dst += dst_stride;
2068cabdff1aSopenharmony_ci        PCKEV_ST_SB(out4, out5, dst);
2069cabdff1aSopenharmony_ci        PCKEV_ST_SB(out6, out7, dst + 16);
2070cabdff1aSopenharmony_ci        dst += dst_stride;
2071cabdff1aSopenharmony_ci    }
2072cabdff1aSopenharmony_ci}
2073cabdff1aSopenharmony_ci
2074cabdff1aSopenharmony_civoid ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2075cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
2076cabdff1aSopenharmony_ci                          int height, int mx, int my)
2077cabdff1aSopenharmony_ci{
2078cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2079cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2080cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2081cabdff1aSopenharmony_ci    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2082cabdff1aSopenharmony_ci    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2083cabdff1aSopenharmony_ci
2084cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2085cabdff1aSopenharmony_ci
2086cabdff1aSopenharmony_ci    /* rearranging filter */
2087cabdff1aSopenharmony_ci    filt = LD_UH(filter);
2088cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2089cabdff1aSopenharmony_ci
2090cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
2091cabdff1aSopenharmony_ci        src0 = LD_SB(src);
2092cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
2093cabdff1aSopenharmony_ci        src4 = LD_SB(src + 32);
2094cabdff1aSopenharmony_ci        src6 = LD_SB(src + 48);
2095cabdff1aSopenharmony_ci        src7 = LD_SB(src + 56);
2096cabdff1aSopenharmony_ci        SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
2097cabdff1aSopenharmony_ci        src += src_stride;
2098cabdff1aSopenharmony_ci
2099cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2100cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2101cabdff1aSopenharmony_ci        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2102cabdff1aSopenharmony_ci        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2103cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2104cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
2105cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2106cabdff1aSopenharmony_ci                    out4, out5, out6, out7);
2107cabdff1aSopenharmony_ci        SRARI_H4_UH(out0, out1, out2, out3, 7);
2108cabdff1aSopenharmony_ci        SRARI_H4_UH(out4, out5, out6, out7, 7);
2109cabdff1aSopenharmony_ci        PCKEV_ST_SB(out0, out1, dst);
2110cabdff1aSopenharmony_ci        PCKEV_ST_SB(out2, out3, dst + 16);
2111cabdff1aSopenharmony_ci        PCKEV_ST_SB(out4, out5, dst + 32);
2112cabdff1aSopenharmony_ci        PCKEV_ST_SB(out6, out7, dst + 48);
2113cabdff1aSopenharmony_ci        dst += dst_stride;
2114cabdff1aSopenharmony_ci    }
2115cabdff1aSopenharmony_ci}
2116cabdff1aSopenharmony_ci
2117cabdff1aSopenharmony_cistatic void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
2118cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2119cabdff1aSopenharmony_ci                                 const int8_t *filter)
2120cabdff1aSopenharmony_ci{
2121cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
2122cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2123cabdff1aSopenharmony_ci    v16u8 filt0;
2124cabdff1aSopenharmony_ci    v8i16 filt;
2125cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1;
2126cabdff1aSopenharmony_ci
2127cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2128cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
2129cabdff1aSopenharmony_ci
2130cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2131cabdff1aSopenharmony_ci    src += (5 * src_stride);
2132cabdff1aSopenharmony_ci
2133cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2134cabdff1aSopenharmony_ci               src10_r, src21_r, src32_r, src43_r);
2135cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2136cabdff1aSopenharmony_ci    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
2137cabdff1aSopenharmony_ci    SRARI_H2_UH(tmp0, tmp1, 7);
2138cabdff1aSopenharmony_ci    SAT_UH2_UH(tmp0, tmp1, 7);
2139cabdff1aSopenharmony_ci    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2140cabdff1aSopenharmony_ci    ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
2141cabdff1aSopenharmony_ci}
2142cabdff1aSopenharmony_ci
2143cabdff1aSopenharmony_cistatic void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
2144cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2145cabdff1aSopenharmony_ci                                 const int8_t *filter)
2146cabdff1aSopenharmony_ci{
2147cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2148cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
2149cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
2150cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
2151cabdff1aSopenharmony_ci    v16u8 filt0;
2152cabdff1aSopenharmony_ci    v8i16 filt;
2153cabdff1aSopenharmony_ci
2154cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2155cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
2156cabdff1aSopenharmony_ci
2157cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2158cabdff1aSopenharmony_ci    src += (8 * src_stride);
2159cabdff1aSopenharmony_ci
2160cabdff1aSopenharmony_ci    src8 = LD_SB(src);
2161cabdff1aSopenharmony_ci    src += src_stride;
2162cabdff1aSopenharmony_ci
2163cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2164cabdff1aSopenharmony_ci               src32_r, src43_r);
2165cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2166cabdff1aSopenharmony_ci               src76_r, src87_r);
2167cabdff1aSopenharmony_ci    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2168cabdff1aSopenharmony_ci               src87_r, src76_r, src2110, src4332, src6554, src8776);
2169cabdff1aSopenharmony_ci    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
2170cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
2171cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2172cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2173cabdff1aSopenharmony_ci    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
2174cabdff1aSopenharmony_ci    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2175cabdff1aSopenharmony_ci}
2176cabdff1aSopenharmony_ci
2177cabdff1aSopenharmony_civoid ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2178cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
2179cabdff1aSopenharmony_ci                         int height, int mx, int my)
2180cabdff1aSopenharmony_ci{
2181cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2182cabdff1aSopenharmony_ci
2183cabdff1aSopenharmony_ci    if (4 == height) {
2184cabdff1aSopenharmony_ci        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2185cabdff1aSopenharmony_ci    } else if (8 == height) {
2186cabdff1aSopenharmony_ci        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2187cabdff1aSopenharmony_ci    }
2188cabdff1aSopenharmony_ci}
2189cabdff1aSopenharmony_ci
2190cabdff1aSopenharmony_cistatic void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
2191cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
2192cabdff1aSopenharmony_ci                                 const int8_t *filter)
2193cabdff1aSopenharmony_ci{
2194cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
2195cabdff1aSopenharmony_ci    v16i8 out0, out1;
2196cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
2197cabdff1aSopenharmony_ci    v8i16 filt;
2198cabdff1aSopenharmony_ci
2199cabdff1aSopenharmony_ci    /* rearranging filter_y */
2200cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2201cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
2202cabdff1aSopenharmony_ci
2203cabdff1aSopenharmony_ci    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
2204cabdff1aSopenharmony_ci    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
2205cabdff1aSopenharmony_ci    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
2206cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2207cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
2208cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2209cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2210cabdff1aSopenharmony_ci    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2211cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2212cabdff1aSopenharmony_ci}
2213cabdff1aSopenharmony_ci
2214cabdff1aSopenharmony_cistatic void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2215cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
2216cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
2217cabdff1aSopenharmony_ci{
2218cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2219cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2220cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2221cabdff1aSopenharmony_ci    v16i8 out0, out1;
2222cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
2223cabdff1aSopenharmony_ci    v8i16 filt;
2224cabdff1aSopenharmony_ci
2225cabdff1aSopenharmony_ci    /* rearranging filter_y */
2226cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2227cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
2228cabdff1aSopenharmony_ci
2229cabdff1aSopenharmony_ci    src0 = LD_UB(src);
2230cabdff1aSopenharmony_ci    src += src_stride;
2231cabdff1aSopenharmony_ci
2232cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
2233cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
2234cabdff1aSopenharmony_ci        src += (8 * src_stride);
2235cabdff1aSopenharmony_ci
2236cabdff1aSopenharmony_ci        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
2237cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
2238cabdff1aSopenharmony_ci        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
2239cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
2240cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2241cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
2242cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2243cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2244cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2245cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2246cabdff1aSopenharmony_ci
2247cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2248cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
2249cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2250cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2251cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2252cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2253cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
2254cabdff1aSopenharmony_ci
2255cabdff1aSopenharmony_ci        src0 = src8;
2256cabdff1aSopenharmony_ci    }
2257cabdff1aSopenharmony_ci}
2258cabdff1aSopenharmony_ci
2259cabdff1aSopenharmony_civoid ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2260cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
2261cabdff1aSopenharmony_ci                         int height, int mx, int my)
2262cabdff1aSopenharmony_ci{
2263cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2264cabdff1aSopenharmony_ci
2265cabdff1aSopenharmony_ci    if (4 == height) {
2266cabdff1aSopenharmony_ci        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
2267cabdff1aSopenharmony_ci    } else {
2268cabdff1aSopenharmony_ci        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
2269cabdff1aSopenharmony_ci                                 height);
2270cabdff1aSopenharmony_ci    }
2271cabdff1aSopenharmony_ci}
2272cabdff1aSopenharmony_ci
2273cabdff1aSopenharmony_civoid ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2274cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
2275cabdff1aSopenharmony_ci                          int height, int mx, int my)
2276cabdff1aSopenharmony_ci{
2277cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2278cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2279cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
2280cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2281cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
2282cabdff1aSopenharmony_ci    v8i16 filt;
2283cabdff1aSopenharmony_ci
2284cabdff1aSopenharmony_ci    /* rearranging filter_y */
2285cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2286cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
2287cabdff1aSopenharmony_ci
2288cabdff1aSopenharmony_ci    src0 = LD_UB(src);
2289cabdff1aSopenharmony_ci    src += src_stride;
2290cabdff1aSopenharmony_ci
2291cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2292cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src1, src2, src3, src4);
2293cabdff1aSopenharmony_ci        src += (4 * src_stride);
2294cabdff1aSopenharmony_ci
2295cabdff1aSopenharmony_ci        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2296cabdff1aSopenharmony_ci        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2297cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2298cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2299cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2300cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst);
2301cabdff1aSopenharmony_ci        dst += dst_stride;
2302cabdff1aSopenharmony_ci
2303cabdff1aSopenharmony_ci        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2304cabdff1aSopenharmony_ci        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2305cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2306cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2307cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2308cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst);
2309cabdff1aSopenharmony_ci        dst += dst_stride;
2310cabdff1aSopenharmony_ci
2311cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2312cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2313cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2314cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst);
2315cabdff1aSopenharmony_ci        dst += dst_stride;
2316cabdff1aSopenharmony_ci
2317cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2318cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2319cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2320cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst);
2321cabdff1aSopenharmony_ci        dst += dst_stride;
2322cabdff1aSopenharmony_ci
2323cabdff1aSopenharmony_ci        src0 = src4;
2324cabdff1aSopenharmony_ci    }
2325cabdff1aSopenharmony_ci}
2326cabdff1aSopenharmony_ci
2327cabdff1aSopenharmony_civoid ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2328cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
2329cabdff1aSopenharmony_ci                          int height, int mx, int my)
2330cabdff1aSopenharmony_ci{
2331cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2332cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2333cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
2334cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2335cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
2336cabdff1aSopenharmony_ci    v8i16 filt;
2337cabdff1aSopenharmony_ci
2338cabdff1aSopenharmony_ci    /* rearranging filter_y */
2339cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2340cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
2341cabdff1aSopenharmony_ci
2342cabdff1aSopenharmony_ci    src0 = LD_UB(src);
2343cabdff1aSopenharmony_ci    src5 = LD_UB(src + 16);
2344cabdff1aSopenharmony_ci    src += src_stride;
2345cabdff1aSopenharmony_ci
2346cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2347cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src1, src2, src3, src4);
2348cabdff1aSopenharmony_ci        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2349cabdff1aSopenharmony_ci        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2350cabdff1aSopenharmony_ci
2351cabdff1aSopenharmony_ci        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
2352cabdff1aSopenharmony_ci        src += (4 * src_stride);
2353cabdff1aSopenharmony_ci
2354cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2355cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2356cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2357cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst);
2358cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2359cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2360cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2361cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2362cabdff1aSopenharmony_ci
2363cabdff1aSopenharmony_ci        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2364cabdff1aSopenharmony_ci        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2365cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2366cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2367cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2368cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
2369cabdff1aSopenharmony_ci
2370cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2371cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2372cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2373cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
2374cabdff1aSopenharmony_ci
2375cabdff1aSopenharmony_ci        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
2376cabdff1aSopenharmony_ci        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
2377cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2378cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2379cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2380cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst + 16);
2381cabdff1aSopenharmony_ci
2382cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2383cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2384cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2385cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
2386cabdff1aSopenharmony_ci
2387cabdff1aSopenharmony_ci        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
2388cabdff1aSopenharmony_ci        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
2389cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2390cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2391cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2392cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
2393cabdff1aSopenharmony_ci
2394cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2395cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2396cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2397cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
2398cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2399cabdff1aSopenharmony_ci
2400cabdff1aSopenharmony_ci        src0 = src4;
2401cabdff1aSopenharmony_ci        src5 = src9;
2402cabdff1aSopenharmony_ci    }
2403cabdff1aSopenharmony_ci}
2404cabdff1aSopenharmony_ci
2405cabdff1aSopenharmony_civoid ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2406cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
2407cabdff1aSopenharmony_ci                          int height, int mx, int my)
2408cabdff1aSopenharmony_ci{
2409cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2410cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2411cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2412cabdff1aSopenharmony_ci    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2413cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2414cabdff1aSopenharmony_ci    v8i16 filt;
2415cabdff1aSopenharmony_ci
2416cabdff1aSopenharmony_ci    /* rearranging filter_y */
2417cabdff1aSopenharmony_ci    filt = LD_SH(filter);
2418cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
2419cabdff1aSopenharmony_ci
2420cabdff1aSopenharmony_ci    LD_UB4(src, 16, src0, src3, src6, src9);
2421cabdff1aSopenharmony_ci    src += src_stride;
2422cabdff1aSopenharmony_ci
2423cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
2424cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src1, src2);
2425cabdff1aSopenharmony_ci        LD_UB2(src + 16, src_stride, src4, src5);
2426cabdff1aSopenharmony_ci        LD_UB2(src + 32, src_stride, src7, src8);
2427cabdff1aSopenharmony_ci        LD_UB2(src + 48, src_stride, src10, src11);
2428cabdff1aSopenharmony_ci        src += (2 * src_stride);
2429cabdff1aSopenharmony_ci
2430cabdff1aSopenharmony_ci        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2431cabdff1aSopenharmony_ci        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2432cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2433cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2434cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2435cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst);
2436cabdff1aSopenharmony_ci
2437cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2438cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2439cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2440cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2441cabdff1aSopenharmony_ci
2442cabdff1aSopenharmony_ci        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
2443cabdff1aSopenharmony_ci        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
2444cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2445cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp4, tmp5, 7);
2446cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp4, tmp5, 7);
2447cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp4, tmp5, dst + 16);
2448cabdff1aSopenharmony_ci
2449cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2450cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp6, tmp7, 7);
2451cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp6, tmp7, 7);
2452cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
2453cabdff1aSopenharmony_ci
2454cabdff1aSopenharmony_ci        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
2455cabdff1aSopenharmony_ci        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
2456cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2457cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
2458cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
2459cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst + 32);
2460cabdff1aSopenharmony_ci
2461cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2462cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
2463cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
2464cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
2465cabdff1aSopenharmony_ci
2466cabdff1aSopenharmony_ci        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
2467cabdff1aSopenharmony_ci        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
2468cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2469cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp4, tmp5, 7);
2470cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp4, tmp5, 7);
2471cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp4, tmp5, dst + 48);
2472cabdff1aSopenharmony_ci
2473cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2474cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp6, tmp7, 7);
2475cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp6, tmp7, 7);
2476cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
2477cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
2478cabdff1aSopenharmony_ci
2479cabdff1aSopenharmony_ci        src0 = src2;
2480cabdff1aSopenharmony_ci        src3 = src5;
2481cabdff1aSopenharmony_ci        src6 = src8;
2482cabdff1aSopenharmony_ci        src9 = src11;
2483cabdff1aSopenharmony_ci    }
2484cabdff1aSopenharmony_ci}
2485cabdff1aSopenharmony_ci
2486cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
2487cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
2488cabdff1aSopenharmony_ci                               const int8_t *filter_horiz, const int8_t *filter_vert)
2489cabdff1aSopenharmony_ci{
2490cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask;
2491cabdff1aSopenharmony_ci    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
2492cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
2493cabdff1aSopenharmony_ci
2494cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
2495cabdff1aSopenharmony_ci
2496cabdff1aSopenharmony_ci    /* rearranging filter */
2497cabdff1aSopenharmony_ci    filt = LD_UH(filter_horiz);
2498cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2499cabdff1aSopenharmony_ci
2500cabdff1aSopenharmony_ci    filt = LD_UH(filter_vert);
2501cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2502cabdff1aSopenharmony_ci
2503cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2504cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2505cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2506cabdff1aSopenharmony_ci    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2507cabdff1aSopenharmony_ci    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
2508cabdff1aSopenharmony_ci    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
2509cabdff1aSopenharmony_ci
2510cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2511cabdff1aSopenharmony_ci    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
2512cabdff1aSopenharmony_ci    SRARI_H2_UH(tmp0, tmp1, 7);
2513cabdff1aSopenharmony_ci    SAT_UH2_UH(tmp0, tmp1, 7);
2514cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
2515cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
2516cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2517cabdff1aSopenharmony_ci}
2518cabdff1aSopenharmony_ci
2519cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
2520cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
2521cabdff1aSopenharmony_ci                               const int8_t *filter_horiz, const int8_t *filter_vert)
2522cabdff1aSopenharmony_ci{
2523cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
2524cabdff1aSopenharmony_ci    v16i8 res0, res1, res2, res3;
2525cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2526cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2527cabdff1aSopenharmony_ci    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
2528cabdff1aSopenharmony_ci
2529cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
2530cabdff1aSopenharmony_ci
2531cabdff1aSopenharmony_ci    /* rearranging filter */
2532cabdff1aSopenharmony_ci    filt = LD_UH(filter_horiz);
2533cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2534cabdff1aSopenharmony_ci
2535cabdff1aSopenharmony_ci    filt = LD_UH(filter_vert);
2536cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2537cabdff1aSopenharmony_ci
2538cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2539cabdff1aSopenharmony_ci    src += (8 * src_stride);
2540cabdff1aSopenharmony_ci    src8 = LD_SB(src);
2541cabdff1aSopenharmony_ci
2542cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2543cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2544cabdff1aSopenharmony_ci    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
2545cabdff1aSopenharmony_ci    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
2546cabdff1aSopenharmony_ci    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
2547cabdff1aSopenharmony_ci    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
2548cabdff1aSopenharmony_ci               hz_out3, hz_out5);
2549cabdff1aSopenharmony_ci    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2550cabdff1aSopenharmony_ci
2551cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2552cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2553cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2554cabdff1aSopenharmony_ci                vec4, vec5, vec6, vec7);
2555cabdff1aSopenharmony_ci    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2556cabdff1aSopenharmony_ci    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2557cabdff1aSopenharmony_ci    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2558cabdff1aSopenharmony_ci                res0, res1, res2, res3);
2559cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
2560cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2561cabdff1aSopenharmony_ci    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2562cabdff1aSopenharmony_ci    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2563cabdff1aSopenharmony_ci}
2564cabdff1aSopenharmony_ci
2565cabdff1aSopenharmony_civoid ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2566cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
2567cabdff1aSopenharmony_ci                          int height, int mx, int my)
2568cabdff1aSopenharmony_ci{
2569cabdff1aSopenharmony_ci    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2570cabdff1aSopenharmony_ci    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2571cabdff1aSopenharmony_ci
2572cabdff1aSopenharmony_ci    if (4 == height) {
2573cabdff1aSopenharmony_ci        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2574cabdff1aSopenharmony_ci                                  filter_horiz, filter_vert);
2575cabdff1aSopenharmony_ci    } else if (8 == height) {
2576cabdff1aSopenharmony_ci        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2577cabdff1aSopenharmony_ci                                  filter_horiz, filter_vert);
2578cabdff1aSopenharmony_ci    }
2579cabdff1aSopenharmony_ci}
2580cabdff1aSopenharmony_ci
2581cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
2582cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
2583cabdff1aSopenharmony_ci                               const int8_t *filter_horiz, const int8_t *filter_vert)
2584cabdff1aSopenharmony_ci{
2585cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2586cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2587cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2588cabdff1aSopenharmony_ci    v8i16 filt;
2589cabdff1aSopenharmony_ci
2590cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2591cabdff1aSopenharmony_ci
2592cabdff1aSopenharmony_ci    /* rearranging filter */
2593cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
2594cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2595cabdff1aSopenharmony_ci
2596cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
2597cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2598cabdff1aSopenharmony_ci
2599cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2600cabdff1aSopenharmony_ci
2601cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2602cabdff1aSopenharmony_ci    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2603cabdff1aSopenharmony_ci    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2604cabdff1aSopenharmony_ci    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2605cabdff1aSopenharmony_ci
2606cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2607cabdff1aSopenharmony_ci    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2608cabdff1aSopenharmony_ci    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2609cabdff1aSopenharmony_ci
2610cabdff1aSopenharmony_ci    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2611cabdff1aSopenharmony_ci    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2612cabdff1aSopenharmony_ci    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2613cabdff1aSopenharmony_ci
2614cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2615cabdff1aSopenharmony_ci    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2616cabdff1aSopenharmony_ci    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2617cabdff1aSopenharmony_ci
2618cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2619cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2620cabdff1aSopenharmony_ci    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2621cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2622cabdff1aSopenharmony_ci}
2623cabdff1aSopenharmony_ci
2624cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2625cabdff1aSopenharmony_ci                                   uint8_t *dst, int32_t dst_stride,
2626cabdff1aSopenharmony_ci                                   const int8_t *filter_horiz, const int8_t *filter_vert,
2627cabdff1aSopenharmony_ci                                   int32_t height)
2628cabdff1aSopenharmony_ci{
2629cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2630cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2631cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0;
2632cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2633cabdff1aSopenharmony_ci    v8i16 filt;
2634cabdff1aSopenharmony_ci
2635cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2636cabdff1aSopenharmony_ci
2637cabdff1aSopenharmony_ci    /* rearranging filter */
2638cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
2639cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2640cabdff1aSopenharmony_ci
2641cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
2642cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2643cabdff1aSopenharmony_ci
2644cabdff1aSopenharmony_ci    src0 = LD_SB(src);
2645cabdff1aSopenharmony_ci    src += src_stride;
2646cabdff1aSopenharmony_ci
2647cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2648cabdff1aSopenharmony_ci
2649cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
2650cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src1, src2, src3, src4);
2651cabdff1aSopenharmony_ci        src += (4 * src_stride);
2652cabdff1aSopenharmony_ci
2653cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2654cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2655cabdff1aSopenharmony_ci        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2656cabdff1aSopenharmony_ci
2657cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2658cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2659cabdff1aSopenharmony_ci        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2660cabdff1aSopenharmony_ci
2661cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2662cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2663cabdff1aSopenharmony_ci
2664cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2665cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2666cabdff1aSopenharmony_ci        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2667cabdff1aSopenharmony_ci
2668cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2669cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src1, src2, src3, src4);
2670cabdff1aSopenharmony_ci        src += (4 * src_stride);
2671cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2672cabdff1aSopenharmony_ci        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2673cabdff1aSopenharmony_ci
2674cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp3, tmp4, 7);
2675cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp3, tmp4, 7);
2676cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2677cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2678cabdff1aSopenharmony_ci
2679cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2680cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2681cabdff1aSopenharmony_ci        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2682cabdff1aSopenharmony_ci
2683cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2684cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2685cabdff1aSopenharmony_ci        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2686cabdff1aSopenharmony_ci
2687cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2688cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2689cabdff1aSopenharmony_ci        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2690cabdff1aSopenharmony_ci
2691cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2692cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2693cabdff1aSopenharmony_ci        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2694cabdff1aSopenharmony_ci
2695cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2696cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2697cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2698cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2699cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
2700cabdff1aSopenharmony_ci    }
2701cabdff1aSopenharmony_ci}
2702cabdff1aSopenharmony_ci
2703cabdff1aSopenharmony_civoid ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2704cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
2705cabdff1aSopenharmony_ci                          int height, int mx, int my)
2706cabdff1aSopenharmony_ci{
2707cabdff1aSopenharmony_ci    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2708cabdff1aSopenharmony_ci    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2709cabdff1aSopenharmony_ci
2710cabdff1aSopenharmony_ci    if (4 == height) {
2711cabdff1aSopenharmony_ci        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2712cabdff1aSopenharmony_ci                                  filter_horiz, filter_vert);
2713cabdff1aSopenharmony_ci    } else {
2714cabdff1aSopenharmony_ci        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2715cabdff1aSopenharmony_ci                                      filter_horiz, filter_vert, height);
2716cabdff1aSopenharmony_ci    }
2717cabdff1aSopenharmony_ci}
2718cabdff1aSopenharmony_ci
2719cabdff1aSopenharmony_civoid ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2720cabdff1aSopenharmony_ci                           const uint8_t *src, ptrdiff_t src_stride,
2721cabdff1aSopenharmony_ci                           int height, int mx, int my)
2722cabdff1aSopenharmony_ci{
2723cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2724cabdff1aSopenharmony_ci    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2725cabdff1aSopenharmony_ci    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2726cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2727cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1;
2728cabdff1aSopenharmony_ci    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2729cabdff1aSopenharmony_ci    v8i16 filt;
2730cabdff1aSopenharmony_ci
2731cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2732cabdff1aSopenharmony_ci
2733cabdff1aSopenharmony_ci    /* rearranging filter */
2734cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
2735cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2736cabdff1aSopenharmony_ci
2737cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
2738cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2739cabdff1aSopenharmony_ci
2740cabdff1aSopenharmony_ci    LD_SB2(src, 8, src0, src1);
2741cabdff1aSopenharmony_ci    src += src_stride;
2742cabdff1aSopenharmony_ci
2743cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2744cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2745cabdff1aSopenharmony_ci
2746cabdff1aSopenharmony_ci
2747cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2748cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
2749cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2750cabdff1aSopenharmony_ci        src += (4 * src_stride);
2751cabdff1aSopenharmony_ci
2752cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2753cabdff1aSopenharmony_ci        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2754cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2755cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2756cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2757cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2758cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2759cabdff1aSopenharmony_ci        dst += dst_stride;
2760cabdff1aSopenharmony_ci
2761cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2762cabdff1aSopenharmony_ci        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2763cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2764cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2765cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2766cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2767cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2768cabdff1aSopenharmony_ci        dst += dst_stride;
2769cabdff1aSopenharmony_ci
2770cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2771cabdff1aSopenharmony_ci        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2772cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2773cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2774cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2775cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2776cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2777cabdff1aSopenharmony_ci        dst += dst_stride;
2778cabdff1aSopenharmony_ci
2779cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2780cabdff1aSopenharmony_ci        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2781cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2782cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2783cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2784cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2785cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2786cabdff1aSopenharmony_ci        dst += dst_stride;
2787cabdff1aSopenharmony_ci    }
2788cabdff1aSopenharmony_ci}
2789cabdff1aSopenharmony_ci
/**
 * Horizontal + vertical bilinear "put" for 32-pixel-wide blocks (MSA).
 *
 * Runs the 16-wide kernel twice, on columns 0-15 and then 16-31, with the
 * same strides, height and filter indices.
 */
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 32; col += 16) {
        ff_put_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
2803cabdff1aSopenharmony_ci
/**
 * Horizontal + vertical bilinear "put" for 64-pixel-wide blocks (MSA).
 *
 * Runs the 16-wide kernel four times, once per 16-column strip from left to
 * right, with the same strides, height and filter indices.
 */
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 64; col += 16) {
        ff_put_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
2817cabdff1aSopenharmony_ci
2818cabdff1aSopenharmony_cistatic void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
2819cabdff1aSopenharmony_ci                                              int32_t src_stride,
2820cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
2821cabdff1aSopenharmony_ci                                              const int8_t *filter)
2822cabdff1aSopenharmony_ci{
2823cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
2824cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask;
2825cabdff1aSopenharmony_ci    v16u8 filt0, dst0, vec0, vec1, res;
2826cabdff1aSopenharmony_ci    v8u16 vec2, vec3, filt;
2827cabdff1aSopenharmony_ci
2828cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
2829cabdff1aSopenharmony_ci
2830cabdff1aSopenharmony_ci    /* rearranging filter */
2831cabdff1aSopenharmony_ci    filt = LD_UH(filter);
2832cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2833cabdff1aSopenharmony_ci
2834cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2835cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2836cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2837cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2838cabdff1aSopenharmony_ci    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
2839cabdff1aSopenharmony_ci    SRARI_H2_UH(vec2, vec3, 7);
2840cabdff1aSopenharmony_ci
2841cabdff1aSopenharmony_ci    res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
2842cabdff1aSopenharmony_ci    res = (v16u8) __msa_aver_u_b(res, dst0);
2843cabdff1aSopenharmony_ci
2844cabdff1aSopenharmony_ci    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
2845cabdff1aSopenharmony_ci}
2846cabdff1aSopenharmony_ci
2847cabdff1aSopenharmony_cistatic void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
2848cabdff1aSopenharmony_ci                                              int32_t src_stride,
2849cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
2850cabdff1aSopenharmony_ci                                              const int8_t *filter)
2851cabdff1aSopenharmony_ci{
2852cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
2853cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2854cabdff1aSopenharmony_ci    v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
2855cabdff1aSopenharmony_ci    v16u8 dst0, dst1;
2856cabdff1aSopenharmony_ci    v8u16 vec4, vec5, vec6, vec7, filt;
2857cabdff1aSopenharmony_ci
2858cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
2859cabdff1aSopenharmony_ci
2860cabdff1aSopenharmony_ci    /* rearranging filter */
2861cabdff1aSopenharmony_ci    filt = LD_UH(filter);
2862cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2863cabdff1aSopenharmony_ci
2864cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2865cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2866cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2867cabdff1aSopenharmony_ci    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
2868cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
2869cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2870cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
2871cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
2872cabdff1aSopenharmony_ci                vec6, vec7);
2873cabdff1aSopenharmony_ci    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2874cabdff1aSopenharmony_ci    PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
2875cabdff1aSopenharmony_ci                res2, res3);
2876cabdff1aSopenharmony_ci    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
2877cabdff1aSopenharmony_ci    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
2878cabdff1aSopenharmony_ci    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2879cabdff1aSopenharmony_ci}
2880cabdff1aSopenharmony_ci
2881cabdff1aSopenharmony_civoid ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2882cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
2883cabdff1aSopenharmony_ci                         int height, int mx, int my)
2884cabdff1aSopenharmony_ci{
2885cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2886cabdff1aSopenharmony_ci
2887cabdff1aSopenharmony_ci    if (4 == height) {
2888cabdff1aSopenharmony_ci        common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
2889cabdff1aSopenharmony_ci                                          filter);
2890cabdff1aSopenharmony_ci    } else if (8 == height) {
2891cabdff1aSopenharmony_ci        common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
2892cabdff1aSopenharmony_ci                                          filter);
2893cabdff1aSopenharmony_ci    }
2894cabdff1aSopenharmony_ci}
2895cabdff1aSopenharmony_ci
2896cabdff1aSopenharmony_cistatic void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
2897cabdff1aSopenharmony_ci                                              int32_t src_stride,
2898cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
2899cabdff1aSopenharmony_ci                                              const int8_t *filter)
2900cabdff1aSopenharmony_ci{
2901cabdff1aSopenharmony_ci    int64_t tp0, tp1, tp2, tp3;
2902cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask;
2903cabdff1aSopenharmony_ci    v16u8 filt0, dst0, dst1;
2904cabdff1aSopenharmony_ci    v8u16 vec0, vec1, vec2, vec3, filt;
2905cabdff1aSopenharmony_ci
2906cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2907cabdff1aSopenharmony_ci
2908cabdff1aSopenharmony_ci    /* rearranging filter */
2909cabdff1aSopenharmony_ci    filt = LD_UH(filter);
2910cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2911cabdff1aSopenharmony_ci
2912cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2913cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2914cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2915cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2916cabdff1aSopenharmony_ci                vec0, vec1, vec2, vec3);
2917cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2918cabdff1aSopenharmony_ci    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2919cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
2920cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
2921cabdff1aSopenharmony_ci    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2922cabdff1aSopenharmony_ci}
2923cabdff1aSopenharmony_ci
2924cabdff1aSopenharmony_cistatic void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
2925cabdff1aSopenharmony_ci                                                  int32_t src_stride,
2926cabdff1aSopenharmony_ci                                                  uint8_t *dst,
2927cabdff1aSopenharmony_ci                                                  int32_t dst_stride,
2928cabdff1aSopenharmony_ci                                                  const int8_t *filter,
2929cabdff1aSopenharmony_ci                                                  int32_t height)
2930cabdff1aSopenharmony_ci{
2931cabdff1aSopenharmony_ci    int64_t tp0, tp1, tp2, tp3;
2932cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask;
2933cabdff1aSopenharmony_ci    v16u8 filt0, dst0, dst1;
2934cabdff1aSopenharmony_ci    v8u16 vec0, vec1, vec2, vec3, filt;
2935cabdff1aSopenharmony_ci
2936cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2937cabdff1aSopenharmony_ci
2938cabdff1aSopenharmony_ci    /* rearranging filter */
2939cabdff1aSopenharmony_ci    filt = LD_UH(filter);
2940cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2941cabdff1aSopenharmony_ci
2942cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2943cabdff1aSopenharmony_ci    src += (4 * src_stride);
2944cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2945cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2946cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2947cabdff1aSopenharmony_ci                vec2, vec3);
2948cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2949cabdff1aSopenharmony_ci    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2950cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
2951cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
2952cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
2953cabdff1aSopenharmony_ci    src += (4 * src_stride);
2954cabdff1aSopenharmony_ci    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2955cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
2956cabdff1aSopenharmony_ci
2957cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2958cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2959cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2960cabdff1aSopenharmony_ci                vec2, vec3);
2961cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2962cabdff1aSopenharmony_ci    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2963cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
2964cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
2965cabdff1aSopenharmony_ci    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2966cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
2967cabdff1aSopenharmony_ci
2968cabdff1aSopenharmony_ci    if (16 == height) {
2969cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
2970cabdff1aSopenharmony_ci        src += (4 * src_stride);
2971cabdff1aSopenharmony_ci
2972cabdff1aSopenharmony_ci        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2973cabdff1aSopenharmony_ci        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2974cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2975cabdff1aSopenharmony_ci                    vec1, vec2, vec3);
2976cabdff1aSopenharmony_ci        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2977cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2978cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
2979cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
2980cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
2981cabdff1aSopenharmony_ci        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2982cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
2983cabdff1aSopenharmony_ci
2984cabdff1aSopenharmony_ci        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2985cabdff1aSopenharmony_ci        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2986cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2987cabdff1aSopenharmony_ci                    vec1, vec2, vec3);
2988cabdff1aSopenharmony_ci        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2989cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2990cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
2991cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
2992cabdff1aSopenharmony_ci        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2993cabdff1aSopenharmony_ci    }
2994cabdff1aSopenharmony_ci}
2995cabdff1aSopenharmony_ci
2996cabdff1aSopenharmony_civoid ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2997cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
2998cabdff1aSopenharmony_ci                         int height, int mx, int my)
2999cabdff1aSopenharmony_ci{
3000cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3001cabdff1aSopenharmony_ci
3002cabdff1aSopenharmony_ci    if (4 == height) {
3003cabdff1aSopenharmony_ci        common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3004cabdff1aSopenharmony_ci                                          filter);
3005cabdff1aSopenharmony_ci    } else {
3006cabdff1aSopenharmony_ci        common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3007cabdff1aSopenharmony_ci                                              filter, height);
3008cabdff1aSopenharmony_ci    }
3009cabdff1aSopenharmony_ci}
3010cabdff1aSopenharmony_ci
3011cabdff1aSopenharmony_civoid ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3012cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3013cabdff1aSopenharmony_ci                          int height, int mx, int my)
3014cabdff1aSopenharmony_ci{
3015cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3016cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3017cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3018cabdff1aSopenharmony_ci    v16u8 filt0, dst0, dst1, dst2, dst3;
3019cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3020cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3021cabdff1aSopenharmony_ci
3022cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
3023cabdff1aSopenharmony_ci
3024cabdff1aSopenharmony_ci    /* rearranging filter */
3025cabdff1aSopenharmony_ci    filt = LD_UH(filter);
3026cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3027cabdff1aSopenharmony_ci
3028cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src2, src4, src6);
3029cabdff1aSopenharmony_ci    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3030cabdff1aSopenharmony_ci    src += (4 * src_stride);
3031cabdff1aSopenharmony_ci
3032cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3033cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3034cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3035cabdff1aSopenharmony_ci    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3036cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
3037cabdff1aSopenharmony_ci                res2, res3);
3038cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
3039cabdff1aSopenharmony_ci                res6, res7);
3040cabdff1aSopenharmony_ci    SRARI_H4_UH(res0, res1, res2, res3, 7);
3041cabdff1aSopenharmony_ci    SRARI_H4_UH(res4, res5, res6, res7, 7);
3042cabdff1aSopenharmony_ci    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3043cabdff1aSopenharmony_ci    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3044cabdff1aSopenharmony_ci    dst += dst_stride;
3045cabdff1aSopenharmony_ci    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3046cabdff1aSopenharmony_ci    dst += dst_stride;
3047cabdff1aSopenharmony_ci    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3048cabdff1aSopenharmony_ci    dst += dst_stride;
3049cabdff1aSopenharmony_ci    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3050cabdff1aSopenharmony_ci    dst += dst_stride;
3051cabdff1aSopenharmony_ci
3052cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
3053cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
3054cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3055cabdff1aSopenharmony_ci        src += (4 * src_stride);
3056cabdff1aSopenharmony_ci
3057cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3058cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3059cabdff1aSopenharmony_ci        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3060cabdff1aSopenharmony_ci        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3061cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
3062cabdff1aSopenharmony_ci                    res1, res2, res3);
3063cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
3064cabdff1aSopenharmony_ci                    res5, res6, res7);
3065cabdff1aSopenharmony_ci        SRARI_H4_UH(res0, res1, res2, res3, 7);
3066cabdff1aSopenharmony_ci        SRARI_H4_UH(res4, res5, res6, res7, 7);
3067cabdff1aSopenharmony_ci        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3068cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3069cabdff1aSopenharmony_ci        dst += dst_stride;
3070cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3071cabdff1aSopenharmony_ci        dst += dst_stride;
3072cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3073cabdff1aSopenharmony_ci        dst += dst_stride;
3074cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3075cabdff1aSopenharmony_ci        dst += dst_stride;
3076cabdff1aSopenharmony_ci    }
3077cabdff1aSopenharmony_ci}
3078cabdff1aSopenharmony_ci
3079cabdff1aSopenharmony_civoid ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3080cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3081cabdff1aSopenharmony_ci                          int height, int mx, int my)
3082cabdff1aSopenharmony_ci{
3083cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3084cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3085cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3086cabdff1aSopenharmony_ci    v16u8 filt0, dst0, dst1, dst2, dst3;
3087cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3088cabdff1aSopenharmony_ci    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3089cabdff1aSopenharmony_ci
3090cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
3091cabdff1aSopenharmony_ci
3092cabdff1aSopenharmony_ci    /* rearranging filter */
3093cabdff1aSopenharmony_ci    filt = LD_UH(filter);
3094cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3095cabdff1aSopenharmony_ci
3096cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
3097cabdff1aSopenharmony_ci        src0 = LD_SB(src);
3098cabdff1aSopenharmony_ci        src2 = LD_SB(src + 16);
3099cabdff1aSopenharmony_ci        src3 = LD_SB(src + 24);
3100cabdff1aSopenharmony_ci        src1 = __msa_sldi_b(src2, src0, 8);
3101cabdff1aSopenharmony_ci        src += src_stride;
3102cabdff1aSopenharmony_ci        src4 = LD_SB(src);
3103cabdff1aSopenharmony_ci        src6 = LD_SB(src + 16);
3104cabdff1aSopenharmony_ci        src7 = LD_SB(src + 24);
3105cabdff1aSopenharmony_ci        src5 = __msa_sldi_b(src6, src4, 8);
3106cabdff1aSopenharmony_ci        src += src_stride;
3107cabdff1aSopenharmony_ci
3108cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3109cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3110cabdff1aSopenharmony_ci        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3111cabdff1aSopenharmony_ci        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3112cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3113cabdff1aSopenharmony_ci                    res0, res1, res2, res3);
3114cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3115cabdff1aSopenharmony_ci                    res4, res5, res6, res7);
3116cabdff1aSopenharmony_ci        SRARI_H4_UH(res0, res1, res2, res3, 7);
3117cabdff1aSopenharmony_ci        SRARI_H4_UH(res4, res5, res6, res7, 7);
3118cabdff1aSopenharmony_ci        LD_UB2(dst, 16, dst0, dst1);
3119cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3120cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
3121cabdff1aSopenharmony_ci        dst += dst_stride;
3122cabdff1aSopenharmony_ci        LD_UB2(dst, 16, dst2, dst3);
3123cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3124cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
3125cabdff1aSopenharmony_ci        dst += dst_stride;
3126cabdff1aSopenharmony_ci    }
3127cabdff1aSopenharmony_ci}
3128cabdff1aSopenharmony_ci
3129cabdff1aSopenharmony_civoid ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3130cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3131cabdff1aSopenharmony_ci                          int height, int mx, int my)
3132cabdff1aSopenharmony_ci{
3133cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3134cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3135cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3136cabdff1aSopenharmony_ci    v16u8 filt0, dst0, dst1, dst2, dst3;
3137cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3138cabdff1aSopenharmony_ci    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
3139cabdff1aSopenharmony_ci
3140cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
3141cabdff1aSopenharmony_ci
3142cabdff1aSopenharmony_ci    /* rearranging filter */
3143cabdff1aSopenharmony_ci    filt = LD_UH(filter);
3144cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3145cabdff1aSopenharmony_ci
3146cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
3147cabdff1aSopenharmony_ci        LD_SB4(src, 16, src0, src2, src4, src6);
3148cabdff1aSopenharmony_ci        src7 = LD_SB(src + 56);
3149cabdff1aSopenharmony_ci        SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
3150cabdff1aSopenharmony_ci        src += src_stride;
3151cabdff1aSopenharmony_ci
3152cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3153cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3154cabdff1aSopenharmony_ci        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3155cabdff1aSopenharmony_ci        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3156cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3157cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
3158cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3159cabdff1aSopenharmony_ci                    out4, out5, out6, out7);
3160cabdff1aSopenharmony_ci        SRARI_H4_UH(out0, out1, out2, out3, 7);
3161cabdff1aSopenharmony_ci        SRARI_H4_UH(out4, out5, out6, out7, 7);
3162cabdff1aSopenharmony_ci        LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
3163cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
3164cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
3165cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
3166cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
3167cabdff1aSopenharmony_ci        dst += dst_stride;
3168cabdff1aSopenharmony_ci    }
3169cabdff1aSopenharmony_ci}
3170cabdff1aSopenharmony_ci
3171cabdff1aSopenharmony_cistatic void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
3172cabdff1aSopenharmony_ci                                              int32_t src_stride,
3173cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
3174cabdff1aSopenharmony_ci                                              const int8_t *filter)
3175cabdff1aSopenharmony_ci{
3176cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3177cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
3178cabdff1aSopenharmony_ci    v16u8 dst0, out, filt0, src2110, src4332;
3179cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r;
3180cabdff1aSopenharmony_ci    v8i16 filt;
3181cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1;
3182cabdff1aSopenharmony_ci
3183cabdff1aSopenharmony_ci    filt = LD_SH(filter);
3184cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
3185cabdff1aSopenharmony_ci
3186cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
3187cabdff1aSopenharmony_ci    src += (4 * src_stride);
3188cabdff1aSopenharmony_ci
3189cabdff1aSopenharmony_ci    src4 = LD_SB(src);
3190cabdff1aSopenharmony_ci    src += src_stride;
3191cabdff1aSopenharmony_ci
3192cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3193cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3194cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
3195cabdff1aSopenharmony_ci               src10_r, src21_r, src32_r, src43_r);
3196cabdff1aSopenharmony_ci    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3197cabdff1aSopenharmony_ci    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
3198cabdff1aSopenharmony_ci    SRARI_H2_UH(tmp0, tmp1, 7);
3199cabdff1aSopenharmony_ci    SAT_UH2_UH(tmp0, tmp1, 7);
3200cabdff1aSopenharmony_ci
3201cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3202cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, dst0);
3203cabdff1aSopenharmony_ci
3204cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3205cabdff1aSopenharmony_ci}
3206cabdff1aSopenharmony_ci
3207cabdff1aSopenharmony_cistatic void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
3208cabdff1aSopenharmony_ci                                              int32_t src_stride,
3209cabdff1aSopenharmony_ci                                              uint8_t *dst, int32_t dst_stride,
3210cabdff1aSopenharmony_ci                                              const int8_t *filter)
3211cabdff1aSopenharmony_ci{
3212cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3213cabdff1aSopenharmony_ci    v16u8 dst0, dst1;
3214cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
3215cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3216cabdff1aSopenharmony_ci    v16u8 src2110, src4332, src6554, src8776, filt0;
3217cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
3218cabdff1aSopenharmony_ci    v8i16 filt;
3219cabdff1aSopenharmony_ci
3220cabdff1aSopenharmony_ci    filt = LD_SH(filter);
3221cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
3222cabdff1aSopenharmony_ci
3223cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3224cabdff1aSopenharmony_ci    src += (8 * src_stride);
3225cabdff1aSopenharmony_ci    src8 = LD_SB(src);
3226cabdff1aSopenharmony_ci
3227cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3228cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3229cabdff1aSopenharmony_ci    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3230cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3231cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3232cabdff1aSopenharmony_ci               src32_r, src43_r);
3233cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3234cabdff1aSopenharmony_ci               src76_r, src87_r);
3235cabdff1aSopenharmony_ci    ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3236cabdff1aSopenharmony_ci               src87_r, src76_r, src2110, src4332, src6554, src8776);
3237cabdff1aSopenharmony_ci    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
3238cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
3239cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3240cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3241cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
3242cabdff1aSopenharmony_ci    AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
3243cabdff1aSopenharmony_ci    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3244cabdff1aSopenharmony_ci}
3245cabdff1aSopenharmony_ci
3246cabdff1aSopenharmony_civoid ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3247cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
3248cabdff1aSopenharmony_ci                         int height, int mx, int my)
3249cabdff1aSopenharmony_ci{
3250cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3251cabdff1aSopenharmony_ci
3252cabdff1aSopenharmony_ci    if (4 == height) {
3253cabdff1aSopenharmony_ci        common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3254cabdff1aSopenharmony_ci                                          filter);
3255cabdff1aSopenharmony_ci    } else if (8 == height) {
3256cabdff1aSopenharmony_ci        common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3257cabdff1aSopenharmony_ci                                          filter);
3258cabdff1aSopenharmony_ci    }
3259cabdff1aSopenharmony_ci}
3260cabdff1aSopenharmony_ci
3261cabdff1aSopenharmony_cistatic void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
3262cabdff1aSopenharmony_ci                                              int32_t src_stride,
3263cabdff1aSopenharmony_ci                                              uint8_t *dst,
3264cabdff1aSopenharmony_ci                                              int32_t dst_stride,
3265cabdff1aSopenharmony_ci                                              const int8_t *filter)
3266cabdff1aSopenharmony_ci{
3267cabdff1aSopenharmony_ci    int64_t tp0, tp1, tp2, tp3;
3268cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
3269cabdff1aSopenharmony_ci    v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
3270cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
3271cabdff1aSopenharmony_ci    v8i16 filt;
3272cabdff1aSopenharmony_ci
3273cabdff1aSopenharmony_ci    /* rearranging filter_y */
3274cabdff1aSopenharmony_ci    filt = LD_SH(filter);
3275cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
3276cabdff1aSopenharmony_ci
3277cabdff1aSopenharmony_ci    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
3278cabdff1aSopenharmony_ci    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3279cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
3280cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
3281cabdff1aSopenharmony_ci    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
3282cabdff1aSopenharmony_ci    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
3283cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3284cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
3285cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3286cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3287cabdff1aSopenharmony_ci    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3288cabdff1aSopenharmony_ci}
3289cabdff1aSopenharmony_ci
3290cabdff1aSopenharmony_cistatic void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
3291cabdff1aSopenharmony_ci                                                  int32_t src_stride,
3292cabdff1aSopenharmony_ci                                                  uint8_t *dst,
3293cabdff1aSopenharmony_ci                                                  int32_t dst_stride,
3294cabdff1aSopenharmony_ci                                                  const int8_t *filter,
3295cabdff1aSopenharmony_ci                                                  int32_t height)
3296cabdff1aSopenharmony_ci{
3297cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3298cabdff1aSopenharmony_ci    int64_t tp0, tp1, tp2, tp3;
3299cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3300cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
3301cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3302cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
3303cabdff1aSopenharmony_ci    v8i16 filt;
3304cabdff1aSopenharmony_ci
3305cabdff1aSopenharmony_ci    /* rearranging filter_y */
3306cabdff1aSopenharmony_ci    filt = LD_SH(filter);
3307cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
3308cabdff1aSopenharmony_ci
3309cabdff1aSopenharmony_ci    src0 = LD_UB(src);
3310cabdff1aSopenharmony_ci    src += src_stride;
3311cabdff1aSopenharmony_ci
3312cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
3313cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
3314cabdff1aSopenharmony_ci        src += (8 * src_stride);
3315cabdff1aSopenharmony_ci
3316cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3317cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
3318cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
3319cabdff1aSopenharmony_ci        LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3320cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst2);
3321cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst3);
3322cabdff1aSopenharmony_ci
3323cabdff1aSopenharmony_ci        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
3324cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
3325cabdff1aSopenharmony_ci        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
3326cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
3327cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3328cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
3329cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3330cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3331cabdff1aSopenharmony_ci        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3332cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3333cabdff1aSopenharmony_ci
3334cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3335cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
3336cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3337cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3338cabdff1aSopenharmony_ci        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
3339cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3340cabdff1aSopenharmony_ci
3341cabdff1aSopenharmony_ci        src0 = src8;
3342cabdff1aSopenharmony_ci    }
3343cabdff1aSopenharmony_ci}
3344cabdff1aSopenharmony_ci
3345cabdff1aSopenharmony_civoid ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3346cabdff1aSopenharmony_ci                         const uint8_t *src, ptrdiff_t src_stride,
3347cabdff1aSopenharmony_ci                         int height, int mx, int my)
3348cabdff1aSopenharmony_ci{
3349cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3350cabdff1aSopenharmony_ci
3351cabdff1aSopenharmony_ci    if (4 == height) {
3352cabdff1aSopenharmony_ci        common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3353cabdff1aSopenharmony_ci                                          filter);
3354cabdff1aSopenharmony_ci    } else {
3355cabdff1aSopenharmony_ci        common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3356cabdff1aSopenharmony_ci                                              filter, height);
3357cabdff1aSopenharmony_ci    }
3358cabdff1aSopenharmony_ci}
3359cabdff1aSopenharmony_ci
3360cabdff1aSopenharmony_civoid ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3361cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3362cabdff1aSopenharmony_ci                          int height, int mx, int my)
3363cabdff1aSopenharmony_ci{
3364cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3365cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3366cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
3367cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3368cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3369cabdff1aSopenharmony_ci
3370cabdff1aSopenharmony_ci    /* rearranging filter_y */
3371cabdff1aSopenharmony_ci    filt = LD_UH(filter);
3372cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3373cabdff1aSopenharmony_ci
3374cabdff1aSopenharmony_ci    src0 = LD_UB(src);
3375cabdff1aSopenharmony_ci    src += src_stride;
3376cabdff1aSopenharmony_ci
3377cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3378cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src1, src2, src3, src4);
3379cabdff1aSopenharmony_ci        src += (4 * src_stride);
3380cabdff1aSopenharmony_ci
3381cabdff1aSopenharmony_ci        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3382cabdff1aSopenharmony_ci        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3383cabdff1aSopenharmony_ci        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3384cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3385cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3386cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3387cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3388cabdff1aSopenharmony_ci        dst += dst_stride;
3389cabdff1aSopenharmony_ci
3390cabdff1aSopenharmony_ci        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3391cabdff1aSopenharmony_ci        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3392cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3393cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3394cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3395cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
3396cabdff1aSopenharmony_ci        dst += dst_stride;
3397cabdff1aSopenharmony_ci
3398cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3399cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3400cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3401cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3402cabdff1aSopenharmony_ci        dst += dst_stride;
3403cabdff1aSopenharmony_ci
3404cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3405cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3406cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3407cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
3408cabdff1aSopenharmony_ci        dst += dst_stride;
3409cabdff1aSopenharmony_ci
3410cabdff1aSopenharmony_ci        src0 = src4;
3411cabdff1aSopenharmony_ci    }
3412cabdff1aSopenharmony_ci}
3413cabdff1aSopenharmony_ci
3414cabdff1aSopenharmony_civoid ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3415cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3416cabdff1aSopenharmony_ci                          int height, int mx, int my)
3417cabdff1aSopenharmony_ci{
3418cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3419cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3420cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3421cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3422cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3423cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3424cabdff1aSopenharmony_ci
3425cabdff1aSopenharmony_ci    /* rearranging filter_y */
3426cabdff1aSopenharmony_ci    filt = LD_UH(filter);
3427cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3428cabdff1aSopenharmony_ci
3429cabdff1aSopenharmony_ci    LD_UB2(src, 16, src0, src5);
3430cabdff1aSopenharmony_ci    src += src_stride;
3431cabdff1aSopenharmony_ci
3432cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3433cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src1, src2, src3, src4);
3434cabdff1aSopenharmony_ci        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3435cabdff1aSopenharmony_ci        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3436cabdff1aSopenharmony_ci        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3437cabdff1aSopenharmony_ci
3438cabdff1aSopenharmony_ci        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
3439cabdff1aSopenharmony_ci        LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
3440cabdff1aSopenharmony_ci        src += (4 * src_stride);
3441cabdff1aSopenharmony_ci
3442cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3443cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3444cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3445cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3446cabdff1aSopenharmony_ci
3447cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3448cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3449cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3450cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3451cabdff1aSopenharmony_ci
3452cabdff1aSopenharmony_ci        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3453cabdff1aSopenharmony_ci        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3454cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3455cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3456cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3457cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
3458cabdff1aSopenharmony_ci
3459cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3460cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3461cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3462cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
3463cabdff1aSopenharmony_ci
3464cabdff1aSopenharmony_ci        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
3465cabdff1aSopenharmony_ci        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
3466cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3467cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3468cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3469cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
3470cabdff1aSopenharmony_ci
3471cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3472cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3473cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3474cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
3475cabdff1aSopenharmony_ci
3476cabdff1aSopenharmony_ci        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
3477cabdff1aSopenharmony_ci        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
3478cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3479cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3480cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3481cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
3482cabdff1aSopenharmony_ci
3483cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3484cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3485cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3486cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
3487cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3488cabdff1aSopenharmony_ci
3489cabdff1aSopenharmony_ci        src0 = src4;
3490cabdff1aSopenharmony_ci        src5 = src9;
3491cabdff1aSopenharmony_ci    }
3492cabdff1aSopenharmony_ci}
3493cabdff1aSopenharmony_ci
3494cabdff1aSopenharmony_civoid ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3495cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3496cabdff1aSopenharmony_ci                          int height, int mx, int my)
3497cabdff1aSopenharmony_ci{
3498cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3499cabdff1aSopenharmony_ci    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3500cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5;
3501cabdff1aSopenharmony_ci    v16u8 src6, src7, src8, src9, src10, src11, filt0;
3502cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3503cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3504cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3505cabdff1aSopenharmony_ci    v8u16 filt;
3506cabdff1aSopenharmony_ci
3507cabdff1aSopenharmony_ci    /* rearranging filter_y */
3508cabdff1aSopenharmony_ci    filt = LD_UH(filter);
3509cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3510cabdff1aSopenharmony_ci
3511cabdff1aSopenharmony_ci    LD_UB4(src, 16, src0, src3, src6, src9);
3512cabdff1aSopenharmony_ci    src += src_stride;
3513cabdff1aSopenharmony_ci
3514cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
3515cabdff1aSopenharmony_ci        LD_UB2(src, src_stride, src1, src2);
3516cabdff1aSopenharmony_ci        LD_UB2(dst, dst_stride, dst0, dst1);
3517cabdff1aSopenharmony_ci        LD_UB2(src + 16, src_stride, src4, src5);
3518cabdff1aSopenharmony_ci        LD_UB2(dst + 16, dst_stride, dst2, dst3);
3519cabdff1aSopenharmony_ci        LD_UB2(src + 32, src_stride, src7, src8);
3520cabdff1aSopenharmony_ci        LD_UB2(dst + 32, dst_stride, dst4, dst5);
3521cabdff1aSopenharmony_ci        LD_UB2(src + 48, src_stride, src10, src11);
3522cabdff1aSopenharmony_ci        LD_UB2(dst + 48, dst_stride, dst6, dst7);
3523cabdff1aSopenharmony_ci        src += (2 * src_stride);
3524cabdff1aSopenharmony_ci
3525cabdff1aSopenharmony_ci        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3526cabdff1aSopenharmony_ci        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3527cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3528cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3529cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3530cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3531cabdff1aSopenharmony_ci
3532cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3533cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3534cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3535cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3536cabdff1aSopenharmony_ci
3537cabdff1aSopenharmony_ci        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
3538cabdff1aSopenharmony_ci        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
3539cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3540cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp4, tmp5, 7);
3541cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp4, tmp5, 7);
3542cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
3543cabdff1aSopenharmony_ci
3544cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3545cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp6, tmp7, 7);
3546cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp6, tmp7, 7);
3547cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
3548cabdff1aSopenharmony_ci
3549cabdff1aSopenharmony_ci        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
3550cabdff1aSopenharmony_ci        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
3551cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3552cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3553cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3554cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
3555cabdff1aSopenharmony_ci
3556cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3557cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3558cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3559cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
3560cabdff1aSopenharmony_ci
3561cabdff1aSopenharmony_ci        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
3562cabdff1aSopenharmony_ci        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
3563cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3564cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp4, tmp5, 7);
3565cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp4, tmp5, 7);
3566cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
3567cabdff1aSopenharmony_ci
3568cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3569cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp6, tmp7, 7);
3570cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp6, tmp7, 7);
3571cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
3572cabdff1aSopenharmony_ci        dst += (2 * dst_stride);
3573cabdff1aSopenharmony_ci
3574cabdff1aSopenharmony_ci        src0 = src2;
3575cabdff1aSopenharmony_ci        src3 = src5;
3576cabdff1aSopenharmony_ci        src6 = src8;
3577cabdff1aSopenharmony_ci        src9 = src11;
3578cabdff1aSopenharmony_ci    }
3579cabdff1aSopenharmony_ci}
3580cabdff1aSopenharmony_ci
3581cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
3582cabdff1aSopenharmony_ci                                                   int32_t src_stride,
3583cabdff1aSopenharmony_ci                                                   uint8_t *dst,
3584cabdff1aSopenharmony_ci                                                   int32_t dst_stride,
3585cabdff1aSopenharmony_ci                                                   const int8_t *filter_horiz,
3586cabdff1aSopenharmony_ci                                                   const int8_t *filter_vert)
3587cabdff1aSopenharmony_ci{
3588cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3589cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask;
3590cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1;
3591cabdff1aSopenharmony_ci    v16u8 dst0, out;
3592cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
3593cabdff1aSopenharmony_ci
3594cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
3595cabdff1aSopenharmony_ci
3596cabdff1aSopenharmony_ci    /* rearranging filter */
3597cabdff1aSopenharmony_ci    filt = LD_UH(filter_horiz);
3598cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
3599cabdff1aSopenharmony_ci
3600cabdff1aSopenharmony_ci    filt = LD_UH(filter_vert);
3601cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
3602cabdff1aSopenharmony_ci
3603cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3604cabdff1aSopenharmony_ci
3605cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3606cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3607cabdff1aSopenharmony_ci    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3608cabdff1aSopenharmony_ci    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
3609cabdff1aSopenharmony_ci    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
3610cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3611cabdff1aSopenharmony_ci
3612cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3613cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3614cabdff1aSopenharmony_ci
3615cabdff1aSopenharmony_ci    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3616cabdff1aSopenharmony_ci    SRARI_H2_UH(tmp0, tmp1, 7);
3617cabdff1aSopenharmony_ci    SAT_UH2_UH(tmp0, tmp1, 7);
3618cabdff1aSopenharmony_ci
3619cabdff1aSopenharmony_ci    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3620cabdff1aSopenharmony_ci    out = __msa_aver_u_b(out, dst0);
3621cabdff1aSopenharmony_ci
3622cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3623cabdff1aSopenharmony_ci}
3624cabdff1aSopenharmony_ci
3625cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
3626cabdff1aSopenharmony_ci                                                   int32_t src_stride,
3627cabdff1aSopenharmony_ci                                                   uint8_t *dst,
3628cabdff1aSopenharmony_ci                                                   int32_t dst_stride,
3629cabdff1aSopenharmony_ci                                                   const int8_t *filter_horiz,
3630cabdff1aSopenharmony_ci                                                   const int8_t *filter_vert)
3631cabdff1aSopenharmony_ci{
3632cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
3633cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
3634cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
3635cabdff1aSopenharmony_ci    v16u8 dst0, dst1;
3636cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3637cabdff1aSopenharmony_ci    v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
3638cabdff1aSopenharmony_ci    v8i16 filt;
3639cabdff1aSopenharmony_ci
3640cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
3641cabdff1aSopenharmony_ci
3642cabdff1aSopenharmony_ci    /* rearranging filter */
3643cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
3644cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
3645cabdff1aSopenharmony_ci
3646cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
3647cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
3648cabdff1aSopenharmony_ci
3649cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3650cabdff1aSopenharmony_ci    src += (8 * src_stride);
3651cabdff1aSopenharmony_ci    src8 = LD_SB(src);
3652cabdff1aSopenharmony_ci
3653cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3654cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3655cabdff1aSopenharmony_ci    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
3656cabdff1aSopenharmony_ci    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
3657cabdff1aSopenharmony_ci    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
3658cabdff1aSopenharmony_ci    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
3659cabdff1aSopenharmony_ci               hz_out3, hz_out5);
3660cabdff1aSopenharmony_ci    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
3661cabdff1aSopenharmony_ci
3662cabdff1aSopenharmony_ci    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3663cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3664cabdff1aSopenharmony_ci    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3665cabdff1aSopenharmony_ci    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3666cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3667cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
3668cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
3669cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
3670cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3671cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3672cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
3673cabdff1aSopenharmony_ci    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
3674cabdff1aSopenharmony_ci    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3675cabdff1aSopenharmony_ci}
3676cabdff1aSopenharmony_ci
3677cabdff1aSopenharmony_civoid ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3678cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3679cabdff1aSopenharmony_ci                          int height, int mx, int my)
3680cabdff1aSopenharmony_ci{
3681cabdff1aSopenharmony_ci    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3682cabdff1aSopenharmony_ci    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3683cabdff1aSopenharmony_ci
3684cabdff1aSopenharmony_ci    if (4 == height) {
3685cabdff1aSopenharmony_ci        common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3686cabdff1aSopenharmony_ci                                               filter_horiz, filter_vert);
3687cabdff1aSopenharmony_ci    } else if (8 == height) {
3688cabdff1aSopenharmony_ci        common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3689cabdff1aSopenharmony_ci                                               filter_horiz, filter_vert);
3690cabdff1aSopenharmony_ci    }
3691cabdff1aSopenharmony_ci}
3692cabdff1aSopenharmony_ci
3693cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
3694cabdff1aSopenharmony_ci                                                   int32_t src_stride,
3695cabdff1aSopenharmony_ci                                                   uint8_t *dst,
3696cabdff1aSopenharmony_ci                                                   int32_t dst_stride,
3697cabdff1aSopenharmony_ci                                                   const int8_t *filter_horiz,
3698cabdff1aSopenharmony_ci                                                   const int8_t *filter_vert)
3699cabdff1aSopenharmony_ci{
3700cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
3701cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask;
3702cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
3703cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3704cabdff1aSopenharmony_ci    v8i16 filt;
3705cabdff1aSopenharmony_ci
3706cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
3707cabdff1aSopenharmony_ci
3708cabdff1aSopenharmony_ci    /* rearranging filter */
3709cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
3710cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
3711cabdff1aSopenharmony_ci
3712cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
3713cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
3714cabdff1aSopenharmony_ci
3715cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3716cabdff1aSopenharmony_ci    src += (5 * src_stride);
3717cabdff1aSopenharmony_ci
3718cabdff1aSopenharmony_ci    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3719cabdff1aSopenharmony_ci    INSERT_D2_UB(tp0, tp1, dst0);
3720cabdff1aSopenharmony_ci    INSERT_D2_UB(tp2, tp3, dst1);
3721cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3722cabdff1aSopenharmony_ci    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3723cabdff1aSopenharmony_ci    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3724cabdff1aSopenharmony_ci    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3725cabdff1aSopenharmony_ci
3726cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3727cabdff1aSopenharmony_ci    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3728cabdff1aSopenharmony_ci    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
3729cabdff1aSopenharmony_ci
3730cabdff1aSopenharmony_ci    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3731cabdff1aSopenharmony_ci    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3732cabdff1aSopenharmony_ci    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
3733cabdff1aSopenharmony_ci
3734cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3735cabdff1aSopenharmony_ci    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3736cabdff1aSopenharmony_ci    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
3737cabdff1aSopenharmony_ci
3738cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3739cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3740cabdff1aSopenharmony_ci    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3741cabdff1aSopenharmony_ci}
3742cabdff1aSopenharmony_ci
3743cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
3744cabdff1aSopenharmony_ci                                                       int32_t src_stride,
3745cabdff1aSopenharmony_ci                                                       uint8_t *dst,
3746cabdff1aSopenharmony_ci                                                       int32_t dst_stride,
3747cabdff1aSopenharmony_ci                                                       const int8_t *filter_horiz,
3748cabdff1aSopenharmony_ci                                                       const int8_t *filter_vert,
3749cabdff1aSopenharmony_ci                                                       int32_t height)
3750cabdff1aSopenharmony_ci{
3751cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3752cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3;
3753cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask;
3754cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
3755cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3756cabdff1aSopenharmony_ci    v8i16 filt;
3757cabdff1aSopenharmony_ci
3758cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
3759cabdff1aSopenharmony_ci
3760cabdff1aSopenharmony_ci    /* rearranging filter */
3761cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
3762cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
3763cabdff1aSopenharmony_ci
3764cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
3765cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
3766cabdff1aSopenharmony_ci
3767cabdff1aSopenharmony_ci    src0 = LD_SB(src);
3768cabdff1aSopenharmony_ci    src += src_stride;
3769cabdff1aSopenharmony_ci
3770cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3771cabdff1aSopenharmony_ci
3772cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3773cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src1, src2, src3, src4);
3774cabdff1aSopenharmony_ci        src += (4 * src_stride);
3775cabdff1aSopenharmony_ci
3776cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3777cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3778cabdff1aSopenharmony_ci        tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3779cabdff1aSopenharmony_ci
3780cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3781cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3782cabdff1aSopenharmony_ci        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
3783cabdff1aSopenharmony_ci
3784cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3785cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3786cabdff1aSopenharmony_ci
3787cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3788cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3789cabdff1aSopenharmony_ci        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
3790cabdff1aSopenharmony_ci
3791cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3792cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3793cabdff1aSopenharmony_ci        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
3794cabdff1aSopenharmony_ci
3795cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
3796cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
3797cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3798cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
3799cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
3800cabdff1aSopenharmony_ci        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3801cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
3802cabdff1aSopenharmony_ci    }
3803cabdff1aSopenharmony_ci}
3804cabdff1aSopenharmony_ci
3805cabdff1aSopenharmony_civoid ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3806cabdff1aSopenharmony_ci                          const uint8_t *src, ptrdiff_t src_stride,
3807cabdff1aSopenharmony_ci                          int height, int mx, int my)
3808cabdff1aSopenharmony_ci{
3809cabdff1aSopenharmony_ci    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3810cabdff1aSopenharmony_ci    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3811cabdff1aSopenharmony_ci
3812cabdff1aSopenharmony_ci    if (4 == height) {
3813cabdff1aSopenharmony_ci        common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3814cabdff1aSopenharmony_ci                                               filter_horiz, filter_vert);
3815cabdff1aSopenharmony_ci    } else {
3816cabdff1aSopenharmony_ci        common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride,
3817cabdff1aSopenharmony_ci                                                   dst, dst_stride,
3818cabdff1aSopenharmony_ci                                                   filter_horiz, filter_vert,
3819cabdff1aSopenharmony_ci                                                   height);
3820cabdff1aSopenharmony_ci    }
3821cabdff1aSopenharmony_ci}
3822cabdff1aSopenharmony_ci
3823cabdff1aSopenharmony_civoid ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3824cabdff1aSopenharmony_ci                           const uint8_t *src, ptrdiff_t src_stride,
3825cabdff1aSopenharmony_ci                           int height, int mx, int my)
3826cabdff1aSopenharmony_ci{
3827cabdff1aSopenharmony_ci    uint32_t loop_cnt;
3828cabdff1aSopenharmony_ci    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3829cabdff1aSopenharmony_ci    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3830cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3831cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
3832cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
3833cabdff1aSopenharmony_ci    v8i16 filt;
3834cabdff1aSopenharmony_ci
3835cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
3836cabdff1aSopenharmony_ci
3837cabdff1aSopenharmony_ci    /* rearranging filter */
3838cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
3839cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
3840cabdff1aSopenharmony_ci
3841cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
3842cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
3843cabdff1aSopenharmony_ci
3844cabdff1aSopenharmony_ci    LD_SB2(src, 8, src0, src1);
3845cabdff1aSopenharmony_ci    src += src_stride;
3846cabdff1aSopenharmony_ci
3847cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3848cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3849cabdff1aSopenharmony_ci
3850cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
3851cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
3852cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3853cabdff1aSopenharmony_ci        src += (4 * src_stride);
3854cabdff1aSopenharmony_ci        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3855cabdff1aSopenharmony_ci
3856cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3857cabdff1aSopenharmony_ci        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3858cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3859cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3860cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3861cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3862cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3863cabdff1aSopenharmony_ci        dst += dst_stride;
3864cabdff1aSopenharmony_ci
3865cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3866cabdff1aSopenharmony_ci        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3867cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3868cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3869cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3870cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3871cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
3872cabdff1aSopenharmony_ci        dst += dst_stride;
3873cabdff1aSopenharmony_ci
3874cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3875cabdff1aSopenharmony_ci        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
3876cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3877cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3878cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3879cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3880cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3881cabdff1aSopenharmony_ci        dst += dst_stride;
3882cabdff1aSopenharmony_ci
3883cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
3884cabdff1aSopenharmony_ci        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
3885cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3886cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3887cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
3888cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
3889cabdff1aSopenharmony_ci        PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
3890cabdff1aSopenharmony_ci        dst += dst_stride;
3891cabdff1aSopenharmony_ci    }
3892cabdff1aSopenharmony_ci}
3893cabdff1aSopenharmony_ci
/*
 * 32-pixel-wide variant: applies ff_avg_bilin_16hv_msa() to two adjacent
 * 16-column strips (column offsets 0 and 16), passing all other
 * arguments through unchanged.
 */
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 32; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3907cabdff1aSopenharmony_ci
/*
 * 64-pixel-wide variant: applies ff_avg_bilin_16hv_msa() to four adjacent
 * 16-column strips (column offsets 0, 16, 32 and 48), passing all other
 * arguments through unchanged.
 */
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 64; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3921cabdff1aSopenharmony_ci
/*
 * Copy an 8-byte-wide block from src to dst, one uint64_t per row, using
 * the LD4/SD4 helpers.
 *
 * Heights that are a multiple of 8 are moved eight rows per iteration,
 * other multiples of 4 four rows per iteration.  Any other height copies
 * nothing.
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t groups;
    uint64_t row0, row1, row2, row3, row4, row5, row6, row7;

    if (height % 8 == 0) {
        groups = height >> 3;
        while (groups--) {
            /* all eight rows are read before any of them is written */
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);
            LD4(src, src_stride, row4, row5, row6, row7);
            src += (4 * src_stride);

            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(row4, row5, row6, row7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
        return;
    }

    if (height % 4 == 0) {
        groups = height / 4;
        while (groups--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);

            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
3951cabdff1aSopenharmony_ci
3952cabdff1aSopenharmony_cistatic void copy_width16_msa(const uint8_t *src, int32_t src_stride,
3953cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
3954cabdff1aSopenharmony_ci                             int32_t height)
3955cabdff1aSopenharmony_ci{
3956cabdff1aSopenharmony_ci    int32_t cnt;
3957cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3958cabdff1aSopenharmony_ci
3959cabdff1aSopenharmony_ci    if (8 == height) {
3960cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3961cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3962cabdff1aSopenharmony_ci    } else if (16 == height) {
3963cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3964cabdff1aSopenharmony_ci        src += (8 * src_stride);
3965cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3966cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3967cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3968cabdff1aSopenharmony_ci        src += (8 * src_stride);
3969cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3970cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3971cabdff1aSopenharmony_ci    } else if (32 == height) {
3972cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3973cabdff1aSopenharmony_ci        src += (8 * src_stride);
3974cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3975cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3976cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3977cabdff1aSopenharmony_ci        src += (8 * src_stride);
3978cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3979cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3980cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3981cabdff1aSopenharmony_ci        src += (8 * src_stride);
3982cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3983cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
3984cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3985cabdff1aSopenharmony_ci        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3986cabdff1aSopenharmony_ci    } else if (0 == height % 4) {
3987cabdff1aSopenharmony_ci        for (cnt = (height >> 2); cnt--;) {
3988cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
3989cabdff1aSopenharmony_ci            src += (4 * src_stride);
3990cabdff1aSopenharmony_ci            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
3991cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
3992cabdff1aSopenharmony_ci        }
3993cabdff1aSopenharmony_ci    }
3994cabdff1aSopenharmony_ci}
3995cabdff1aSopenharmony_ci
3996cabdff1aSopenharmony_cistatic void copy_width32_msa(const uint8_t *src, int32_t src_stride,
3997cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
3998cabdff1aSopenharmony_ci                             int32_t height)
3999cabdff1aSopenharmony_ci{
4000cabdff1aSopenharmony_ci    int32_t cnt;
4001cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4002cabdff1aSopenharmony_ci
4003cabdff1aSopenharmony_ci    if (0 == height % 8) {
4004cabdff1aSopenharmony_ci        for (cnt = (height >> 3); cnt--;) {
4005cabdff1aSopenharmony_ci            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4006cabdff1aSopenharmony_ci            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
4007cabdff1aSopenharmony_ci            LD_UB8(src + 16, src_stride, src0, src1, src2, src3, src4, src5, src6,
4008cabdff1aSopenharmony_ci                   src7);
4009cabdff1aSopenharmony_ci            src += (8 * src_stride);
4010cabdff1aSopenharmony_ci            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16,
4011cabdff1aSopenharmony_ci                   dst_stride);
4012cabdff1aSopenharmony_ci            dst += (8 * dst_stride);
4013cabdff1aSopenharmony_ci        }
4014cabdff1aSopenharmony_ci    } else if (0 == height % 4) {
4015cabdff1aSopenharmony_ci        for (cnt = (height >> 2); cnt--;) {
4016cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
4017cabdff1aSopenharmony_ci            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4018cabdff1aSopenharmony_ci            src += (4 * src_stride);
4019cabdff1aSopenharmony_ci            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4020cabdff1aSopenharmony_ci            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4021cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
4022cabdff1aSopenharmony_ci        }
4023cabdff1aSopenharmony_ci    }
4024cabdff1aSopenharmony_ci}
4025cabdff1aSopenharmony_ci
4026cabdff1aSopenharmony_cistatic void copy_width64_msa(const uint8_t *src, int32_t src_stride,
4027cabdff1aSopenharmony_ci                             uint8_t *dst, int32_t dst_stride,
4028cabdff1aSopenharmony_ci                             int32_t height)
4029cabdff1aSopenharmony_ci{
4030cabdff1aSopenharmony_ci    int32_t cnt;
4031cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4032cabdff1aSopenharmony_ci    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4033cabdff1aSopenharmony_ci
4034cabdff1aSopenharmony_ci    for (cnt = (height >> 2); cnt--;) {
4035cabdff1aSopenharmony_ci        LD_UB4(src, 16, src0, src1, src2, src3);
4036cabdff1aSopenharmony_ci        src += src_stride;
4037cabdff1aSopenharmony_ci        LD_UB4(src, 16, src4, src5, src6, src7);
4038cabdff1aSopenharmony_ci        src += src_stride;
4039cabdff1aSopenharmony_ci        LD_UB4(src, 16, src8, src9, src10, src11);
4040cabdff1aSopenharmony_ci        src += src_stride;
4041cabdff1aSopenharmony_ci        LD_UB4(src, 16, src12, src13, src14, src15);
4042cabdff1aSopenharmony_ci        src += src_stride;
4043cabdff1aSopenharmony_ci
4044cabdff1aSopenharmony_ci        ST_UB4(src0, src1, src2, src3, dst, 16);
4045cabdff1aSopenharmony_ci        dst += dst_stride;
4046cabdff1aSopenharmony_ci        ST_UB4(src4, src5, src6, src7, dst, 16);
4047cabdff1aSopenharmony_ci        dst += dst_stride;
4048cabdff1aSopenharmony_ci        ST_UB4(src8, src9, src10, src11, dst, 16);
4049cabdff1aSopenharmony_ci        dst += dst_stride;
4050cabdff1aSopenharmony_ci        ST_UB4(src12, src13, src14, src15, dst, 16);
4051cabdff1aSopenharmony_ci        dst += dst_stride;
4052cabdff1aSopenharmony_ci    }
4053cabdff1aSopenharmony_ci}
4054cabdff1aSopenharmony_ci
4055cabdff1aSopenharmony_cistatic void avg_width4_msa(const uint8_t *src, int32_t src_stride,
4056cabdff1aSopenharmony_ci                           uint8_t *dst, int32_t dst_stride,
4057cabdff1aSopenharmony_ci                           int32_t height)
4058cabdff1aSopenharmony_ci{
4059cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3;
4060cabdff1aSopenharmony_ci    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
4061cabdff1aSopenharmony_ci
4062cabdff1aSopenharmony_ci    if (8 == height) {
4063cabdff1aSopenharmony_ci        LW4(src, src_stride, tp0, tp1, tp2, tp3);
4064cabdff1aSopenharmony_ci        src += 4 * src_stride;
4065cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4066cabdff1aSopenharmony_ci        LW4(src, src_stride, tp0, tp1, tp2, tp3);
4067cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
4068cabdff1aSopenharmony_ci        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4069cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4070cabdff1aSopenharmony_ci        LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
4071cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
4072cabdff1aSopenharmony_ci        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4073cabdff1aSopenharmony_ci        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4074cabdff1aSopenharmony_ci    } else if (4 == height) {
4075cabdff1aSopenharmony_ci        LW4(src, src_stride, tp0, tp1, tp2, tp3);
4076cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4077cabdff1aSopenharmony_ci        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4078cabdff1aSopenharmony_ci        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4079cabdff1aSopenharmony_ci        dst0 = __msa_aver_u_b(src0, dst0);
4080cabdff1aSopenharmony_ci        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
4081cabdff1aSopenharmony_ci    }
4082cabdff1aSopenharmony_ci}
4083cabdff1aSopenharmony_ci
4084cabdff1aSopenharmony_cistatic void avg_width8_msa(const uint8_t *src, int32_t src_stride,
4085cabdff1aSopenharmony_ci                           uint8_t *dst, int32_t dst_stride,
4086cabdff1aSopenharmony_ci                           int32_t height)
4087cabdff1aSopenharmony_ci{
4088cabdff1aSopenharmony_ci    int32_t cnt;
4089cabdff1aSopenharmony_ci    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
4090cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
4091cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3;
4092cabdff1aSopenharmony_ci
4093cabdff1aSopenharmony_ci    if (0 == (height % 8)) {
4094cabdff1aSopenharmony_ci        for (cnt = (height >> 3); cnt--;) {
4095cabdff1aSopenharmony_ci            LD4(src, src_stride, tp0, tp1, tp2, tp3);
4096cabdff1aSopenharmony_ci            src += 4 * src_stride;
4097cabdff1aSopenharmony_ci            LD4(src, src_stride, tp4, tp5, tp6, tp7);
4098cabdff1aSopenharmony_ci            src += 4 * src_stride;
4099cabdff1aSopenharmony_ci            INSERT_D2_UB(tp0, tp1, src0);
4100cabdff1aSopenharmony_ci            INSERT_D2_UB(tp2, tp3, src1);
4101cabdff1aSopenharmony_ci            INSERT_D2_UB(tp4, tp5, src2);
4102cabdff1aSopenharmony_ci            INSERT_D2_UB(tp6, tp7, src3);
4103cabdff1aSopenharmony_ci            LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4104cabdff1aSopenharmony_ci            LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
4105cabdff1aSopenharmony_ci            INSERT_D2_UB(tp0, tp1, dst0);
4106cabdff1aSopenharmony_ci            INSERT_D2_UB(tp2, tp3, dst1);
4107cabdff1aSopenharmony_ci            INSERT_D2_UB(tp4, tp5, dst2);
4108cabdff1aSopenharmony_ci            INSERT_D2_UB(tp6, tp7, dst3);
4109cabdff1aSopenharmony_ci            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
4110cabdff1aSopenharmony_ci                        dst1, dst2, dst3);
4111cabdff1aSopenharmony_ci            ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
4112cabdff1aSopenharmony_ci            dst += 8 * dst_stride;
4113cabdff1aSopenharmony_ci        }
4114cabdff1aSopenharmony_ci    } else if (4 == height) {
4115cabdff1aSopenharmony_ci        LD4(src, src_stride, tp0, tp1, tp2, tp3);
4116cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, src0);
4117cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, src1);
4118cabdff1aSopenharmony_ci        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4119cabdff1aSopenharmony_ci        INSERT_D2_UB(tp0, tp1, dst0);
4120cabdff1aSopenharmony_ci        INSERT_D2_UB(tp2, tp3, dst1);
4121cabdff1aSopenharmony_ci        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4122cabdff1aSopenharmony_ci        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
4123cabdff1aSopenharmony_ci    }
4124cabdff1aSopenharmony_ci}
4125cabdff1aSopenharmony_ci
4126cabdff1aSopenharmony_cistatic void avg_width16_msa(const uint8_t *src, int32_t src_stride,
4127cabdff1aSopenharmony_ci                            uint8_t *dst, int32_t dst_stride,
4128cabdff1aSopenharmony_ci                            int32_t height)
4129cabdff1aSopenharmony_ci{
4130cabdff1aSopenharmony_ci    int32_t cnt;
4131cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4132cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4133cabdff1aSopenharmony_ci
4134cabdff1aSopenharmony_ci    if (0 == (height % 8)) {
4135cabdff1aSopenharmony_ci        for (cnt = (height / 8); cnt--;) {
4136cabdff1aSopenharmony_ci            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4137cabdff1aSopenharmony_ci            src += (8 * src_stride);
4138cabdff1aSopenharmony_ci            LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4139cabdff1aSopenharmony_ci
4140cabdff1aSopenharmony_ci            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4141cabdff1aSopenharmony_ci                        dst0, dst1, dst2, dst3);
4142cabdff1aSopenharmony_ci            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4143cabdff1aSopenharmony_ci                        dst4, dst5, dst6, dst7);
4144cabdff1aSopenharmony_ci            ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
4145cabdff1aSopenharmony_ci            dst += (8 * dst_stride);
4146cabdff1aSopenharmony_ci        }
4147cabdff1aSopenharmony_ci    } else if (0 == (height % 4)) {
4148cabdff1aSopenharmony_ci        for (cnt = (height / 4); cnt--;) {
4149cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
4150cabdff1aSopenharmony_ci            src += (4 * src_stride);
4151cabdff1aSopenharmony_ci            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4152cabdff1aSopenharmony_ci
4153cabdff1aSopenharmony_ci            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4154cabdff1aSopenharmony_ci                        dst0, dst1, dst2, dst3);
4155cabdff1aSopenharmony_ci            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
4156cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
4157cabdff1aSopenharmony_ci        }
4158cabdff1aSopenharmony_ci    }
4159cabdff1aSopenharmony_ci}
4160cabdff1aSopenharmony_ci
4161cabdff1aSopenharmony_cistatic void avg_width32_msa(const uint8_t *src, int32_t src_stride,
4162cabdff1aSopenharmony_ci                            uint8_t *dst, int32_t dst_stride,
4163cabdff1aSopenharmony_ci                            int32_t height)
4164cabdff1aSopenharmony_ci{
4165cabdff1aSopenharmony_ci    int32_t cnt;
4166cabdff1aSopenharmony_ci    uint8_t *dst_dup = dst;
4167cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4168cabdff1aSopenharmony_ci    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4169cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4170cabdff1aSopenharmony_ci    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4171cabdff1aSopenharmony_ci
4172cabdff1aSopenharmony_ci    if (0 == (height % 8)) {
4173cabdff1aSopenharmony_ci        for (cnt = (height / 8); cnt--;) {
4174cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src2, src4, src6);
4175cabdff1aSopenharmony_ci            LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4176cabdff1aSopenharmony_ci            src += (4 * src_stride);
4177cabdff1aSopenharmony_ci            LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4178cabdff1aSopenharmony_ci            LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4179cabdff1aSopenharmony_ci            dst_dup += (4 * dst_stride);
4180cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src8, src10, src12, src14);
4181cabdff1aSopenharmony_ci            LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
4182cabdff1aSopenharmony_ci            src += (4 * src_stride);
4183cabdff1aSopenharmony_ci            LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
4184cabdff1aSopenharmony_ci            LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
4185cabdff1aSopenharmony_ci            dst_dup += (4 * dst_stride);
4186cabdff1aSopenharmony_ci
4187cabdff1aSopenharmony_ci            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4188cabdff1aSopenharmony_ci                        dst0, dst1, dst2, dst3);
4189cabdff1aSopenharmony_ci            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4190cabdff1aSopenharmony_ci                        dst4, dst5, dst6, dst7);
4191cabdff1aSopenharmony_ci            AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4192cabdff1aSopenharmony_ci                        dst8, dst9, dst10, dst11);
4193cabdff1aSopenharmony_ci            AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4194cabdff1aSopenharmony_ci                        dst12, dst13, dst14, dst15);
4195cabdff1aSopenharmony_ci
4196cabdff1aSopenharmony_ci            ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4197cabdff1aSopenharmony_ci            ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4198cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
4199cabdff1aSopenharmony_ci            ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
4200cabdff1aSopenharmony_ci            ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
4201cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
4202cabdff1aSopenharmony_ci        }
4203cabdff1aSopenharmony_ci    } else if (0 == (height % 4)) {
4204cabdff1aSopenharmony_ci        for (cnt = (height / 4); cnt--;) {
4205cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src2, src4, src6);
4206cabdff1aSopenharmony_ci            LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4207cabdff1aSopenharmony_ci            src += (4 * src_stride);
4208cabdff1aSopenharmony_ci            LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4209cabdff1aSopenharmony_ci            LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4210cabdff1aSopenharmony_ci            dst_dup += (4 * dst_stride);
4211cabdff1aSopenharmony_ci
4212cabdff1aSopenharmony_ci            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4213cabdff1aSopenharmony_ci                        dst0, dst1, dst2, dst3);
4214cabdff1aSopenharmony_ci            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4215cabdff1aSopenharmony_ci                        dst4, dst5, dst6, dst7);
4216cabdff1aSopenharmony_ci
4217cabdff1aSopenharmony_ci            ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4218cabdff1aSopenharmony_ci            ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4219cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
4220cabdff1aSopenharmony_ci        }
4221cabdff1aSopenharmony_ci    }
4222cabdff1aSopenharmony_ci}
4223cabdff1aSopenharmony_ci
4224cabdff1aSopenharmony_cistatic void avg_width64_msa(const uint8_t *src, int32_t src_stride,
4225cabdff1aSopenharmony_ci                            uint8_t *dst, int32_t dst_stride,
4226cabdff1aSopenharmony_ci                            int32_t height)
4227cabdff1aSopenharmony_ci{
4228cabdff1aSopenharmony_ci    int32_t cnt;
4229cabdff1aSopenharmony_ci    uint8_t *dst_dup = dst;
4230cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4231cabdff1aSopenharmony_ci    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4232cabdff1aSopenharmony_ci    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4233cabdff1aSopenharmony_ci    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4234cabdff1aSopenharmony_ci
4235cabdff1aSopenharmony_ci    for (cnt = (height / 4); cnt--;) {
4236cabdff1aSopenharmony_ci        LD_UB4(src, 16, src0, src1, src2, src3);
4237cabdff1aSopenharmony_ci        src += src_stride;
4238cabdff1aSopenharmony_ci        LD_UB4(src, 16, src4, src5, src6, src7);
4239cabdff1aSopenharmony_ci        src += src_stride;
4240cabdff1aSopenharmony_ci        LD_UB4(src, 16, src8, src9, src10, src11);
4241cabdff1aSopenharmony_ci        src += src_stride;
4242cabdff1aSopenharmony_ci        LD_UB4(src, 16, src12, src13, src14, src15);
4243cabdff1aSopenharmony_ci        src += src_stride;
4244cabdff1aSopenharmony_ci
4245cabdff1aSopenharmony_ci        LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
4246cabdff1aSopenharmony_ci        dst_dup += dst_stride;
4247cabdff1aSopenharmony_ci        LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
4248cabdff1aSopenharmony_ci        dst_dup += dst_stride;
4249cabdff1aSopenharmony_ci        LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
4250cabdff1aSopenharmony_ci        dst_dup += dst_stride;
4251cabdff1aSopenharmony_ci        LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
4252cabdff1aSopenharmony_ci        dst_dup += dst_stride;
4253cabdff1aSopenharmony_ci
4254cabdff1aSopenharmony_ci        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4255cabdff1aSopenharmony_ci                    dst0, dst1, dst2, dst3);
4256cabdff1aSopenharmony_ci        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4257cabdff1aSopenharmony_ci                    dst4, dst5, dst6, dst7);
4258cabdff1aSopenharmony_ci        AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4259cabdff1aSopenharmony_ci                    dst8, dst9, dst10, dst11);
4260cabdff1aSopenharmony_ci        AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4261cabdff1aSopenharmony_ci                    dst12, dst13, dst14, dst15);
4262cabdff1aSopenharmony_ci
4263cabdff1aSopenharmony_ci        ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
4264cabdff1aSopenharmony_ci        dst += dst_stride;
4265cabdff1aSopenharmony_ci        ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
4266cabdff1aSopenharmony_ci        dst += dst_stride;
4267cabdff1aSopenharmony_ci        ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
4268cabdff1aSopenharmony_ci        dst += dst_stride;
4269cabdff1aSopenharmony_ci        ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
4270cabdff1aSopenharmony_ci        dst += dst_stride;
4271cabdff1aSopenharmony_ci    }
4272cabdff1aSopenharmony_ci}
4273cabdff1aSopenharmony_ci
4274cabdff1aSopenharmony_cistatic const int8_t vp9_subpel_filters_msa[3][15][8] = {
4275cabdff1aSopenharmony_ci    [FILTER_8TAP_REGULAR] = {
4276cabdff1aSopenharmony_ci         {0, 1, -5, 126, 8, -3, 1, 0},
4277cabdff1aSopenharmony_ci         {-1, 3, -10, 122, 18, -6, 2, 0},
4278cabdff1aSopenharmony_ci         {-1, 4, -13, 118, 27, -9, 3, -1},
4279cabdff1aSopenharmony_ci         {-1, 4, -16, 112, 37, -11, 4, -1},
4280cabdff1aSopenharmony_ci         {-1, 5, -18, 105, 48, -14, 4, -1},
4281cabdff1aSopenharmony_ci         {-1, 5, -19, 97, 58, -16, 5, -1},
4282cabdff1aSopenharmony_ci         {-1, 6, -19, 88, 68, -18, 5, -1},
4283cabdff1aSopenharmony_ci         {-1, 6, -19, 78, 78, -19, 6, -1},
4284cabdff1aSopenharmony_ci         {-1, 5, -18, 68, 88, -19, 6, -1},
4285cabdff1aSopenharmony_ci         {-1, 5, -16, 58, 97, -19, 5, -1},
4286cabdff1aSopenharmony_ci         {-1, 4, -14, 48, 105, -18, 5, -1},
4287cabdff1aSopenharmony_ci         {-1, 4, -11, 37, 112, -16, 4, -1},
4288cabdff1aSopenharmony_ci         {-1, 3, -9, 27, 118, -13, 4, -1},
4289cabdff1aSopenharmony_ci         {0, 2, -6, 18, 122, -10, 3, -1},
4290cabdff1aSopenharmony_ci         {0, 1, -3, 8, 126, -5, 1, 0},
4291cabdff1aSopenharmony_ci    }, [FILTER_8TAP_SHARP] = {
4292cabdff1aSopenharmony_ci        {-1, 3, -7, 127, 8, -3, 1, 0},
4293cabdff1aSopenharmony_ci        {-2, 5, -13, 125, 17, -6, 3, -1},
4294cabdff1aSopenharmony_ci        {-3, 7, -17, 121, 27, -10, 5, -2},
4295cabdff1aSopenharmony_ci        {-4, 9, -20, 115, 37, -13, 6, -2},
4296cabdff1aSopenharmony_ci        {-4, 10, -23, 108, 48, -16, 8, -3},
4297cabdff1aSopenharmony_ci        {-4, 10, -24, 100, 59, -19, 9, -3},
4298cabdff1aSopenharmony_ci        {-4, 11, -24, 90, 70, -21, 10, -4},
4299cabdff1aSopenharmony_ci        {-4, 11, -23, 80, 80, -23, 11, -4},
4300cabdff1aSopenharmony_ci        {-4, 10, -21, 70, 90, -24, 11, -4},
4301cabdff1aSopenharmony_ci        {-3, 9, -19, 59, 100, -24, 10, -4},
4302cabdff1aSopenharmony_ci        {-3, 8, -16, 48, 108, -23, 10, -4},
4303cabdff1aSopenharmony_ci        {-2, 6, -13, 37, 115, -20, 9, -4},
4304cabdff1aSopenharmony_ci        {-2, 5, -10, 27, 121, -17, 7, -3},
4305cabdff1aSopenharmony_ci        {-1, 3, -6, 17, 125, -13, 5, -2},
4306cabdff1aSopenharmony_ci        {0, 1, -3, 8, 127, -7, 3, -1},
4307cabdff1aSopenharmony_ci    }, [FILTER_8TAP_SMOOTH] = {
4308cabdff1aSopenharmony_ci        {-3, -1, 32, 64, 38, 1, -3, 0},
4309cabdff1aSopenharmony_ci        {-2, -2, 29, 63, 41, 2, -3, 0},
4310cabdff1aSopenharmony_ci        {-2, -2, 26, 63, 43, 4, -4, 0},
4311cabdff1aSopenharmony_ci        {-2, -3, 24, 62, 46, 5, -4, 0},
4312cabdff1aSopenharmony_ci        {-2, -3, 21, 60, 49, 7, -4, 0},
4313cabdff1aSopenharmony_ci        {-1, -4, 18, 59, 51, 9, -4, 0},
4314cabdff1aSopenharmony_ci        {-1, -4, 16, 57, 53, 12, -4, -1},
4315cabdff1aSopenharmony_ci        {-1, -4, 14, 55, 55, 14, -4, -1},
4316cabdff1aSopenharmony_ci        {-1, -4, 12, 53, 57, 16, -4, -1},
4317cabdff1aSopenharmony_ci        {0, -4, 9, 51, 59, 18, -4, -1},
4318cabdff1aSopenharmony_ci        {0, -4, 7, 49, 60, 21, -3, -2},
4319cabdff1aSopenharmony_ci        {0, -4, 5, 46, 62, 24, -3, -2},
4320cabdff1aSopenharmony_ci        {0, -4, 4, 43, 63, 26, -2, -2},
4321cabdff1aSopenharmony_ci        {0, -3, 2, 41, 63, 29, -2, -2},
4322cabdff1aSopenharmony_ci        {0, -3, 1, 38, 64, 32, -1, -3},
4323cabdff1aSopenharmony_ci    }
4324cabdff1aSopenharmony_ci};
4325cabdff1aSopenharmony_ci
/*
 * Emit the six public 8-tap entry points for one block width and one filter
 * family:
 *
 *     ff_{put,avg}_8tap_<type>_<SIZE>{h,v,hv}_msa
 *
 * SIZE     - pasted into the emitted function names and into the names of
 *            the common_*_<SIZE>w_msa kernels they forward to.
 * type     - filter family name as it appears in the emitted function names.
 * type_idx - first index into vp9_subpel_filters_msa.
 *
 * The h variants pick their taps with mx, the v variants with my, and the hv
 * variants with both. Rows are fetched at [mx - 1] / [my - 1] without a range
 * check and each family holds 15 rows, so the value actually used has to be
 * in 1..15. All other arguments are forwarded unchanged (note the kernels
 * take src/srcstride before dst/dststride).
 */
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                           \
void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst,                          \
                                        ptrdiff_t dststride,                   \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *taps = vp9_subpel_filters_msa[type_idx][mx - 1];             \
                                                                               \
    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, taps, h);       \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst,                          \
                                        ptrdiff_t dststride,                   \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *taps = vp9_subpel_filters_msa[type_idx][my - 1];             \
                                                                               \
    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, taps, h);       \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst,                         \
                                         ptrdiff_t dststride,                  \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    const int8_t *taps_h = vp9_subpel_filters_msa[type_idx][mx - 1];           \
    const int8_t *taps_v = vp9_subpel_filters_msa[type_idx][my - 1];           \
                                                                               \
    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride,            \
                                    taps_h, taps_v, h);                        \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst,                          \
                                        ptrdiff_t dststride,                   \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *taps = vp9_subpel_filters_msa[type_idx][mx - 1];             \
                                                                               \
    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \
                                            taps, h);                          \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst,                          \
                                        ptrdiff_t dststride,                   \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *taps = vp9_subpel_filters_msa[type_idx][my - 1];             \
                                                                               \
    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \
                                            taps, h);                          \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst,                         \
                                         ptrdiff_t dststride,                  \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    const int8_t *taps_h = vp9_subpel_filters_msa[type_idx][mx - 1];           \
    const int8_t *taps_v = vp9_subpel_filters_msa[type_idx][my - 1];           \
                                                                               \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,          \
                                                 dststride, taps_h,            \
                                                 taps_v, h);                   \
}
4393cabdff1aSopenharmony_ci
/*
 * Emit ff_copy<SIZE>_msa and ff_avg<SIZE>_msa for one block width.
 *
 * ff_copy<SIZE>_msa forwards to copy_width<SIZE>_msa. The averaging wrapper
 * is produced by VP9_AVG_MIPS_MSA_FUNC, which emits exactly the
 * ff_avg<SIZE>_msa definition this macro needs. mx and my are part of the
 * signature but are not used by either wrapper.
 */
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
void ff_copy##SIZE##_msa(uint8_t *dst,                             \
                         ptrdiff_t dststride,                      \
                         const uint8_t *src,                       \
                         ptrdiff_t srcstride,                      \
                         int h, int mx, int my)                    \
{                                                                  \
    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
}                                                                  \
                                                                   \
VP9_AVG_MIPS_MSA_FUNC(SIZE)
4410cabdff1aSopenharmony_ci
/*
 * Emit ff_avg<SIZE>_msa for one block width.
 *
 * The wrapper forwards src, srcstride, dst, dststride and h (in that order)
 * to avg_width<SIZE>_msa. mx and my are part of the signature but unused.
 */
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                               \
void ff_avg##SIZE##_msa(uint8_t *dst,                             \
                        ptrdiff_t dststride,                      \
                        const uint8_t *src,                       \
                        ptrdiff_t srcstride,                      \
                        int h, int mx, int my)                    \
{                                                                 \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
}
4419cabdff1aSopenharmony_ci
4420cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR);
4421cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR);
4422cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR);
4423cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR);
4424cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR);
4425cabdff1aSopenharmony_ci
4426cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP);
4427cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP);
4428cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP);
4429cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP);
4430cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP);
4431cabdff1aSopenharmony_ci
4432cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
4433cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
4434cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
4435cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
4436cabdff1aSopenharmony_ciVP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
4437cabdff1aSopenharmony_ci
4438cabdff1aSopenharmony_ciVP9_COPY_AVG_MIPS_MSA_FUNC(64);
4439cabdff1aSopenharmony_ciVP9_COPY_AVG_MIPS_MSA_FUNC(32);
4440cabdff1aSopenharmony_ciVP9_COPY_AVG_MIPS_MSA_FUNC(16);
4441cabdff1aSopenharmony_ciVP9_COPY_AVG_MIPS_MSA_FUNC(8);
4442cabdff1aSopenharmony_ciVP9_AVG_MIPS_MSA_FUNC(4);
4443cabdff1aSopenharmony_ci
4444cabdff1aSopenharmony_ci#undef VP9_8TAP_MIPS_MSA_FUNC
4445cabdff1aSopenharmony_ci#undef VP9_COPY_AVG_MIPS_MSA_FUNC
4446cabdff1aSopenharmony_ci#undef VP9_AVG_MIPS_MSA_FUNC
4447