/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavcodec/vp8dsp.h"
22cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h"
23cabdff1aSopenharmony_ci#include "vp8dsp_mips.h"
24cabdff1aSopenharmony_ci
/*
 * Byte-index patterns for the horizontal filters.  A 16-byte row is loaded
 * with LD_UB and handed to the VSHF_B* shuffles as the mask operand; the
 * callers derive the masks for the following coefficient pairs by adding
 * 2 and 4 to the loaded row.  Every row lists overlapping (i, i + 1) pairs.
 */
static const uint8_t mc_filt_mask_arr[3 * 16] = {
    /* offset 0: 8-wide and 16-wide paths, pairs (0,1) .. (7,8) */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,

    /* offset 16: 4-wide paths, pairs starting at index 0 and at index 16 */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,

    /* offset 32: 4-wide paths, pairs starting at index 8 and at index 24 */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33cabdff1aSopenharmony_ci
/*
 * Sub-pixel filter coefficients, one row per fractional position.  The
 * ff_put_vp8_epel* functions select a row with (mx - 1) or (my - 1) and the
 * 6-tap paths splat halfwords 0..2 of it, i.e. the first six bytes.  Every
 * row sums to 128, which matches the rounding shift by 7 applied to the
 * filter sums.
 *
 * NOTE(review): rows are read with LD_SH; if that is a full 16-byte vector
 * load it reads 8 bytes beyond the 8-byte row (and beyond the table for the
 * last row) - confirm this is acceptable.
 */
static const int8_t subpel_filters_msa[7][8] = {
    { -6, 123,  12,  -1,   0,   0,   0,   0 },
    {  2, -11, 108,  36,  -8,   1,   0,   0 }, /* New 1/4 pel 6 tap filter */
    { -9,  93,  50,  -6,   0,   0,   0,   0 },
    {  3, -16,  77,  77, -16,   3,   0,   0 }, /* New 1/2 pel 6 tap filter */
    { -6,  50,  93,  -9,   0,   0,   0,   0 },
    {  1,  -8,  36, 108, -11,   2,   0,   0 }, /* New 1/4 pel 6 tap filter */
    { -1,  12, 123,  -6,   0,   0,   0,   0 },
};
43cabdff1aSopenharmony_ci
/*
 * Two-tap weight pairs in steps of 16; each pair sums to 128.
 * Presumably indexed like subpel_filters_msa (fraction - 1) by the bilinear
 * routines - confirm against the users of this table.
 */
static const int8_t bilinear_filters_msa[7][2] = {
    { 112,  16 },
    {  96,  32 },
    {  80,  48 },
    {  64,  64 },
    {  48,  80 },
    {  32,  96 },
    {  16, 112 }
};
53cabdff1aSopenharmony_ci
/*
 * 6-tap horizontal filter yielding a single v8i16 vector.
 *
 * src0/src1 are shuffled once per mask (VSHF_B3_SB), the three shuffled
 * vectors are combined with filt_h0..filt_h2 through DPADD_SH3_SH, and the
 * sum is rounded with a shift right by 7 (__msa_srari_h) and then saturated
 * (__msa_sat_s_h, immediate 7).  The macro is a statement expression whose
 * value is that final vector.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                   \
                        filt_h0, filt_h1, filt_h2)                         \
( {                                                                        \
    v16i8 pair0_m, pair1_m, pair2_m;                                       \
    v8i16 res_m;                                                           \
                                                                           \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
               pair0_m, pair1_m, pair2_m);                                 \
    res_m = DPADD_SH3_SH(pair0_m, pair1_m, pair2_m,                        \
                         filt_h0, filt_h1, filt_h2);                       \
                                                                           \
    /* round (>> 7), then saturate; this is the value of the macro */      \
    __msa_sat_s_h(__msa_srari_h(res_m, 7), 7);                             \
} )
70cabdff1aSopenharmony_ci
/*
 * 6-tap horizontal filter for the 4-wide case, four source vectors at once.
 *
 * For each mask the pairs (src0, src1) and (src2, src3) are shuffled with
 * VSHF_B2_SB and combined with the matching coefficient vector:
 * DOTP_SB2_SH writes the filt0 products to out0/out1, DPADD_SB2_SH then
 * accumulates the filt1 and filt2 products.  out0/out1 are updated in
 * place; rounding and saturation are left to the caller.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, mask2,                \
                                   filt0, filt1, filt2,                \
                                   out0, out1)                         \
{                                                                      \
    v16i8 lo_m, hi_m;                                                  \
                                                                       \
    /* first coefficient pair */                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, lo_m, hi_m);      \
    DOTP_SB2_SH(lo_m, hi_m, filt0, filt0, out0, out1);                 \
    /* second coefficient pair */                                      \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, lo_m, hi_m);      \
    DPADD_SB2_SH(lo_m, hi_m, filt1, filt1, out0, out1);                \
    /* third coefficient pair */                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, lo_m, hi_m);      \
    DPADD_SB2_SH(lo_m, hi_m, filt2, filt2, out0, out1);                \
}
85cabdff1aSopenharmony_ci
/*
 * 6-tap horizontal filter for the 8-wide case, four source vectors at once.
 *
 * Each source vector is shuffled against itself with every mask
 * (VSHF_B2_SB).  DOTP_SB4_SH writes the filt0 products to out0..out3 and
 * DPADD_SB4_SH accumulates the filt1 and filt2 products on top.  The
 * outputs are updated in place; rounding and saturation are left to the
 * caller.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2,                       \
                                   filt0, filt1, filt2,                       \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 a0_m, a1_m, a2_m, a3_m;   /* mask0, later mask1 shuffles */         \
    v16i8 b0_m, b1_m, b2_m, b3_m;   /* mask2 shuffles */                      \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, a0_m, a1_m);             \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, a2_m, a3_m);             \
    DOTP_SB4_SH(a0_m, a1_m, a2_m, a3_m, filt0, filt0, filt0, filt0,           \
                out0, out1, out2, out3);                                      \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, a0_m, a1_m);             \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, a2_m, a3_m);             \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, b0_m, b1_m);             \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, b2_m, b3_m);             \
                                                                              \
    DPADD_SB4_SH(a0_m, a1_m, a2_m, a3_m, filt1, filt1, filt1, filt1,          \
                 out0, out1, out2, out3);                                     \
    DPADD_SB4_SH(b0_m, b1_m, b2_m, b3_m, filt2, filt2, filt2, filt2,          \
                 out0, out1, out2, out3);                                     \
}
106cabdff1aSopenharmony_ci
/*
 * 4-tap multiply-accumulate: evaluates to
 *     __msa_dpadd_s_h(__msa_dotp_s_h(vec0, filt0), vec1, filt1)
 * as a v8i16, with every operand cast to v16i8 first.
 *
 * The accumulator carries an "_m" suffix so that it cannot capture a caller
 * argument (functions in this file commonly have a local called tmp0), and
 * each argument is parenthesised before the cast so that the cast applies
 * to the whole argument expression.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                     \
( {                                                                       \
    v8i16 dp4_acc_m;                                                      \
                                                                          \
    dp4_acc_m = __msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0));          \
    dp4_acc_m = __msa_dpadd_s_h(dp4_acc_m, (v16i8) (vec1),                \
                                (v16i8) (filt1));                         \
                                                                          \
    dp4_acc_m;                                                            \
} )
116cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal filter yielding a single v8i16 vector.
 *
 * src0/src1 are shuffled with mask0 and with mask1 (VSHF_B2_SB), the two
 * results are combined with filt_h0/filt_h1 by FILT_4TAP_DPADD_S_H, and the
 * sum is rounded with a shift right by 7 (__msa_srari_h) and saturated
 * (__msa_sat_s_h, immediate 7).  The macro is a statement expression whose
 * value is that final vector.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)      \
( {                                                                      \
    v16i8 first_m, second_m;                                             \
    v8i16 res_m;                                                         \
                                                                         \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, first_m, second_m); \
    res_m = FILT_4TAP_DPADD_S_H(first_m, second_m, filt_h0, filt_h1);    \
                                                                         \
    /* round (>> 7), then saturate; this is the value of the macro */    \
    __msa_sat_s_h(__msa_srari_h(res_m, 7), 7);                           \
} )
130cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal filter for the 4-wide case, four source vectors at once.
 *
 * (src0, src1) and (src2, src3) are shuffled with mask0 and DOTP_SB2_SH
 * writes the filt0 products to out0/out1; the same pairs are then shuffled
 * with mask1 and DPADD_SB2_SH accumulates the filt1 products.  Rounding and
 * saturation are left to the caller.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 lo_m, hi_m;                                                  \
                                                                       \
    /* first coefficient pair */                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, lo_m, hi_m);      \
    DOTP_SB2_SH(lo_m, hi_m, filt0, filt0, out0, out1);                 \
    /* second coefficient pair */                                      \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, lo_m, hi_m);      \
    DPADD_SB2_SH(lo_m, hi_m, filt1, filt1, out0, out1);                \
}
142cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal filter for the 8-wide case, four source vectors at once.
 *
 * Each source vector is shuffled against itself with mask0 and DOTP_SB4_SH
 * writes the filt0 products to out0..out3; the shuffles are repeated with
 * mask1 and DPADD_SB4_SH accumulates the filt1 products.  Rounding and
 * saturation are left to the caller.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, filt0, filt1,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                             \
                                                                              \
    /* first coefficient pair */                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, s0_m, s1_m);             \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, s2_m, s3_m);             \
    DOTP_SB4_SH(s0_m, s1_m, s2_m, s3_m, filt0, filt0, filt0, filt0,           \
                out0, out1, out2, out3);                                      \
    /* second coefficient pair */                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, s0_m, s1_m);             \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, s2_m, s3_m);             \
    DPADD_SB4_SH(s0_m, s1_m, s2_m, s3_m, filt1, filt1, filt1, filt1,          \
                 out0, out1, out2, out3);                                     \
}
158cabdff1aSopenharmony_ci
159cabdff1aSopenharmony_cistatic void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
160cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
161cabdff1aSopenharmony_ci                                 const int8_t *filter)
162cabdff1aSopenharmony_ci{
163cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
164cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, out;
165cabdff1aSopenharmony_ci    v8i16 filt, out0, out1;
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
168cabdff1aSopenharmony_ci    src -= 2;
169cabdff1aSopenharmony_ci
170cabdff1aSopenharmony_ci    /* rearranging filter */
171cabdff1aSopenharmony_ci    filt = LD_SH(filter);
172cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
173cabdff1aSopenharmony_ci
174cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
175cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
176cabdff1aSopenharmony_ci
177cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
178cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
179cabdff1aSopenharmony_ci    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
180cabdff1aSopenharmony_ci                               filt0, filt1, filt2, out0, out1);
181cabdff1aSopenharmony_ci    SRARI_H2_SH(out0, out1, 7);
182cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
183cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
184cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
185cabdff1aSopenharmony_ci}
186cabdff1aSopenharmony_ci
187cabdff1aSopenharmony_cistatic void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
188cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
189cabdff1aSopenharmony_ci                                 const int8_t *filter)
190cabdff1aSopenharmony_ci{
191cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
192cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, out;
193cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
194cabdff1aSopenharmony_ci
195cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
196cabdff1aSopenharmony_ci    src -= 2;
197cabdff1aSopenharmony_ci
198cabdff1aSopenharmony_ci    /* rearranging filter */
199cabdff1aSopenharmony_ci    filt = LD_SH(filter);
200cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
201cabdff1aSopenharmony_ci
202cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
203cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
204cabdff1aSopenharmony_ci
205cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
206cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
207cabdff1aSopenharmony_ci    src += (4 * src_stride);
208cabdff1aSopenharmony_ci    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
209cabdff1aSopenharmony_ci                               filt0, filt1, filt2, out0, out1);
210cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
211cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
212cabdff1aSopenharmony_ci    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213cabdff1aSopenharmony_ci                               filt0, filt1, filt2, out2, out3);
214cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 7);
215cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
216cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
217cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
218cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
219cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
220cabdff1aSopenharmony_ci}
221cabdff1aSopenharmony_ci
222cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
223cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
224cabdff1aSopenharmony_ci                             int height, int mx, int my)
225cabdff1aSopenharmony_ci{
226cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[mx - 1];
227cabdff1aSopenharmony_ci
228cabdff1aSopenharmony_ci    if (4 == height) {
229cabdff1aSopenharmony_ci        common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
230cabdff1aSopenharmony_ci    } else if (8 == height) {
231cabdff1aSopenharmony_ci        common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
232cabdff1aSopenharmony_ci    }
233cabdff1aSopenharmony_ci}
234cabdff1aSopenharmony_ci
235cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
236cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
237cabdff1aSopenharmony_ci                             int height, int mx, int my)
238cabdff1aSopenharmony_ci{
239cabdff1aSopenharmony_ci    uint32_t loop_cnt;
240cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[mx - 1];
241cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
242cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, tmp0, tmp1;
243cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
246cabdff1aSopenharmony_ci
247cabdff1aSopenharmony_ci    src -= 2;
248cabdff1aSopenharmony_ci
249cabdff1aSopenharmony_ci    /* rearranging filter */
250cabdff1aSopenharmony_ci    filt = LD_SH(filter);
251cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
254cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
255cabdff1aSopenharmony_ci
256cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
257cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
258cabdff1aSopenharmony_ci    src += (4 * src_stride);
259cabdff1aSopenharmony_ci    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
260cabdff1aSopenharmony_ci                               filt0, filt1, filt2, out0, out1, out2, out3);
261cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 7);
262cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
263cabdff1aSopenharmony_ci    tmp0 = PCKEV_XORI128_UB(out0, out1);
264cabdff1aSopenharmony_ci    tmp1 = PCKEV_XORI128_UB(out2, out3);
265cabdff1aSopenharmony_ci    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
267cabdff1aSopenharmony_ci
268cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
270cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
271cabdff1aSopenharmony_ci        src += (4 * src_stride);
272cabdff1aSopenharmony_ci        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
273cabdff1aSopenharmony_ci                                   filt0, filt1, filt2, out0, out1, out2, out3);
274cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
275cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
276cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
277cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2, out3);
278cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
280cabdff1aSopenharmony_ci    }
281cabdff1aSopenharmony_ci}
282cabdff1aSopenharmony_ci
283cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
284cabdff1aSopenharmony_ci                              uint8_t *src, ptrdiff_t src_stride,
285cabdff1aSopenharmony_ci                              int height, int mx, int my)
286cabdff1aSopenharmony_ci{
287cabdff1aSopenharmony_ci    uint32_t loop_cnt;
288cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[mx - 1];
289cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, out;
291cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
294cabdff1aSopenharmony_ci    src -= 2;
295cabdff1aSopenharmony_ci
296cabdff1aSopenharmony_ci    /* rearranging filter */
297cabdff1aSopenharmony_ci    filt = LD_SH(filter);
298cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
299cabdff1aSopenharmony_ci
300cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
301cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
302cabdff1aSopenharmony_ci
303cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
304cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
305cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
306cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
307cabdff1aSopenharmony_ci        src += (4 * src_stride);
308cabdff1aSopenharmony_ci
309cabdff1aSopenharmony_ci        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
310cabdff1aSopenharmony_ci                                   filt0, filt1, filt2, out0, out1, out2, out3);
311cabdff1aSopenharmony_ci        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
312cabdff1aSopenharmony_ci                                   filt0, filt1, filt2, out4, out5, out6, out7);
313cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
314cabdff1aSopenharmony_ci        SRARI_H4_SH(out4, out5, out6, out7, 7);
315cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
316cabdff1aSopenharmony_ci        SAT_SH4_SH(out4, out5, out6, out7, 7);
317cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
318cabdff1aSopenharmony_ci        ST_UB(out, dst);
319cabdff1aSopenharmony_ci        dst += dst_stride;
320cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
321cabdff1aSopenharmony_ci        ST_UB(out, dst);
322cabdff1aSopenharmony_ci        dst += dst_stride;
323cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out4, out5);
324cabdff1aSopenharmony_ci        ST_UB(out, dst);
325cabdff1aSopenharmony_ci        dst += dst_stride;
326cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out6, out7);
327cabdff1aSopenharmony_ci        ST_UB(out, dst);
328cabdff1aSopenharmony_ci        dst += dst_stride;
329cabdff1aSopenharmony_ci    }
330cabdff1aSopenharmony_ci}
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_civoid ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
333cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
334cabdff1aSopenharmony_ci                             int height, int mx, int my)
335cabdff1aSopenharmony_ci{
336cabdff1aSopenharmony_ci    uint32_t loop_cnt;
337cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[my - 1];
338cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
339cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340cabdff1aSopenharmony_ci    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
341cabdff1aSopenharmony_ci    v16u8 out;
342cabdff1aSopenharmony_ci    v8i16 filt, out10, out32;
343cabdff1aSopenharmony_ci
344cabdff1aSopenharmony_ci    src -= (2 * src_stride);
345cabdff1aSopenharmony_ci
346cabdff1aSopenharmony_ci    filt = LD_SH(filter);
347cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350cabdff1aSopenharmony_ci    src += (5 * src_stride);
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
353cabdff1aSopenharmony_ci               src32_r, src43_r);
354cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
355cabdff1aSopenharmony_ci    XORI_B2_128_SB(src2110, src4332);
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
358cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src5, src6, src7, src8);
359cabdff1aSopenharmony_ci        src += (4 * src_stride);
360cabdff1aSopenharmony_ci
361cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
363cabdff1aSopenharmony_ci        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
364cabdff1aSopenharmony_ci        XORI_B2_128_SB(src6554, src8776);
365cabdff1aSopenharmony_ci        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366cabdff1aSopenharmony_ci        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
367cabdff1aSopenharmony_ci        SRARI_H2_SH(out10, out32, 7);
368cabdff1aSopenharmony_ci        SAT_SH2_SH(out10, out32, 7);
369cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out10, out32);
370cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
372cabdff1aSopenharmony_ci
373cabdff1aSopenharmony_ci        src2110 = src6554;
374cabdff1aSopenharmony_ci        src4332 = src8776;
375cabdff1aSopenharmony_ci        src4 = src8;
376cabdff1aSopenharmony_ci    }
377cabdff1aSopenharmony_ci}
378cabdff1aSopenharmony_ci
379cabdff1aSopenharmony_civoid ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
380cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
381cabdff1aSopenharmony_ci                             int height, int mx, int my)
382cabdff1aSopenharmony_ci{
383cabdff1aSopenharmony_ci    uint32_t loop_cnt;
384cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[my - 1];
385cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
386cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387cabdff1aSopenharmony_ci    v16i8 src109_r, filt0, filt1, filt2;
388cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
389cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
390cabdff1aSopenharmony_ci
391cabdff1aSopenharmony_ci    src -= (2 * src_stride);
392cabdff1aSopenharmony_ci
393cabdff1aSopenharmony_ci    filt = LD_SH(filter);
394cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
395cabdff1aSopenharmony_ci
396cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397cabdff1aSopenharmony_ci    src += (5 * src_stride);
398cabdff1aSopenharmony_ci
399cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
400cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401cabdff1aSopenharmony_ci               src10_r, src32_r, src21_r, src43_r);
402cabdff1aSopenharmony_ci
403cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
404cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
405cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
406cabdff1aSopenharmony_ci        src += (4 * src_stride);
407cabdff1aSopenharmony_ci
408cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409cabdff1aSopenharmony_ci                   src87_r, src98_r, src109_r);
410cabdff1aSopenharmony_ci        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411cabdff1aSopenharmony_ci        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412cabdff1aSopenharmony_ci        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413cabdff1aSopenharmony_ci        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
414cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
415cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
416cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
417cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
418cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_ci        src10_r = src76_r;
422cabdff1aSopenharmony_ci        src32_r = src98_r;
423cabdff1aSopenharmony_ci        src21_r = src87_r;
424cabdff1aSopenharmony_ci        src43_r = src109_r;
425cabdff1aSopenharmony_ci        src4 = src10;
426cabdff1aSopenharmony_ci    }
427cabdff1aSopenharmony_ci}
428cabdff1aSopenharmony_ci
429cabdff1aSopenharmony_civoid ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
430cabdff1aSopenharmony_ci                              uint8_t *src, ptrdiff_t src_stride,
431cabdff1aSopenharmony_ci                              int height, int mx, int my)
432cabdff1aSopenharmony_ci{
433cabdff1aSopenharmony_ci    uint32_t loop_cnt;
434cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[my - 1];
435cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
436cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437cabdff1aSopenharmony_ci    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438cabdff1aSopenharmony_ci    v16i8 src65_l, src87_l, filt0, filt1, filt2;
439cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
440cabdff1aSopenharmony_ci    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
441cabdff1aSopenharmony_ci
442cabdff1aSopenharmony_ci    src -= (2 * src_stride);
443cabdff1aSopenharmony_ci
444cabdff1aSopenharmony_ci    filt = LD_SH(filter);
445cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
446cabdff1aSopenharmony_ci
447cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448cabdff1aSopenharmony_ci    src += (5 * src_stride);
449cabdff1aSopenharmony_ci
450cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
451cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452cabdff1aSopenharmony_ci               src32_r, src43_r, src21_r);
453cabdff1aSopenharmony_ci    ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454cabdff1aSopenharmony_ci               src32_l, src43_l, src21_l);
455cabdff1aSopenharmony_ci
456cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
457cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src5, src6, src7, src8);
458cabdff1aSopenharmony_ci        src += (4 * src_stride);
459cabdff1aSopenharmony_ci
460cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
461cabdff1aSopenharmony_ci        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462cabdff1aSopenharmony_ci                   src65_r, src76_r, src87_r);
463cabdff1aSopenharmony_ci        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464cabdff1aSopenharmony_ci                   src65_l, src76_l, src87_l);
465cabdff1aSopenharmony_ci        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
466cabdff1aSopenharmony_ci                              filt2);
467cabdff1aSopenharmony_ci        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
468cabdff1aSopenharmony_ci                              filt2);
469cabdff1aSopenharmony_ci        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
470cabdff1aSopenharmony_ci                              filt2);
471cabdff1aSopenharmony_ci        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
472cabdff1aSopenharmony_ci                              filt2);
473cabdff1aSopenharmony_ci        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
474cabdff1aSopenharmony_ci                              filt2);
475cabdff1aSopenharmony_ci        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
476cabdff1aSopenharmony_ci                              filt2);
477cabdff1aSopenharmony_ci        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
478cabdff1aSopenharmony_ci                              filt2);
479cabdff1aSopenharmony_ci        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
480cabdff1aSopenharmony_ci                              filt2);
481cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
482cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
483cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486cabdff1aSopenharmony_ci                    out3_r, tmp0, tmp1, tmp2, tmp3);
487cabdff1aSopenharmony_ci        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
488cabdff1aSopenharmony_ci        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
490cabdff1aSopenharmony_ci
491cabdff1aSopenharmony_ci        src10_r = src54_r;
492cabdff1aSopenharmony_ci        src32_r = src76_r;
493cabdff1aSopenharmony_ci        src21_r = src65_r;
494cabdff1aSopenharmony_ci        src43_r = src87_r;
495cabdff1aSopenharmony_ci        src10_l = src54_l;
496cabdff1aSopenharmony_ci        src32_l = src76_l;
497cabdff1aSopenharmony_ci        src21_l = src65_l;
498cabdff1aSopenharmony_ci        src43_l = src87_l;
499cabdff1aSopenharmony_ci        src4 = src8;
500cabdff1aSopenharmony_ci    }
501cabdff1aSopenharmony_ci}
502cabdff1aSopenharmony_ci
503cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
504cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
505cabdff1aSopenharmony_ci                               int height, int mx, int my)
506cabdff1aSopenharmony_ci{
507cabdff1aSopenharmony_ci    uint32_t loop_cnt;
508cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
509cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
510cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
511cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2;
512cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, out;
513cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1;
514cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515cabdff1aSopenharmony_ci    v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
516cabdff1aSopenharmony_ci
517cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
518cabdff1aSopenharmony_ci    src -= (2 + 2 * src_stride);
519cabdff1aSopenharmony_ci
520cabdff1aSopenharmony_ci    /* rearranging filter */
521cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
522cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
523cabdff1aSopenharmony_ci
524cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
525cabdff1aSopenharmony_ci    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
526cabdff1aSopenharmony_ci
527cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
528cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
529cabdff1aSopenharmony_ci
530cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531cabdff1aSopenharmony_ci    src += (5 * src_stride);
532cabdff1aSopenharmony_ci
533cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
534cabdff1aSopenharmony_ci    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
535cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
536cabdff1aSopenharmony_ci    hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
537cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
538cabdff1aSopenharmony_ci    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
539cabdff1aSopenharmony_ci    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
540cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
541cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
542cabdff1aSopenharmony_ci
543cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
544cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src5, src6);
545cabdff1aSopenharmony_ci        src += (2 * src_stride);
546cabdff1aSopenharmony_ci
547cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
548cabdff1aSopenharmony_ci        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
549cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
550cabdff1aSopenharmony_ci        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
551cabdff1aSopenharmony_ci
552cabdff1aSopenharmony_ci        LD_SB2(src, src_stride, src7, src8);
553cabdff1aSopenharmony_ci        src += (2 * src_stride);
554cabdff1aSopenharmony_ci
555cabdff1aSopenharmony_ci        XORI_B2_128_SB(src7, src8);
556cabdff1aSopenharmony_ci        hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
557cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
558cabdff1aSopenharmony_ci        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
559cabdff1aSopenharmony_ci
560cabdff1aSopenharmony_ci        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561cabdff1aSopenharmony_ci        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
562cabdff1aSopenharmony_ci
563cabdff1aSopenharmony_ci        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564cabdff1aSopenharmony_ci        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
565cabdff1aSopenharmony_ci
566cabdff1aSopenharmony_ci        SRARI_H2_SH(tmp0, tmp1, 7);
567cabdff1aSopenharmony_ci        SAT_SH2_SH(tmp0, tmp1, 7);
568cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
569cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
571cabdff1aSopenharmony_ci
572cabdff1aSopenharmony_ci        hz_out3 = hz_out7;
573cabdff1aSopenharmony_ci        out0 = out2;
574cabdff1aSopenharmony_ci        out1 = out3;
575cabdff1aSopenharmony_ci    }
576cabdff1aSopenharmony_ci}
577cabdff1aSopenharmony_ci
578cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
579cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
580cabdff1aSopenharmony_ci                               int height, int mx, int my)
581cabdff1aSopenharmony_ci{
582cabdff1aSopenharmony_ci    uint32_t loop_cnt;
583cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
584cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
585cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
586cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2;
587cabdff1aSopenharmony_ci    v16u8 mask0, mask1, mask2, vec0, vec1;
588cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
589cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3;
592cabdff1aSopenharmony_ci
593cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
594cabdff1aSopenharmony_ci    src -= (2 + 2 * src_stride);
595cabdff1aSopenharmony_ci
596cabdff1aSopenharmony_ci    /* rearranging filter */
597cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
598cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
599cabdff1aSopenharmony_ci
600cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
601cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
602cabdff1aSopenharmony_ci
603cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
604cabdff1aSopenharmony_ci    src += (5 * src_stride);
605cabdff1aSopenharmony_ci
606cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
607cabdff1aSopenharmony_ci    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
608cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
609cabdff1aSopenharmony_ci    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
610cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
611cabdff1aSopenharmony_ci    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
612cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
613cabdff1aSopenharmony_ci    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
614cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
615cabdff1aSopenharmony_ci    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
616cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
617cabdff1aSopenharmony_ci
618cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
619cabdff1aSopenharmony_ci    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
620cabdff1aSopenharmony_ci
621cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
623cabdff1aSopenharmony_ci
624cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
625cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src5, src6, src7, src8);
626cabdff1aSopenharmony_ci        src += (4 * src_stride);
627cabdff1aSopenharmony_ci
628cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
629cabdff1aSopenharmony_ci        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
630cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
631cabdff1aSopenharmony_ci        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632cabdff1aSopenharmony_ci        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
633cabdff1aSopenharmony_ci
634cabdff1aSopenharmony_ci        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
635cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
636cabdff1aSopenharmony_ci        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637cabdff1aSopenharmony_ci        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
638cabdff1aSopenharmony_ci
639cabdff1aSopenharmony_ci        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
640cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
641cabdff1aSopenharmony_ci        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642cabdff1aSopenharmony_ci        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
643cabdff1aSopenharmony_ci
644cabdff1aSopenharmony_ci        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
645cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
646cabdff1aSopenharmony_ci        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647cabdff1aSopenharmony_ci        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
648cabdff1aSopenharmony_ci
649cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
650cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
651cabdff1aSopenharmony_ci        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
652cabdff1aSopenharmony_ci        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
653cabdff1aSopenharmony_ci        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
655cabdff1aSopenharmony_ci
656cabdff1aSopenharmony_ci        hz_out4 = hz_out8;
657cabdff1aSopenharmony_ci        out0 = out2;
658cabdff1aSopenharmony_ci        out1 = out7;
659cabdff1aSopenharmony_ci        out3 = out5;
660cabdff1aSopenharmony_ci        out4 = out6;
661cabdff1aSopenharmony_ci    }
662cabdff1aSopenharmony_ci}
663cabdff1aSopenharmony_ci
664cabdff1aSopenharmony_ci
/*
 * 16-pixel-wide h6v6 "put": runs the 8-wide routine twice, the second time
 * on the columns 8 bytes to the right in both src and dst.
 *
 * All arguments are forwarded unchanged to ff_put_vp8_epel8_h6v6_msa(); see
 * that function for their meaning.
 */
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height,
                                  mx, my);

        /* step to the right-hand 8-column half */
        src += 8;
        dst += 8;
    }
}
679cabdff1aSopenharmony_ci
680cabdff1aSopenharmony_cistatic void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
681cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
682cabdff1aSopenharmony_ci                                 const int8_t *filter)
683cabdff1aSopenharmony_ci{
684cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
685cabdff1aSopenharmony_ci    v8i16 filt, out0, out1;
686cabdff1aSopenharmony_ci    v16u8 out;
687cabdff1aSopenharmony_ci
688cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[16]);
689cabdff1aSopenharmony_ci    src -= 1;
690cabdff1aSopenharmony_ci
691cabdff1aSopenharmony_ci    /* rearranging filter */
692cabdff1aSopenharmony_ci    filt = LD_SH(filter);
693cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
694cabdff1aSopenharmony_ci
695cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
696cabdff1aSopenharmony_ci
697cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
698cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
699cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
700cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
701cabdff1aSopenharmony_ci    SRARI_H2_SH(out0, out1, 7);
702cabdff1aSopenharmony_ci    SAT_SH2_SH(out0, out1, 7);
703cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
704cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
705cabdff1aSopenharmony_ci}
706cabdff1aSopenharmony_ci
707cabdff1aSopenharmony_cistatic void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
708cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
709cabdff1aSopenharmony_ci                                 const int8_t *filter)
710cabdff1aSopenharmony_ci{
711cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
712cabdff1aSopenharmony_ci    v16u8 out;
713cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
714cabdff1aSopenharmony_ci
715cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[16]);
716cabdff1aSopenharmony_ci    src -= 1;
717cabdff1aSopenharmony_ci
718cabdff1aSopenharmony_ci    /* rearranging filter */
719cabdff1aSopenharmony_ci    filt = LD_SH(filter);
720cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
721cabdff1aSopenharmony_ci
722cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
723cabdff1aSopenharmony_ci
724cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
725cabdff1aSopenharmony_ci    src += (4 * src_stride);
726cabdff1aSopenharmony_ci
727cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
728cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
729cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
730cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
731cabdff1aSopenharmony_ci    XORI_B4_128_SB(src0, src1, src2, src3);
732cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
733cabdff1aSopenharmony_ci                               filt0, filt1, out2, out3);
734cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 7);
735cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
736cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
737cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
738cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
739cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
740cabdff1aSopenharmony_ci}
741cabdff1aSopenharmony_ci
742cabdff1aSopenharmony_cistatic void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
743cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
744cabdff1aSopenharmony_ci                                  const int8_t *filter)
745cabdff1aSopenharmony_ci{
746cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
747cabdff1aSopenharmony_ci    v16i8 filt0, filt1, mask0, mask1;
748cabdff1aSopenharmony_ci    v16u8 out;
749cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
750cabdff1aSopenharmony_ci
751cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[16]);
752cabdff1aSopenharmony_ci    src -= 1;
753cabdff1aSopenharmony_ci
754cabdff1aSopenharmony_ci    /* rearranging filter */
755cabdff1aSopenharmony_ci    filt = LD_SH(filter);
756cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
757cabdff1aSopenharmony_ci
758cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
759cabdff1aSopenharmony_ci
760cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761cabdff1aSopenharmony_ci    src += (8 * src_stride);
762cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
763cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
764cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
765cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
766cabdff1aSopenharmony_ci                               filt0, filt1, out2, out3);
767cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 7);
768cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
769cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
770cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
772cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
773cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
775cabdff1aSopenharmony_ci
776cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777cabdff1aSopenharmony_ci    src += (8 * src_stride);
778cabdff1aSopenharmony_ci    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
779cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
780cabdff1aSopenharmony_ci                               filt0, filt1, out0, out1);
781cabdff1aSopenharmony_ci    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
782cabdff1aSopenharmony_ci                               filt0, filt1, out2, out3);
783cabdff1aSopenharmony_ci    SRARI_H4_SH(out0, out1, out2, out3, 7);
784cabdff1aSopenharmony_ci    SAT_SH4_SH(out0, out1, out2, out3, 7);
785cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out0, out1);
786cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
788cabdff1aSopenharmony_ci    out = PCKEV_XORI128_UB(out2, out3);
789cabdff1aSopenharmony_ci    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
790cabdff1aSopenharmony_ci}
791cabdff1aSopenharmony_ci
792cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
793cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
794cabdff1aSopenharmony_ci                             int height, int mx, int my)
795cabdff1aSopenharmony_ci{
796cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[mx - 1];
797cabdff1aSopenharmony_ci
798cabdff1aSopenharmony_ci    if (4 == height) {
799cabdff1aSopenharmony_ci        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
800cabdff1aSopenharmony_ci    } else if (8 == height) {
801cabdff1aSopenharmony_ci        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
802cabdff1aSopenharmony_ci    } else if (16 == height) {
803cabdff1aSopenharmony_ci        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
804cabdff1aSopenharmony_ci    }
805cabdff1aSopenharmony_ci}
806cabdff1aSopenharmony_ci
807cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
808cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
809cabdff1aSopenharmony_ci                             int height, int mx, int my)
810cabdff1aSopenharmony_ci{
811cabdff1aSopenharmony_ci    uint32_t loop_cnt;
812cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[mx - 1];
813cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
814cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
815cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3;
816cabdff1aSopenharmony_ci
817cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[0]);
818cabdff1aSopenharmony_ci    src -= 1;
819cabdff1aSopenharmony_ci
820cabdff1aSopenharmony_ci    /* rearranging filter */
821cabdff1aSopenharmony_ci    filt = LD_SH(filter);
822cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
823cabdff1aSopenharmony_ci
824cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
825cabdff1aSopenharmony_ci
826cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
827cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
828cabdff1aSopenharmony_ci        src += (4 * src_stride);
829cabdff1aSopenharmony_ci
830cabdff1aSopenharmony_ci        XORI_B4_128_SB(src0, src1, src2, src3);
831cabdff1aSopenharmony_ci        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
832cabdff1aSopenharmony_ci                                   filt1, out0, out1, out2, out3);
833cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
834cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
835cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0, out1);
836cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2, out3);
837cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
839cabdff1aSopenharmony_ci    }
840cabdff1aSopenharmony_ci}
841cabdff1aSopenharmony_ci
842cabdff1aSopenharmony_civoid ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
843cabdff1aSopenharmony_ci                              uint8_t *src, ptrdiff_t src_stride,
844cabdff1aSopenharmony_ci                              int height, int mx, int my)
845cabdff1aSopenharmony_ci{
846cabdff1aSopenharmony_ci    uint32_t loop_cnt;
847cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[mx - 1];
848cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
849cabdff1aSopenharmony_ci    v16i8 filt0, filt1, mask0, mask1;
850cabdff1aSopenharmony_ci    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
851cabdff1aSopenharmony_ci    v16u8 out;
852cabdff1aSopenharmony_ci
853cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[0]);
854cabdff1aSopenharmony_ci    src -= 1;
855cabdff1aSopenharmony_ci
856cabdff1aSopenharmony_ci    /* rearranging filter */
857cabdff1aSopenharmony_ci    filt = LD_SH(filter);
858cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
859cabdff1aSopenharmony_ci
860cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
861cabdff1aSopenharmony_ci
862cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
863cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
864cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865cabdff1aSopenharmony_ci        src += (4 * src_stride);
866cabdff1aSopenharmony_ci
867cabdff1aSopenharmony_ci        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
868cabdff1aSopenharmony_ci        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
869cabdff1aSopenharmony_ci                                   filt1, out0, out1, out2, out3);
870cabdff1aSopenharmony_ci        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
871cabdff1aSopenharmony_ci                                   filt1, out4, out5, out6, out7);
872cabdff1aSopenharmony_ci        SRARI_H4_SH(out0, out1, out2, out3, 7);
873cabdff1aSopenharmony_ci        SRARI_H4_SH(out4, out5, out6, out7, 7);
874cabdff1aSopenharmony_ci        SAT_SH4_SH(out0, out1, out2, out3, 7);
875cabdff1aSopenharmony_ci        SAT_SH4_SH(out4, out5, out6, out7, 7);
876cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out0, out1);
877cabdff1aSopenharmony_ci        ST_UB(out, dst);
878cabdff1aSopenharmony_ci        dst += dst_stride;
879cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out2, out3);
880cabdff1aSopenharmony_ci        ST_UB(out, dst);
881cabdff1aSopenharmony_ci        dst += dst_stride;
882cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out4, out5);
883cabdff1aSopenharmony_ci        ST_UB(out, dst);
884cabdff1aSopenharmony_ci        dst += dst_stride;
885cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out6, out7);
886cabdff1aSopenharmony_ci        ST_UB(out, dst);
887cabdff1aSopenharmony_ci        dst += dst_stride;
888cabdff1aSopenharmony_ci    }
889cabdff1aSopenharmony_ci}
890cabdff1aSopenharmony_ci
891cabdff1aSopenharmony_civoid ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
892cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
893cabdff1aSopenharmony_ci                             int height, int mx, int my)
894cabdff1aSopenharmony_ci{
895cabdff1aSopenharmony_ci    uint32_t loop_cnt;
896cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[my - 1];
897cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5;
898cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
899cabdff1aSopenharmony_ci    v16i8 src2110, src4332, filt0, filt1;
900cabdff1aSopenharmony_ci    v8i16 filt, out10, out32;
901cabdff1aSopenharmony_ci    v16u8 out;
902cabdff1aSopenharmony_ci
903cabdff1aSopenharmony_ci    src -= src_stride;
904cabdff1aSopenharmony_ci
905cabdff1aSopenharmony_ci    filt = LD_SH(filter);
906cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
907cabdff1aSopenharmony_ci
908cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
909cabdff1aSopenharmony_ci    src += (3 * src_stride);
910cabdff1aSopenharmony_ci
911cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
912cabdff1aSopenharmony_ci
913cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
914cabdff1aSopenharmony_ci    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
915cabdff1aSopenharmony_ci
916cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
917cabdff1aSopenharmony_ci        LD_SB3(src, src_stride, src3, src4, src5);
918cabdff1aSopenharmony_ci        src += (3 * src_stride);
919cabdff1aSopenharmony_ci        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
920cabdff1aSopenharmony_ci        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
921cabdff1aSopenharmony_ci        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
922cabdff1aSopenharmony_ci        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
923cabdff1aSopenharmony_ci
924cabdff1aSopenharmony_ci        src2 = LD_SB(src);
925cabdff1aSopenharmony_ci        src += (src_stride);
926cabdff1aSopenharmony_ci        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
927cabdff1aSopenharmony_ci        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
928cabdff1aSopenharmony_ci        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
929cabdff1aSopenharmony_ci        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
930cabdff1aSopenharmony_ci        SRARI_H2_SH(out10, out32, 7);
931cabdff1aSopenharmony_ci        SAT_SH2_SH(out10, out32, 7);
932cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(out10, out32);
933cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
934cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
935cabdff1aSopenharmony_ci    }
936cabdff1aSopenharmony_ci}
937cabdff1aSopenharmony_ci
938cabdff1aSopenharmony_civoid ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
939cabdff1aSopenharmony_ci                             uint8_t *src, ptrdiff_t src_stride,
940cabdff1aSopenharmony_ci                             int height, int mx, int my)
941cabdff1aSopenharmony_ci{
942cabdff1aSopenharmony_ci    uint32_t loop_cnt;
943cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[my - 1];
944cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src7, src8, src9, src10;
945cabdff1aSopenharmony_ci    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
946cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1;
947cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
948cabdff1aSopenharmony_ci
949cabdff1aSopenharmony_ci    src -= src_stride;
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci    filt = LD_SH(filter);
952cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
953cabdff1aSopenharmony_ci
954cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
955cabdff1aSopenharmony_ci    src += (3 * src_stride);
956cabdff1aSopenharmony_ci
957cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
958cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
959cabdff1aSopenharmony_ci
960cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
961cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src7, src8, src9, src10);
962cabdff1aSopenharmony_ci        src += (4 * src_stride);
963cabdff1aSopenharmony_ci
964cabdff1aSopenharmony_ci        XORI_B4_128_SB(src7, src8, src9, src10);
965cabdff1aSopenharmony_ci        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
966cabdff1aSopenharmony_ci                   src72_r, src87_r, src98_r, src109_r);
967cabdff1aSopenharmony_ci        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
968cabdff1aSopenharmony_ci        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
969cabdff1aSopenharmony_ci        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
970cabdff1aSopenharmony_ci        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
971cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
972cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
973cabdff1aSopenharmony_ci        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
974cabdff1aSopenharmony_ci        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
975cabdff1aSopenharmony_ci        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
976cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
977cabdff1aSopenharmony_ci
978cabdff1aSopenharmony_ci        src10_r = src98_r;
979cabdff1aSopenharmony_ci        src21_r = src109_r;
980cabdff1aSopenharmony_ci        src2 = src10;
981cabdff1aSopenharmony_ci    }
982cabdff1aSopenharmony_ci}
983cabdff1aSopenharmony_ci
984cabdff1aSopenharmony_civoid ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
985cabdff1aSopenharmony_ci                              uint8_t *src, ptrdiff_t src_stride,
986cabdff1aSopenharmony_ci                              int height, int mx, int my)
987cabdff1aSopenharmony_ci{
988cabdff1aSopenharmony_ci    uint32_t loop_cnt;
989cabdff1aSopenharmony_ci    const int8_t *filter = subpel_filters_msa[my - 1];
990cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
991cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
992cabdff1aSopenharmony_ci    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
993cabdff1aSopenharmony_ci    v16u8 tmp0, tmp1, tmp2, tmp3;
994cabdff1aSopenharmony_ci    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
995cabdff1aSopenharmony_ci
996cabdff1aSopenharmony_ci    src -= src_stride;
997cabdff1aSopenharmony_ci
998cabdff1aSopenharmony_ci    filt = LD_SH(filter);
999cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1000cabdff1aSopenharmony_ci
1001cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
1002cabdff1aSopenharmony_ci    src += (3 * src_stride);
1003cabdff1aSopenharmony_ci
1004cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
1005cabdff1aSopenharmony_ci    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
1006cabdff1aSopenharmony_ci    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
1007cabdff1aSopenharmony_ci
1008cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1009cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
1010cabdff1aSopenharmony_ci        src += (4 * src_stride);
1011cabdff1aSopenharmony_ci
1012cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
1013cabdff1aSopenharmony_ci        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1014cabdff1aSopenharmony_ci                   src32_r, src43_r, src54_r, src65_r);
1015cabdff1aSopenharmony_ci        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1016cabdff1aSopenharmony_ci                   src32_l, src43_l, src54_l, src65_l);
1017cabdff1aSopenharmony_ci        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
1018cabdff1aSopenharmony_ci        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
1019cabdff1aSopenharmony_ci        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
1020cabdff1aSopenharmony_ci        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
1021cabdff1aSopenharmony_ci        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
1022cabdff1aSopenharmony_ci        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
1023cabdff1aSopenharmony_ci        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
1024cabdff1aSopenharmony_ci        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
1025cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1026cabdff1aSopenharmony_ci        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1027cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028cabdff1aSopenharmony_ci        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1029cabdff1aSopenharmony_ci        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1030cabdff1aSopenharmony_ci                    out3_r, tmp0, tmp1, tmp2, tmp3);
1031cabdff1aSopenharmony_ci        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1032cabdff1aSopenharmony_ci        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1033cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1034cabdff1aSopenharmony_ci
1035cabdff1aSopenharmony_ci        src10_r = src54_r;
1036cabdff1aSopenharmony_ci        src21_r = src65_r;
1037cabdff1aSopenharmony_ci        src10_l = src54_l;
1038cabdff1aSopenharmony_ci        src21_l = src65_l;
1039cabdff1aSopenharmony_ci        src2 = src6;
1040cabdff1aSopenharmony_ci    }
1041cabdff1aSopenharmony_ci}
1042cabdff1aSopenharmony_ci
1043cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1044cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
1045cabdff1aSopenharmony_ci                               int height, int mx, int my)
1046cabdff1aSopenharmony_ci{
1047cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1048cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1049cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
1050cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1051cabdff1aSopenharmony_ci    v16u8 mask0, mask1, out;
1052cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1053cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1054cabdff1aSopenharmony_ci
1055cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
1056cabdff1aSopenharmony_ci    src -= (1 + 1 * src_stride);
1057cabdff1aSopenharmony_ci
1058cabdff1aSopenharmony_ci    /* rearranging filter */
1059cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1060cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1061cabdff1aSopenharmony_ci
1062cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1063cabdff1aSopenharmony_ci
1064cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
1065cabdff1aSopenharmony_ci    src += (3 * src_stride);
1066cabdff1aSopenharmony_ci
1067cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
1068cabdff1aSopenharmony_ci    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1069cabdff1aSopenharmony_ci    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
1070cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1071cabdff1aSopenharmony_ci
1072cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1073cabdff1aSopenharmony_ci    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1074cabdff1aSopenharmony_ci
1075cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1076cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
1077cabdff1aSopenharmony_ci        src += (4 * src_stride);
1078cabdff1aSopenharmony_ci
1079cabdff1aSopenharmony_ci        XORI_B2_128_SB(src3, src4);
1080cabdff1aSopenharmony_ci        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1081cabdff1aSopenharmony_ci        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1082cabdff1aSopenharmony_ci        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1083cabdff1aSopenharmony_ci        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1084cabdff1aSopenharmony_ci
1085cabdff1aSopenharmony_ci        XORI_B2_128_SB(src5, src6);
1086cabdff1aSopenharmony_ci        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1087cabdff1aSopenharmony_ci        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1088cabdff1aSopenharmony_ci        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1089cabdff1aSopenharmony_ci        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1090cabdff1aSopenharmony_ci
1091cabdff1aSopenharmony_ci        SRARI_H2_SH(tmp0, tmp1, 7);
1092cabdff1aSopenharmony_ci        SAT_SH2_SH(tmp0, tmp1, 7);
1093cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
1094cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1095cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1096cabdff1aSopenharmony_ci
1097cabdff1aSopenharmony_ci        hz_out1 = hz_out5;
1098cabdff1aSopenharmony_ci        vec0 = vec2;
1099cabdff1aSopenharmony_ci    }
1100cabdff1aSopenharmony_ci}
1101cabdff1aSopenharmony_ci
1102cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1103cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
1104cabdff1aSopenharmony_ci                               int height, int mx, int my)
1105cabdff1aSopenharmony_ci{
1106cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1107cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1108cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
1109cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1110cabdff1aSopenharmony_ci    v16u8 mask0, mask1, out0, out1;
1111cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1112cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1113cabdff1aSopenharmony_ci    v8i16 vec0, vec1, vec2, vec3, vec4;
1114cabdff1aSopenharmony_ci
1115cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[0]);
1116cabdff1aSopenharmony_ci    src -= (1 + 1 * src_stride);
1117cabdff1aSopenharmony_ci
1118cabdff1aSopenharmony_ci    /* rearranging filter */
1119cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1120cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1121cabdff1aSopenharmony_ci
1122cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1123cabdff1aSopenharmony_ci
1124cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
1125cabdff1aSopenharmony_ci    src += (3 * src_stride);
1126cabdff1aSopenharmony_ci
1127cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
1128cabdff1aSopenharmony_ci    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1129cabdff1aSopenharmony_ci    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1130cabdff1aSopenharmony_ci    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1131cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1132cabdff1aSopenharmony_ci
1133cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1134cabdff1aSopenharmony_ci    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1135cabdff1aSopenharmony_ci
1136cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1137cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
1138cabdff1aSopenharmony_ci        src += (4 * src_stride);
1139cabdff1aSopenharmony_ci
1140cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
1141cabdff1aSopenharmony_ci        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1142cabdff1aSopenharmony_ci        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1143cabdff1aSopenharmony_ci        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1146cabdff1aSopenharmony_ci        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1147cabdff1aSopenharmony_ci        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1148cabdff1aSopenharmony_ci
1149cabdff1aSopenharmony_ci        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1150cabdff1aSopenharmony_ci        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1151cabdff1aSopenharmony_ci        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
1152cabdff1aSopenharmony_ci
1153cabdff1aSopenharmony_ci        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1154cabdff1aSopenharmony_ci        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
1155cabdff1aSopenharmony_ci        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1156cabdff1aSopenharmony_ci
1157cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1158cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1159cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1160cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1161cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1162cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1163cabdff1aSopenharmony_ci
1164cabdff1aSopenharmony_ci        vec0 = vec4;
1165cabdff1aSopenharmony_ci        vec2 = vec1;
1166cabdff1aSopenharmony_ci    }
1167cabdff1aSopenharmony_ci}
1168cabdff1aSopenharmony_ci
/*
 * 16-byte-wide h4v4 filter, done as two calls to the 8-wide routine:
 * one at column offset 0 and one at column offset 8 of both src and dst.
 */
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int half;

    for (half = 0; half < 2; half++) {
        ff_put_vp8_epel8_h4v4_msa(dst + 8 * half, dst_stride,
                                  src + 8 * half, src_stride,
                                  height, mx, my);
    }
}
1183cabdff1aSopenharmony_ci
1184cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1185cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
1186cabdff1aSopenharmony_ci                               int height, int mx, int my)
1187cabdff1aSopenharmony_ci{
1188cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1189cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1190cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
1191cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
1192cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2;
1193cabdff1aSopenharmony_ci    v16u8 res0, res1, mask0, mask1, mask2;
1194cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1195cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1196cabdff1aSopenharmony_ci
1197cabdff1aSopenharmony_ci    mask0 = LD_UB(&mc_filt_mask_arr[16]);
1198cabdff1aSopenharmony_ci    src -= (2 + 1 * src_stride);
1199cabdff1aSopenharmony_ci
1200cabdff1aSopenharmony_ci    /* rearranging filter */
1201cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1202cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1203cabdff1aSopenharmony_ci
1204cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1205cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1206cabdff1aSopenharmony_ci
1207cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
1208cabdff1aSopenharmony_ci    src += (3 * src_stride);
1209cabdff1aSopenharmony_ci
1210cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
1211cabdff1aSopenharmony_ci    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
1212cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
1213cabdff1aSopenharmony_ci    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
1214cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
1215cabdff1aSopenharmony_ci    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1216cabdff1aSopenharmony_ci
1217cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1218cabdff1aSopenharmony_ci    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1219cabdff1aSopenharmony_ci
1220cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1221cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
1222cabdff1aSopenharmony_ci        src += (4 * src_stride);
1223cabdff1aSopenharmony_ci
1224cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
1225cabdff1aSopenharmony_ci        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
1226cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
1227cabdff1aSopenharmony_ci        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1228cabdff1aSopenharmony_ci        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1229cabdff1aSopenharmony_ci        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1230cabdff1aSopenharmony_ci
1231cabdff1aSopenharmony_ci        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
1232cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
1233cabdff1aSopenharmony_ci        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1234cabdff1aSopenharmony_ci        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1235cabdff1aSopenharmony_ci        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1236cabdff1aSopenharmony_ci
1237cabdff1aSopenharmony_ci        SRARI_H2_SH(tmp0, tmp1, 7);
1238cabdff1aSopenharmony_ci        SAT_SH2_SH(tmp0, tmp1, 7);
1239cabdff1aSopenharmony_ci        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1240cabdff1aSopenharmony_ci        XORI_B2_128_UB(res0, res1);
1241cabdff1aSopenharmony_ci        ST_W2(res0, 0, 1, dst, dst_stride);
1242cabdff1aSopenharmony_ci        ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1243cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1244cabdff1aSopenharmony_ci
1245cabdff1aSopenharmony_ci        hz_out1 = hz_out5;
1246cabdff1aSopenharmony_ci        vec0 = vec2;
1247cabdff1aSopenharmony_ci    }
1248cabdff1aSopenharmony_ci}
1249cabdff1aSopenharmony_ci
1250cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1251cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
1252cabdff1aSopenharmony_ci                               int height, int mx, int my)
1253cabdff1aSopenharmony_ci{
1254cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1255cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1256cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
1257cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6;
1258cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1259cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1260cabdff1aSopenharmony_ci    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1261cabdff1aSopenharmony_ci    v16u8 out0, out1;
1262cabdff1aSopenharmony_ci
1263cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[0]);
1264cabdff1aSopenharmony_ci    src -= (2 + src_stride);
1265cabdff1aSopenharmony_ci
1266cabdff1aSopenharmony_ci    /* rearranging filter */
1267cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1268cabdff1aSopenharmony_ci    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1269cabdff1aSopenharmony_ci
1270cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1271cabdff1aSopenharmony_ci    mask2 = mask0 + 4;
1272cabdff1aSopenharmony_ci
1273cabdff1aSopenharmony_ci    LD_SB3(src, src_stride, src0, src1, src2);
1274cabdff1aSopenharmony_ci    src += (3 * src_stride);
1275cabdff1aSopenharmony_ci
1276cabdff1aSopenharmony_ci    XORI_B3_128_SB(src0, src1, src2);
1277cabdff1aSopenharmony_ci    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
1278cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
1279cabdff1aSopenharmony_ci    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
1280cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
1281cabdff1aSopenharmony_ci    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
1282cabdff1aSopenharmony_ci                              filt_hz1, filt_hz2);
1283cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1286cabdff1aSopenharmony_ci    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1287cabdff1aSopenharmony_ci
1288cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1289cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src3, src4, src5, src6);
1290cabdff1aSopenharmony_ci        src += (4 * src_stride);
1291cabdff1aSopenharmony_ci
1292cabdff1aSopenharmony_ci        XORI_B4_128_SB(src3, src4, src5, src6);
1293cabdff1aSopenharmony_ci
1294cabdff1aSopenharmony_ci        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
1295cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
1296cabdff1aSopenharmony_ci        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1297cabdff1aSopenharmony_ci        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1298cabdff1aSopenharmony_ci
1299cabdff1aSopenharmony_ci        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
1300cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
1301cabdff1aSopenharmony_ci        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1302cabdff1aSopenharmony_ci        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1303cabdff1aSopenharmony_ci
1304cabdff1aSopenharmony_ci        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
1305cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
1306cabdff1aSopenharmony_ci        vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1307cabdff1aSopenharmony_ci        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
1308cabdff1aSopenharmony_ci
1309cabdff1aSopenharmony_ci        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
1310cabdff1aSopenharmony_ci                                  filt_hz1, filt_hz2);
1311cabdff1aSopenharmony_ci        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
1312cabdff1aSopenharmony_ci        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1313cabdff1aSopenharmony_ci
1314cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1315cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1316cabdff1aSopenharmony_ci        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1317cabdff1aSopenharmony_ci        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1318cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1319cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1320cabdff1aSopenharmony_ci    }
1321cabdff1aSopenharmony_ci}
1322cabdff1aSopenharmony_ci
/*
 * 16-byte-wide h6v4 filter, done as two calls to the 8-wide routine:
 * one at column offset 0 and one at column offset 8 of both src and dst.
 */
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int half;

    for (half = 0; half < 2; half++) {
        ff_put_vp8_epel8_h6v4_msa(dst + 8 * half, dst_stride,
                                  src + 8 * half, src_stride,
                                  height, mx, my);
    }
}
1337cabdff1aSopenharmony_ci
1338cabdff1aSopenharmony_civoid ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1339cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
1340cabdff1aSopenharmony_ci                               int height, int mx, int my)
1341cabdff1aSopenharmony_ci{
1342cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1343cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1344cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
1345cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1346cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, mask0, mask1;
1347cabdff1aSopenharmony_ci    v16u8 out;
1348cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1349cabdff1aSopenharmony_ci    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1350cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
1351cabdff1aSopenharmony_ci
1352cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[16]);
1353cabdff1aSopenharmony_ci
1354cabdff1aSopenharmony_ci    src -= (1 + 2 * src_stride);
1355cabdff1aSopenharmony_ci
1356cabdff1aSopenharmony_ci    /* rearranging filter */
1357cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1358cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1359cabdff1aSopenharmony_ci
1360cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1361cabdff1aSopenharmony_ci
1362cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1363cabdff1aSopenharmony_ci    src += (5 * src_stride);
1364cabdff1aSopenharmony_ci
1365cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1366cabdff1aSopenharmony_ci    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1367cabdff1aSopenharmony_ci    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
1368cabdff1aSopenharmony_ci    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1369cabdff1aSopenharmony_ci    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1370cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1371cabdff1aSopenharmony_ci
1372cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1373cabdff1aSopenharmony_ci    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1374cabdff1aSopenharmony_ci
1375cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1376cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src5, src6, src7, src8);
1377cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
1378cabdff1aSopenharmony_ci        src += (4 * src_stride);
1379cabdff1aSopenharmony_ci
1380cabdff1aSopenharmony_ci        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1381cabdff1aSopenharmony_ci        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1382cabdff1aSopenharmony_ci        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1383cabdff1aSopenharmony_ci        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1384cabdff1aSopenharmony_ci
1385cabdff1aSopenharmony_ci        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1386cabdff1aSopenharmony_ci        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1387cabdff1aSopenharmony_ci        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1388cabdff1aSopenharmony_ci        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1389cabdff1aSopenharmony_ci
1390cabdff1aSopenharmony_ci        SRARI_H2_SH(tmp0, tmp1, 7);
1391cabdff1aSopenharmony_ci        SAT_SH2_SH(tmp0, tmp1, 7);
1392cabdff1aSopenharmony_ci        out = PCKEV_XORI128_UB(tmp0, tmp1);
1393cabdff1aSopenharmony_ci        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1394cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1395cabdff1aSopenharmony_ci
1396cabdff1aSopenharmony_ci        hz_out3 = hz_out7;
1397cabdff1aSopenharmony_ci        out0 = out2;
1398cabdff1aSopenharmony_ci        out1 = out3;
1399cabdff1aSopenharmony_ci    }
1400cabdff1aSopenharmony_ci}
1401cabdff1aSopenharmony_ci
1402cabdff1aSopenharmony_civoid ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1403cabdff1aSopenharmony_ci                               uint8_t *src, ptrdiff_t src_stride,
1404cabdff1aSopenharmony_ci                               int height, int mx, int my)
1405cabdff1aSopenharmony_ci{
1406cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1407cabdff1aSopenharmony_ci    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1408cabdff1aSopenharmony_ci    const int8_t *filter_vert = subpel_filters_msa[my - 1];
1409cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1410cabdff1aSopenharmony_ci    v16i8 filt_hz0, filt_hz1, mask0, mask1;
1411cabdff1aSopenharmony_ci    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1412cabdff1aSopenharmony_ci    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1413cabdff1aSopenharmony_ci    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1414cabdff1aSopenharmony_ci    v16u8 vec0, vec1;
1415cabdff1aSopenharmony_ci
1416cabdff1aSopenharmony_ci    mask0 = LD_SB(&mc_filt_mask_arr[0]);
1417cabdff1aSopenharmony_ci    src -= (1 + 2 * src_stride);
1418cabdff1aSopenharmony_ci
1419cabdff1aSopenharmony_ci    /* rearranging filter */
1420cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
1421cabdff1aSopenharmony_ci    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1422cabdff1aSopenharmony_ci
1423cabdff1aSopenharmony_ci    mask1 = mask0 + 2;
1424cabdff1aSopenharmony_ci
1425cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1426cabdff1aSopenharmony_ci    src += (5 * src_stride);
1427cabdff1aSopenharmony_ci
1428cabdff1aSopenharmony_ci    XORI_B5_128_SB(src0, src1, src2, src3, src4);
1429cabdff1aSopenharmony_ci    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1430cabdff1aSopenharmony_ci    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1431cabdff1aSopenharmony_ci    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1432cabdff1aSopenharmony_ci    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1433cabdff1aSopenharmony_ci    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1434cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1435cabdff1aSopenharmony_ci    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1436cabdff1aSopenharmony_ci
1437cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
1438cabdff1aSopenharmony_ci    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1439cabdff1aSopenharmony_ci
1440cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1441cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src5, src6, src7, src8);
1442cabdff1aSopenharmony_ci        src += (4 * src_stride);
1443cabdff1aSopenharmony_ci
1444cabdff1aSopenharmony_ci        XORI_B4_128_SB(src5, src6, src7, src8);
1445cabdff1aSopenharmony_ci
1446cabdff1aSopenharmony_ci        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1447cabdff1aSopenharmony_ci        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1448cabdff1aSopenharmony_ci        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1449cabdff1aSopenharmony_ci
1450cabdff1aSopenharmony_ci        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1451cabdff1aSopenharmony_ci        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1452cabdff1aSopenharmony_ci        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1453cabdff1aSopenharmony_ci
1454cabdff1aSopenharmony_ci        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1455cabdff1aSopenharmony_ci        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1456cabdff1aSopenharmony_ci        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1457cabdff1aSopenharmony_ci
1458cabdff1aSopenharmony_ci        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1459cabdff1aSopenharmony_ci        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1460cabdff1aSopenharmony_ci        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1461cabdff1aSopenharmony_ci
1462cabdff1aSopenharmony_ci        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1463cabdff1aSopenharmony_ci        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1464cabdff1aSopenharmony_ci        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
1465cabdff1aSopenharmony_ci        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
1466cabdff1aSopenharmony_ci        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
1467cabdff1aSopenharmony_ci        dst += (4 * dst_stride);
1468cabdff1aSopenharmony_ci
1469cabdff1aSopenharmony_ci        hz_out4 = hz_out8;
1470cabdff1aSopenharmony_ci        out0 = out2;
1471cabdff1aSopenharmony_ci        out1 = out6;
1472cabdff1aSopenharmony_ci        out3 = out5;
1473cabdff1aSopenharmony_ci        out4 = out7;
1474cabdff1aSopenharmony_ci    }
1475cabdff1aSopenharmony_ci}
1476cabdff1aSopenharmony_ci
/*
 * 16-pixel-wide h4v6 put routine.
 *
 * The block is handled as two side-by-side 8-pixel-wide columns; each column
 * is forwarded to ff_put_vp8_epel8_h4v6_msa() with the same strides, height
 * and mx/my arguments.
 */
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t col;

    /* left half at byte offset 0, right half at byte offset 8 */
    for (col = 0; col < 16; col += 8) {
        ff_put_vp8_epel8_h4v6_msa(dst + col, dst_stride, src + col, src_stride,
                                  height, mx, my);
    }
}
1491cabdff1aSopenharmony_ci
1492cabdff1aSopenharmony_cistatic void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1493cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1494cabdff1aSopenharmony_ci                                 const int8_t *filter)
1495cabdff1aSopenharmony_ci{
1496cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask;
1497cabdff1aSopenharmony_ci    v16u8 filt0, vec0, vec1, res0, res1;
1498cabdff1aSopenharmony_ci    v8u16 vec2, vec3, filt;
1499cabdff1aSopenharmony_ci
1500cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
1501cabdff1aSopenharmony_ci
1502cabdff1aSopenharmony_ci    /* rearranging filter */
1503cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1504cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1505cabdff1aSopenharmony_ci
1506cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1507cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508cabdff1aSopenharmony_ci    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1509cabdff1aSopenharmony_ci    SRARI_H2_UH(vec2, vec3, 7);
1510cabdff1aSopenharmony_ci    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1511cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
1512cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1513cabdff1aSopenharmony_ci}
1514cabdff1aSopenharmony_ci
1515cabdff1aSopenharmony_cistatic void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1516cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1517cabdff1aSopenharmony_ci                                 const int8_t *filter)
1518cabdff1aSopenharmony_ci{
1519cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, filt0;
1520cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1521cabdff1aSopenharmony_ci    v16i8 res0, res1, res2, res3;
1522cabdff1aSopenharmony_ci    v8u16 vec4, vec5, vec6, vec7, filt;
1523cabdff1aSopenharmony_ci
1524cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
1525cabdff1aSopenharmony_ci
1526cabdff1aSopenharmony_ci    /* rearranging filter */
1527cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1528cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1529cabdff1aSopenharmony_ci
1530cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534cabdff1aSopenharmony_ci                vec4, vec5, vec6, vec7);
1535cabdff1aSopenharmony_ci    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1536cabdff1aSopenharmony_ci    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537cabdff1aSopenharmony_ci                res0, res1, res2, res3);
1538cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
1539cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540cabdff1aSopenharmony_ci    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541cabdff1aSopenharmony_ci    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1542cabdff1aSopenharmony_ci}
1543cabdff1aSopenharmony_ci
1544cabdff1aSopenharmony_civoid ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1545cabdff1aSopenharmony_ci                                uint8_t *src, ptrdiff_t src_stride,
1546cabdff1aSopenharmony_ci                                int height, int mx, int my)
1547cabdff1aSopenharmony_ci{
1548cabdff1aSopenharmony_ci    const int8_t *filter = bilinear_filters_msa[mx - 1];
1549cabdff1aSopenharmony_ci
1550cabdff1aSopenharmony_ci    if (4 == height) {
1551cabdff1aSopenharmony_ci        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1552cabdff1aSopenharmony_ci    } else if (8 == height) {
1553cabdff1aSopenharmony_ci        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1554cabdff1aSopenharmony_ci    }
1555cabdff1aSopenharmony_ci}
1556cabdff1aSopenharmony_ci
1557cabdff1aSopenharmony_cistatic void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1558cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1559cabdff1aSopenharmony_ci                                 const int8_t *filter)
1560cabdff1aSopenharmony_ci{
1561cabdff1aSopenharmony_ci    v16u8 filt0;
1562cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask;
1563cabdff1aSopenharmony_ci    v8u16 vec0, vec1, vec2, vec3, filt;
1564cabdff1aSopenharmony_ci
1565cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
1566cabdff1aSopenharmony_ci
1567cabdff1aSopenharmony_ci    /* rearranging filter */
1568cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1569cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1570cabdff1aSopenharmony_ci
1571cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1572cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1574cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575cabdff1aSopenharmony_ci                vec0, vec1, vec2, vec3);
1576cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1577cabdff1aSopenharmony_ci    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1578cabdff1aSopenharmony_ci    ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1579cabdff1aSopenharmony_ci}
1580cabdff1aSopenharmony_ci
1581cabdff1aSopenharmony_cistatic void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1582cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
1583cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
1584cabdff1aSopenharmony_ci{
1585cabdff1aSopenharmony_ci    v16u8 filt0;
1586cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, mask, out0, out1;
1587cabdff1aSopenharmony_ci    v8u16 vec0, vec1, vec2, vec3, filt;
1588cabdff1aSopenharmony_ci
1589cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
1590cabdff1aSopenharmony_ci
1591cabdff1aSopenharmony_ci    /* rearranging filter */
1592cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1593cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1594cabdff1aSopenharmony_ci
1595cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1596cabdff1aSopenharmony_ci    src += (4 * src_stride);
1597cabdff1aSopenharmony_ci
1598cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601cabdff1aSopenharmony_ci                vec0, vec1, vec2, vec3);
1602cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1603cabdff1aSopenharmony_ci
1604cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src1, src2, src3);
1605cabdff1aSopenharmony_ci    src += (4 * src_stride);
1606cabdff1aSopenharmony_ci
1607cabdff1aSopenharmony_ci    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1608cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1609cabdff1aSopenharmony_ci
1610cabdff1aSopenharmony_ci    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611cabdff1aSopenharmony_ci    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613cabdff1aSopenharmony_ci                vec0, vec1, vec2, vec3);
1614cabdff1aSopenharmony_ci    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1615cabdff1aSopenharmony_ci    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1616cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617cabdff1aSopenharmony_ci    dst += (8 * dst_stride);
1618cabdff1aSopenharmony_ci
1619cabdff1aSopenharmony_ci    if (16 == height) {
1620cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
1621cabdff1aSopenharmony_ci        src += (4 * src_stride);
1622cabdff1aSopenharmony_ci
1623cabdff1aSopenharmony_ci        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624cabdff1aSopenharmony_ci        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626cabdff1aSopenharmony_ci                    vec0, vec1, vec2, vec3);
1627cabdff1aSopenharmony_ci        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1628cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src1, src2, src3);
1629cabdff1aSopenharmony_ci        src += (4 * src_stride);
1630cabdff1aSopenharmony_ci
1631cabdff1aSopenharmony_ci        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1632cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1633cabdff1aSopenharmony_ci
1634cabdff1aSopenharmony_ci        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635cabdff1aSopenharmony_ci        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637cabdff1aSopenharmony_ci                    vec0, vec1, vec2, vec3);
1638cabdff1aSopenharmony_ci        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1639cabdff1aSopenharmony_ci        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1640cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1641cabdff1aSopenharmony_ci    }
1642cabdff1aSopenharmony_ci}
1643cabdff1aSopenharmony_ci
1644cabdff1aSopenharmony_civoid ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1645cabdff1aSopenharmony_ci                                uint8_t *src, ptrdiff_t src_stride,
1646cabdff1aSopenharmony_ci                                int height, int mx, int my)
1647cabdff1aSopenharmony_ci{
1648cabdff1aSopenharmony_ci    const int8_t *filter = bilinear_filters_msa[mx - 1];
1649cabdff1aSopenharmony_ci
1650cabdff1aSopenharmony_ci    if (4 == height) {
1651cabdff1aSopenharmony_ci        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1652cabdff1aSopenharmony_ci    } else {
1653cabdff1aSopenharmony_ci        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1654cabdff1aSopenharmony_ci                                 height);
1655cabdff1aSopenharmony_ci    }
1656cabdff1aSopenharmony_ci}
1657cabdff1aSopenharmony_ci
1658cabdff1aSopenharmony_civoid ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1659cabdff1aSopenharmony_ci                                 uint8_t *src, ptrdiff_t src_stride,
1660cabdff1aSopenharmony_ci                                 int height, int mx, int my)
1661cabdff1aSopenharmony_ci{
1662cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1663cabdff1aSopenharmony_ci    const int8_t *filter = bilinear_filters_msa[mx - 1];
1664cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1665cabdff1aSopenharmony_ci    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666cabdff1aSopenharmony_ci    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1667cabdff1aSopenharmony_ci
1668cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
1669cabdff1aSopenharmony_ci
1670cabdff1aSopenharmony_ci    loop_cnt = (height >> 2) - 1;
1671cabdff1aSopenharmony_ci
1672cabdff1aSopenharmony_ci    /* rearranging filter */
1673cabdff1aSopenharmony_ci    filt = LD_UH(filter);
1674cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1675cabdff1aSopenharmony_ci
1676cabdff1aSopenharmony_ci    LD_SB4(src, src_stride, src0, src2, src4, src6);
1677cabdff1aSopenharmony_ci    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678cabdff1aSopenharmony_ci    src += (4 * src_stride);
1679cabdff1aSopenharmony_ci
1680cabdff1aSopenharmony_ci    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681cabdff1aSopenharmony_ci    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682cabdff1aSopenharmony_ci    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683cabdff1aSopenharmony_ci    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685cabdff1aSopenharmony_ci                out0, out1, out2, out3);
1686cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687cabdff1aSopenharmony_ci                out4, out5, out6, out7);
1688cabdff1aSopenharmony_ci    SRARI_H4_UH(out0, out1, out2, out3, 7);
1689cabdff1aSopenharmony_ci    SRARI_H4_UH(out4, out5, out6, out7, 7);
1690cabdff1aSopenharmony_ci    PCKEV_ST_SB(out0, out1, dst);
1691cabdff1aSopenharmony_ci    dst += dst_stride;
1692cabdff1aSopenharmony_ci    PCKEV_ST_SB(out2, out3, dst);
1693cabdff1aSopenharmony_ci    dst += dst_stride;
1694cabdff1aSopenharmony_ci    PCKEV_ST_SB(out4, out5, dst);
1695cabdff1aSopenharmony_ci    dst += dst_stride;
1696cabdff1aSopenharmony_ci    PCKEV_ST_SB(out6, out7, dst);
1697cabdff1aSopenharmony_ci    dst += dst_stride;
1698cabdff1aSopenharmony_ci
1699cabdff1aSopenharmony_ci    for (; loop_cnt--;) {
1700cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
1701cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702cabdff1aSopenharmony_ci        src += (4 * src_stride);
1703cabdff1aSopenharmony_ci
1704cabdff1aSopenharmony_ci        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705cabdff1aSopenharmony_ci        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706cabdff1aSopenharmony_ci        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707cabdff1aSopenharmony_ci        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709cabdff1aSopenharmony_ci                    out0, out1, out2, out3);
1710cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711cabdff1aSopenharmony_ci                    out4, out5, out6, out7);
1712cabdff1aSopenharmony_ci        SRARI_H4_UH(out0, out1, out2, out3, 7);
1713cabdff1aSopenharmony_ci        SRARI_H4_UH(out4, out5, out6, out7, 7);
1714cabdff1aSopenharmony_ci        PCKEV_ST_SB(out0, out1, dst);
1715cabdff1aSopenharmony_ci        dst += dst_stride;
1716cabdff1aSopenharmony_ci        PCKEV_ST_SB(out2, out3, dst);
1717cabdff1aSopenharmony_ci        dst += dst_stride;
1718cabdff1aSopenharmony_ci        PCKEV_ST_SB(out4, out5, dst);
1719cabdff1aSopenharmony_ci        dst += dst_stride;
1720cabdff1aSopenharmony_ci        PCKEV_ST_SB(out6, out7, dst);
1721cabdff1aSopenharmony_ci        dst += dst_stride;
1722cabdff1aSopenharmony_ci    }
1723cabdff1aSopenharmony_ci}
1724cabdff1aSopenharmony_ci
1725cabdff1aSopenharmony_cistatic void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1726cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1727cabdff1aSopenharmony_ci                                 const int8_t *filter)
1728cabdff1aSopenharmony_ci{
1729cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4;
1730cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1731cabdff1aSopenharmony_ci    v16u8 filt0;
1732cabdff1aSopenharmony_ci    v8i16 filt;
1733cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1;
1734cabdff1aSopenharmony_ci
1735cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1736cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
1737cabdff1aSopenharmony_ci
1738cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739cabdff1aSopenharmony_ci    src += (5 * src_stride);
1740cabdff1aSopenharmony_ci
1741cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742cabdff1aSopenharmony_ci               src10_r, src21_r, src32_r, src43_r);
1743cabdff1aSopenharmony_ci    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744cabdff1aSopenharmony_ci    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1745cabdff1aSopenharmony_ci    SRARI_H2_UH(tmp0, tmp1, 7);
1746cabdff1aSopenharmony_ci    SAT_UH2_UH(tmp0, tmp1, 7);
1747cabdff1aSopenharmony_ci    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748cabdff1aSopenharmony_ci    ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1749cabdff1aSopenharmony_ci}
1750cabdff1aSopenharmony_ci
1751cabdff1aSopenharmony_cistatic void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1752cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1753cabdff1aSopenharmony_ci                                 const int8_t *filter)
1754cabdff1aSopenharmony_ci{
1755cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1756cabdff1aSopenharmony_ci    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757cabdff1aSopenharmony_ci    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
1759cabdff1aSopenharmony_ci    v16u8 filt0;
1760cabdff1aSopenharmony_ci    v8i16 filt;
1761cabdff1aSopenharmony_ci
1762cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1763cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
1764cabdff1aSopenharmony_ci
1765cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766cabdff1aSopenharmony_ci    src += (8 * src_stride);
1767cabdff1aSopenharmony_ci
1768cabdff1aSopenharmony_ci    src8 = LD_SB(src);
1769cabdff1aSopenharmony_ci    src += src_stride;
1770cabdff1aSopenharmony_ci
1771cabdff1aSopenharmony_ci    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1772cabdff1aSopenharmony_ci               src32_r, src43_r);
1773cabdff1aSopenharmony_ci    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1774cabdff1aSopenharmony_ci               src76_r, src87_r);
1775cabdff1aSopenharmony_ci    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776cabdff1aSopenharmony_ci               src87_r, src76_r, src2110, src4332, src6554, src8776);
1777cabdff1aSopenharmony_ci    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
1779cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1780cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1781cabdff1aSopenharmony_ci    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782cabdff1aSopenharmony_ci    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1783cabdff1aSopenharmony_ci}
1784cabdff1aSopenharmony_ci
1785cabdff1aSopenharmony_civoid ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1786cabdff1aSopenharmony_ci                                uint8_t *src, ptrdiff_t src_stride,
1787cabdff1aSopenharmony_ci                                int height, int mx, int my)
1788cabdff1aSopenharmony_ci{
1789cabdff1aSopenharmony_ci    const int8_t *filter = bilinear_filters_msa[my - 1];
1790cabdff1aSopenharmony_ci
1791cabdff1aSopenharmony_ci    if (4 == height) {
1792cabdff1aSopenharmony_ci        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1793cabdff1aSopenharmony_ci    } else if (8 == height) {
1794cabdff1aSopenharmony_ci        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1795cabdff1aSopenharmony_ci    }
1796cabdff1aSopenharmony_ci}
1797cabdff1aSopenharmony_ci
1798cabdff1aSopenharmony_cistatic void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1799cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
1800cabdff1aSopenharmony_ci                                 const int8_t *filter)
1801cabdff1aSopenharmony_ci{
1802cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1803cabdff1aSopenharmony_ci    v16i8 out0, out1;
1804cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
1805cabdff1aSopenharmony_ci    v8i16 filt;
1806cabdff1aSopenharmony_ci
1807cabdff1aSopenharmony_ci    /* rearranging filter_y */
1808cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1809cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
1810cabdff1aSopenharmony_ci
1811cabdff1aSopenharmony_ci    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812cabdff1aSopenharmony_ci    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813cabdff1aSopenharmony_ci    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815cabdff1aSopenharmony_ci                tmp0, tmp1, tmp2, tmp3);
1816cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1817cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1818cabdff1aSopenharmony_ci    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1819cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1820cabdff1aSopenharmony_ci}
1821cabdff1aSopenharmony_ci
1822cabdff1aSopenharmony_cistatic void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1823cabdff1aSopenharmony_ci                                     uint8_t *dst, int32_t dst_stride,
1824cabdff1aSopenharmony_ci                                     const int8_t *filter, int32_t height)
1825cabdff1aSopenharmony_ci{
1826cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1827cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1828cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1829cabdff1aSopenharmony_ci    v16i8 out0, out1;
1830cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
1831cabdff1aSopenharmony_ci    v8i16 filt;
1832cabdff1aSopenharmony_ci
1833cabdff1aSopenharmony_ci    /* rearranging filter_y */
1834cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1835cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
1836cabdff1aSopenharmony_ci
1837cabdff1aSopenharmony_ci    src0 = LD_UB(src);
1838cabdff1aSopenharmony_ci    src += src_stride;
1839cabdff1aSopenharmony_ci
1840cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
1841cabdff1aSopenharmony_ci        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842cabdff1aSopenharmony_ci        src += (8 * src_stride);
1843cabdff1aSopenharmony_ci
1844cabdff1aSopenharmony_ci        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845cabdff1aSopenharmony_ci                   vec0, vec1, vec2, vec3);
1846cabdff1aSopenharmony_ci        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847cabdff1aSopenharmony_ci                   vec4, vec5, vec6, vec7);
1848cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
1850cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1851cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1852cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1853cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1854cabdff1aSopenharmony_ci
1855cabdff1aSopenharmony_ci        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856cabdff1aSopenharmony_ci                    tmp0, tmp1, tmp2, tmp3);
1857cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1858cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1859cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1860cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
1862cabdff1aSopenharmony_ci
1863cabdff1aSopenharmony_ci        src0 = src8;
1864cabdff1aSopenharmony_ci    }
1865cabdff1aSopenharmony_ci}
1866cabdff1aSopenharmony_ci
1867cabdff1aSopenharmony_civoid ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1868cabdff1aSopenharmony_ci                                uint8_t *src, ptrdiff_t src_stride,
1869cabdff1aSopenharmony_ci                                int height, int mx, int my)
1870cabdff1aSopenharmony_ci{
1871cabdff1aSopenharmony_ci    const int8_t *filter = bilinear_filters_msa[my - 1];
1872cabdff1aSopenharmony_ci
1873cabdff1aSopenharmony_ci    if (4 == height) {
1874cabdff1aSopenharmony_ci        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1875cabdff1aSopenharmony_ci    } else {
1876cabdff1aSopenharmony_ci        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1877cabdff1aSopenharmony_ci                                 height);
1878cabdff1aSopenharmony_ci    }
1879cabdff1aSopenharmony_ci}
1880cabdff1aSopenharmony_ci
1881cabdff1aSopenharmony_civoid ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1882cabdff1aSopenharmony_ci                                 uint8_t *src, ptrdiff_t src_stride,
1883cabdff1aSopenharmony_ci                                 int height, int mx, int my)
1884cabdff1aSopenharmony_ci{
1885cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1886cabdff1aSopenharmony_ci    const int8_t *filter = bilinear_filters_msa[my - 1];
1887cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4;
1888cabdff1aSopenharmony_ci    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889cabdff1aSopenharmony_ci    v8u16 tmp0, tmp1, tmp2, tmp3;
1890cabdff1aSopenharmony_ci    v8i16 filt;
1891cabdff1aSopenharmony_ci
1892cabdff1aSopenharmony_ci    /* rearranging filter_y */
1893cabdff1aSopenharmony_ci    filt = LD_SH(filter);
1894cabdff1aSopenharmony_ci    filt0 = (v16u8) __msa_splati_h(filt, 0);
1895cabdff1aSopenharmony_ci
1896cabdff1aSopenharmony_ci    src0 = LD_UB(src);
1897cabdff1aSopenharmony_ci    src += src_stride;
1898cabdff1aSopenharmony_ci
1899cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1900cabdff1aSopenharmony_ci        LD_UB4(src, src_stride, src1, src2, src3, src4);
1901cabdff1aSopenharmony_ci        src += (4 * src_stride);
1902cabdff1aSopenharmony_ci
1903cabdff1aSopenharmony_ci        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904cabdff1aSopenharmony_ci        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1906cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
1907cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
1908cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst);
1909cabdff1aSopenharmony_ci        dst += dst_stride;
1910cabdff1aSopenharmony_ci
1911cabdff1aSopenharmony_ci        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912cabdff1aSopenharmony_ci        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1914cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
1915cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
1916cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst);
1917cabdff1aSopenharmony_ci        dst += dst_stride;
1918cabdff1aSopenharmony_ci
1919cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1920cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp0, tmp1, 7);
1921cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp0, tmp1, 7);
1922cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp0, tmp1, dst);
1923cabdff1aSopenharmony_ci        dst += dst_stride;
1924cabdff1aSopenharmony_ci
1925cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1926cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp2, tmp3, 7);
1927cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp2, tmp3, 7);
1928cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp2, tmp3, dst);
1929cabdff1aSopenharmony_ci        dst += dst_stride;
1930cabdff1aSopenharmony_ci
1931cabdff1aSopenharmony_ci        src0 = src4;
1932cabdff1aSopenharmony_ci    }
1933cabdff1aSopenharmony_ci}
1934cabdff1aSopenharmony_ci
1935cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
1936cabdff1aSopenharmony_ci                                      uint8_t *dst, int32_t dst_stride,
1937cabdff1aSopenharmony_ci                                      const int8_t *filter_horiz,
1938cabdff1aSopenharmony_ci                                      const int8_t *filter_vert)
1939cabdff1aSopenharmony_ci{
1940cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask;
1941cabdff1aSopenharmony_ci    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
1943cabdff1aSopenharmony_ci
1944cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
1945cabdff1aSopenharmony_ci
1946cabdff1aSopenharmony_ci    /* rearranging filter */
1947cabdff1aSopenharmony_ci    filt = LD_UH(filter_horiz);
1948cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1949cabdff1aSopenharmony_ci
1950cabdff1aSopenharmony_ci    filt = LD_UH(filter_vert);
1951cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1952cabdff1aSopenharmony_ci
1953cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1954cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1955cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1956cabdff1aSopenharmony_ci    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
1957cabdff1aSopenharmony_ci    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958cabdff1aSopenharmony_ci    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1959cabdff1aSopenharmony_ci
1960cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961cabdff1aSopenharmony_ci    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1962cabdff1aSopenharmony_ci    SRARI_H2_UH(tmp0, tmp1, 7);
1963cabdff1aSopenharmony_ci    SAT_UH2_UH(tmp0, tmp1, 7);
1964cabdff1aSopenharmony_ci    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1965cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
1966cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1967cabdff1aSopenharmony_ci}
1968cabdff1aSopenharmony_ci
1969cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
1970cabdff1aSopenharmony_ci                                      uint8_t *dst, int32_t dst_stride,
1971cabdff1aSopenharmony_ci                                      const int8_t *filter_horiz,
1972cabdff1aSopenharmony_ci                                      const int8_t *filter_vert)
1973cabdff1aSopenharmony_ci{
1974cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
1975cabdff1aSopenharmony_ci    v16i8 res0, res1, res2, res3;
1976cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978cabdff1aSopenharmony_ci    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
1979cabdff1aSopenharmony_ci
1980cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[16]);
1981cabdff1aSopenharmony_ci
1982cabdff1aSopenharmony_ci    /* rearranging filter */
1983cabdff1aSopenharmony_ci    filt = LD_UH(filter_horiz);
1984cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1985cabdff1aSopenharmony_ci
1986cabdff1aSopenharmony_ci    filt = LD_UH(filter_vert);
1987cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1988cabdff1aSopenharmony_ci
1989cabdff1aSopenharmony_ci    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990cabdff1aSopenharmony_ci    src += (8 * src_stride);
1991cabdff1aSopenharmony_ci    src8 = LD_SB(src);
1992cabdff1aSopenharmony_ci
1993cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1994cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1995cabdff1aSopenharmony_ci    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
1996cabdff1aSopenharmony_ci    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
1997cabdff1aSopenharmony_ci    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
1998cabdff1aSopenharmony_ci    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
1999cabdff1aSopenharmony_ci               hz_out3, hz_out5);
2000cabdff1aSopenharmony_ci    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2001cabdff1aSopenharmony_ci
2002cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003cabdff1aSopenharmony_ci    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004cabdff1aSopenharmony_ci    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005cabdff1aSopenharmony_ci                vec4, vec5, vec6, vec7);
2006cabdff1aSopenharmony_ci    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2007cabdff1aSopenharmony_ci    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2008cabdff1aSopenharmony_ci    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009cabdff1aSopenharmony_ci                res0, res1, res2, res3);
2010cabdff1aSopenharmony_ci    ST_W2(res0, 0, 1, dst, dst_stride);
2011cabdff1aSopenharmony_ci    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012cabdff1aSopenharmony_ci    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013cabdff1aSopenharmony_ci    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2014cabdff1aSopenharmony_ci}
2015cabdff1aSopenharmony_ci
2016cabdff1aSopenharmony_civoid ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2017cabdff1aSopenharmony_ci                                 uint8_t *src, ptrdiff_t src_stride,
2018cabdff1aSopenharmony_ci                                 int height, int mx, int my)
2019cabdff1aSopenharmony_ci{
2020cabdff1aSopenharmony_ci    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2021cabdff1aSopenharmony_ci    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2022cabdff1aSopenharmony_ci
2023cabdff1aSopenharmony_ci    if (4 == height) {
2024cabdff1aSopenharmony_ci        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2025cabdff1aSopenharmony_ci                                  filter_horiz, filter_vert);
2026cabdff1aSopenharmony_ci    } else if (8 == height) {
2027cabdff1aSopenharmony_ci        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2028cabdff1aSopenharmony_ci                                  filter_horiz, filter_vert);
2029cabdff1aSopenharmony_ci    }
2030cabdff1aSopenharmony_ci}
2031cabdff1aSopenharmony_ci
2032cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
2033cabdff1aSopenharmony_ci                                      uint8_t *dst, int32_t dst_stride,
2034cabdff1aSopenharmony_ci                                      const int8_t *filter_horiz,
2035cabdff1aSopenharmony_ci                                      const int8_t *filter_vert)
2036cabdff1aSopenharmony_ci{
2037cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2038cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2040cabdff1aSopenharmony_ci    v8i16 filt;
2041cabdff1aSopenharmony_ci
2042cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2043cabdff1aSopenharmony_ci
2044cabdff1aSopenharmony_ci    /* rearranging filter */
2045cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
2046cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2047cabdff1aSopenharmony_ci
2048cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
2049cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2050cabdff1aSopenharmony_ci
2051cabdff1aSopenharmony_ci    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2052cabdff1aSopenharmony_ci
2053cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2054cabdff1aSopenharmony_ci    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2055cabdff1aSopenharmony_ci    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056cabdff1aSopenharmony_ci    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2057cabdff1aSopenharmony_ci
2058cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2059cabdff1aSopenharmony_ci    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060cabdff1aSopenharmony_ci    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2061cabdff1aSopenharmony_ci
2062cabdff1aSopenharmony_ci    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2063cabdff1aSopenharmony_ci    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064cabdff1aSopenharmony_ci    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2065cabdff1aSopenharmony_ci
2066cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2067cabdff1aSopenharmony_ci    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068cabdff1aSopenharmony_ci    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2069cabdff1aSopenharmony_ci
2070cabdff1aSopenharmony_ci    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2071cabdff1aSopenharmony_ci    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2072cabdff1aSopenharmony_ci    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2073cabdff1aSopenharmony_ci    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2074cabdff1aSopenharmony_ci}
2075cabdff1aSopenharmony_ci
2076cabdff1aSopenharmony_cistatic void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
2077cabdff1aSopenharmony_ci                                          uint8_t *dst, int32_t dst_stride,
2078cabdff1aSopenharmony_ci                                          const int8_t *filter_horiz,
2079cabdff1aSopenharmony_ci                                          const int8_t *filter_vert,
2080cabdff1aSopenharmony_ci                                          int32_t height)
2081cabdff1aSopenharmony_ci{
2082cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2083cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2084cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0;
2085cabdff1aSopenharmony_ci    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2086cabdff1aSopenharmony_ci    v8i16 filt;
2087cabdff1aSopenharmony_ci
2088cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2089cabdff1aSopenharmony_ci
2090cabdff1aSopenharmony_ci    /* rearranging filter */
2091cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
2092cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2093cabdff1aSopenharmony_ci
2094cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
2095cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2096cabdff1aSopenharmony_ci
2097cabdff1aSopenharmony_ci    src0 = LD_SB(src);
2098cabdff1aSopenharmony_ci    src += src_stride;
2099cabdff1aSopenharmony_ci
2100cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2101cabdff1aSopenharmony_ci
2102cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
2103cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src1, src2, src3, src4);
2104cabdff1aSopenharmony_ci        src += (4 * src_stride);
2105cabdff1aSopenharmony_ci
2106cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2107cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108cabdff1aSopenharmony_ci        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2109cabdff1aSopenharmony_ci
2110cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2111cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112cabdff1aSopenharmony_ci        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2113cabdff1aSopenharmony_ci
2114cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2115cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2116cabdff1aSopenharmony_ci
2117cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2118cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119cabdff1aSopenharmony_ci        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2120cabdff1aSopenharmony_ci
2121cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2122cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src1, src2, src3, src4);
2123cabdff1aSopenharmony_ci        src += (4 * src_stride);
2124cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125cabdff1aSopenharmony_ci        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2126cabdff1aSopenharmony_ci
2127cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp3, tmp4, 7);
2128cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp3, tmp4, 7);
2129cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2130cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2131cabdff1aSopenharmony_ci
2132cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2133cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134cabdff1aSopenharmony_ci        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2135cabdff1aSopenharmony_ci
2136cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2137cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138cabdff1aSopenharmony_ci        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2139cabdff1aSopenharmony_ci
2140cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2141cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142cabdff1aSopenharmony_ci        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2143cabdff1aSopenharmony_ci
2144cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2145cabdff1aSopenharmony_ci        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146cabdff1aSopenharmony_ci        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2147cabdff1aSopenharmony_ci
2148cabdff1aSopenharmony_ci        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2149cabdff1aSopenharmony_ci        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2150cabdff1aSopenharmony_ci        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2151cabdff1aSopenharmony_ci        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152cabdff1aSopenharmony_ci        dst += (8 * dst_stride);
2153cabdff1aSopenharmony_ci    }
2154cabdff1aSopenharmony_ci}
2155cabdff1aSopenharmony_ci
2156cabdff1aSopenharmony_civoid ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2157cabdff1aSopenharmony_ci                                 uint8_t *src, ptrdiff_t src_stride,
2158cabdff1aSopenharmony_ci                                 int height, int mx, int my)
2159cabdff1aSopenharmony_ci{
2160cabdff1aSopenharmony_ci    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2161cabdff1aSopenharmony_ci    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2162cabdff1aSopenharmony_ci
2163cabdff1aSopenharmony_ci    if (4 == height) {
2164cabdff1aSopenharmony_ci        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2165cabdff1aSopenharmony_ci                                  filter_horiz, filter_vert);
2166cabdff1aSopenharmony_ci    } else {
2167cabdff1aSopenharmony_ci        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2168cabdff1aSopenharmony_ci                                      filter_horiz, filter_vert, height);
2169cabdff1aSopenharmony_ci    }
2170cabdff1aSopenharmony_ci}
2171cabdff1aSopenharmony_ci
2172cabdff1aSopenharmony_civoid ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2173cabdff1aSopenharmony_ci                                  uint8_t *src, ptrdiff_t src_stride,
2174cabdff1aSopenharmony_ci                                  int height, int mx, int my)
2175cabdff1aSopenharmony_ci{
2176cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2177cabdff1aSopenharmony_ci    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2178cabdff1aSopenharmony_ci    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2179cabdff1aSopenharmony_ci    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2180cabdff1aSopenharmony_ci    v16u8 filt_hz, filt_vt, vec0, vec1;
2181cabdff1aSopenharmony_ci    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2182cabdff1aSopenharmony_ci    v8i16 filt;
2183cabdff1aSopenharmony_ci
2184cabdff1aSopenharmony_ci    mask = LD_SB(&mc_filt_mask_arr[0]);
2185cabdff1aSopenharmony_ci
2186cabdff1aSopenharmony_ci    /* rearranging filter */
2187cabdff1aSopenharmony_ci    filt = LD_SH(filter_horiz);
2188cabdff1aSopenharmony_ci    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2189cabdff1aSopenharmony_ci
2190cabdff1aSopenharmony_ci    filt = LD_SH(filter_vert);
2191cabdff1aSopenharmony_ci    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2192cabdff1aSopenharmony_ci
2193cabdff1aSopenharmony_ci    LD_SB2(src, 8, src0, src1);
2194cabdff1aSopenharmony_ci    src += src_stride;
2195cabdff1aSopenharmony_ci
2196cabdff1aSopenharmony_ci    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2197cabdff1aSopenharmony_ci    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2198cabdff1aSopenharmony_ci
2199cabdff1aSopenharmony_ci
2200cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2201cabdff1aSopenharmony_ci        LD_SB4(src, src_stride, src0, src2, src4, src6);
2202cabdff1aSopenharmony_ci        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203cabdff1aSopenharmony_ci        src += (4 * src_stride);
2204cabdff1aSopenharmony_ci
2205cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2206cabdff1aSopenharmony_ci        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2207cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2209cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2210cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2211cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2212cabdff1aSopenharmony_ci        dst += dst_stride;
2213cabdff1aSopenharmony_ci
2214cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2215cabdff1aSopenharmony_ci        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2216cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2218cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2219cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2220cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2221cabdff1aSopenharmony_ci        dst += dst_stride;
2222cabdff1aSopenharmony_ci
2223cabdff1aSopenharmony_ci        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2224cabdff1aSopenharmony_ci        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2225cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2227cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2228cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2229cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2230cabdff1aSopenharmony_ci        dst += dst_stride;
2231cabdff1aSopenharmony_ci
2232cabdff1aSopenharmony_ci        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2233cabdff1aSopenharmony_ci        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2234cabdff1aSopenharmony_ci        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235cabdff1aSopenharmony_ci        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2236cabdff1aSopenharmony_ci        SRARI_H2_UH(tmp1, tmp2, 7);
2237cabdff1aSopenharmony_ci        SAT_UH2_UH(tmp1, tmp2, 7);
2238cabdff1aSopenharmony_ci        PCKEV_ST_SB(tmp1, tmp2, dst);
2239cabdff1aSopenharmony_ci        dst += dst_stride;
2240cabdff1aSopenharmony_ci    }
2241cabdff1aSopenharmony_ci}
2242cabdff1aSopenharmony_ci
2243cabdff1aSopenharmony_civoid ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
2244cabdff1aSopenharmony_ci                            uint8_t *src, ptrdiff_t src_stride,
2245cabdff1aSopenharmony_ci                            int height, int mx, int my)
2246cabdff1aSopenharmony_ci{
2247cabdff1aSopenharmony_ci    int32_t cnt;
2248cabdff1aSopenharmony_ci    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2250cabdff1aSopenharmony_ci
2251cabdff1aSopenharmony_ci    if (0 == height % 8) {
2252cabdff1aSopenharmony_ci        for (cnt = height >> 3; cnt--;) {
2253cabdff1aSopenharmony_ci            LD_UB8(src, src_stride,
2254cabdff1aSopenharmony_ci                   src0, src1, src2, src3, src4, src5, src6, src7);
2255cabdff1aSopenharmony_ci            src += (8 * src_stride);
2256cabdff1aSopenharmony_ci
2257cabdff1aSopenharmony_ci            out0 = __msa_copy_u_d((v2i64) src0, 0);
2258cabdff1aSopenharmony_ci            out1 = __msa_copy_u_d((v2i64) src1, 0);
2259cabdff1aSopenharmony_ci            out2 = __msa_copy_u_d((v2i64) src2, 0);
2260cabdff1aSopenharmony_ci            out3 = __msa_copy_u_d((v2i64) src3, 0);
2261cabdff1aSopenharmony_ci            out4 = __msa_copy_u_d((v2i64) src4, 0);
2262cabdff1aSopenharmony_ci            out5 = __msa_copy_u_d((v2i64) src5, 0);
2263cabdff1aSopenharmony_ci            out6 = __msa_copy_u_d((v2i64) src6, 0);
2264cabdff1aSopenharmony_ci            out7 = __msa_copy_u_d((v2i64) src7, 0);
2265cabdff1aSopenharmony_ci
2266cabdff1aSopenharmony_ci            SD4(out0, out1, out2, out3, dst, dst_stride);
2267cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
2268cabdff1aSopenharmony_ci            SD4(out4, out5, out6, out7, dst, dst_stride);
2269cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
2270cabdff1aSopenharmony_ci        }
2271cabdff1aSopenharmony_ci    } else if (0 == height % 4) {
2272cabdff1aSopenharmony_ci        for (cnt = (height / 4); cnt--;) {
2273cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
2274cabdff1aSopenharmony_ci            src += (4 * src_stride);
2275cabdff1aSopenharmony_ci            out0 = __msa_copy_u_d((v2i64) src0, 0);
2276cabdff1aSopenharmony_ci            out1 = __msa_copy_u_d((v2i64) src1, 0);
2277cabdff1aSopenharmony_ci            out2 = __msa_copy_u_d((v2i64) src2, 0);
2278cabdff1aSopenharmony_ci            out3 = __msa_copy_u_d((v2i64) src3, 0);
2279cabdff1aSopenharmony_ci
2280cabdff1aSopenharmony_ci            SD4(out0, out1, out2, out3, dst, dst_stride);
2281cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
2282cabdff1aSopenharmony_ci        }
2283cabdff1aSopenharmony_ci    }
2284cabdff1aSopenharmony_ci}
2285cabdff1aSopenharmony_ci
2286cabdff1aSopenharmony_cistatic void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
2287cabdff1aSopenharmony_ci                                  uint8_t *dst, int32_t dst_stride,
2288cabdff1aSopenharmony_ci                                  int32_t height, int32_t width)
2289cabdff1aSopenharmony_ci{
2290cabdff1aSopenharmony_ci    int32_t cnt, loop_cnt;
2291cabdff1aSopenharmony_ci    uint8_t *src_tmp, *dst_tmp;
2292cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2293cabdff1aSopenharmony_ci
2294cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
2295cabdff1aSopenharmony_ci        src_tmp = src;
2296cabdff1aSopenharmony_ci        dst_tmp = dst;
2297cabdff1aSopenharmony_ci
2298cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 3); loop_cnt--;) {
2299cabdff1aSopenharmony_ci            LD_UB8(src_tmp, src_stride,
2300cabdff1aSopenharmony_ci                   src0, src1, src2, src3, src4, src5, src6, src7);
2301cabdff1aSopenharmony_ci            src_tmp += (8 * src_stride);
2302cabdff1aSopenharmony_ci
2303cabdff1aSopenharmony_ci            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304cabdff1aSopenharmony_ci                   dst_tmp, dst_stride);
2305cabdff1aSopenharmony_ci            dst_tmp += (8 * dst_stride);
2306cabdff1aSopenharmony_ci        }
2307cabdff1aSopenharmony_ci
2308cabdff1aSopenharmony_ci        src += 16;
2309cabdff1aSopenharmony_ci        dst += 16;
2310cabdff1aSopenharmony_ci    }
2311cabdff1aSopenharmony_ci}
2312cabdff1aSopenharmony_ci
2313cabdff1aSopenharmony_civoid ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
2314cabdff1aSopenharmony_ci                            uint8_t *src, ptrdiff_t src_stride,
2315cabdff1aSopenharmony_ci                            int height, int mx, int my)
2316cabdff1aSopenharmony_ci{
2317cabdff1aSopenharmony_ci    int32_t cnt;
2318cabdff1aSopenharmony_ci    v16u8 src0, src1, src2, src3;
2319cabdff1aSopenharmony_ci
2320cabdff1aSopenharmony_ci    if (0 == height % 8) {
2321cabdff1aSopenharmony_ci        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2322cabdff1aSopenharmony_ci    } else if (0 == height % 4) {
2323cabdff1aSopenharmony_ci        for (cnt = (height >> 2); cnt--;) {
2324cabdff1aSopenharmony_ci            LD_UB4(src, src_stride, src0, src1, src2, src3);
2325cabdff1aSopenharmony_ci            src += (4 * src_stride);
2326cabdff1aSopenharmony_ci
2327cabdff1aSopenharmony_ci            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328cabdff1aSopenharmony_ci            dst += (4 * dst_stride);
2329cabdff1aSopenharmony_ci        }
2330cabdff1aSopenharmony_ci    }
2331cabdff1aSopenharmony_ci}
2332