1/*
2 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavcodec/vp9dsp.h"
22#include "libavutil/mips/generic_macros_msa.h"
23#include "vp9dsp_mips.h"
24
/* Byte-shuffle control masks for the MSA VSHF instructions used by the
 * horizontal filters.  Each row of 16 selects the sliding 8-tap input
 * windows from a pair of source vectors; per MSA VSHF convention, indices
 * 0..15 pick bytes from the first operand and 16..31 from the second. */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
33
/* 2-tap bilinear filter coefficients for the 15 non-zero fractional
 * positions (1/16 .. 15/16).  Each pair sums to 128, matching the
 * 7-bit rounding shift used by the filter kernels below. */
static const int8_t vp9_bilinear_filters_msa[15][2] = {
    {120, 8},
    {112, 16},
    {104, 24},
    {96, 32},
    {88, 40},
    {80, 48},
    {72, 56},
    {64, 64},
    {56, 72},
    {48, 80},
    {40, 88},
    {32, 96},
    {24, 104},
    {16, 112},
    {8, 120}
};
51
/* Applies one 8-tap filter via four signed byte dot-products:
 * vec0*filt0 + vec1*filt1 accumulated into tmp0, vec2*filt2 + vec3*filt3
 * into tmp1, then the two partial sums are combined with a saturating
 * halfword add.  Evaluates (GCC statement expression) to the v8i16 result. */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
                            filt0, filt1, filt2, filt3)         \
( {                                                             \
    v8i16 tmp0, tmp1;                                           \
                                                                \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
                                                                \
    tmp0;                                                       \
} )
65
/* Horizontal 8-tap filter for one vector-pair of source pixels:
 * gathers the tap windows from src0/src1 through the four shuffle masks,
 * runs FILT_8TAP_DPADD_S_H, then rounds with a shift of 7 and saturates
 * each halfword to the signed 8-bit range.  Evaluates to the v8i16 result. */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,          \
                        filt_h0, filt_h1, filt_h2, filt_h3)              \
( {                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
    v8i16 hz_out_m;                                                      \
                                                                         \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
               vec0_m, vec1_m, vec2_m, vec3_m);                          \
    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
                                   filt_h0, filt_h1, filt_h2, filt_h3);  \
                                                                         \
    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
                                                                         \
    hz_out_m;                                                            \
} )
82
/* Horizontal 8-tap filter for four 4-wide rows held in src0..src3
 * (each VSHF pairs two source vectors, so two rows land in one output
 * vector).  Taps 0/1 accumulate into res0/res1 and taps 2/3 into
 * res2/res3; the halves are merged with a saturating add into
 * out0 (rows 0-1) and out1 (rows 2-3).  No rounding/saturation here —
 * callers apply SRARI/SAT afterwards. */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                   \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \
}
101
/* Horizontal 8-tap filter for four 8-wide rows (one row per source
 * vector, each VSHF shuffling a vector against itself).  Taps 0/1
 * accumulate into res0..res3 and taps 2/3 into res4..res7; the two
 * partial-sum groups are merged with saturating adds into out0..out3,
 * one v8i16 per row.  Rounding/saturation is left to the caller. */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2, mask3,                \
                                   filt0, filt1, filt2, filt3,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                res0_m, res1_m, res2_m, res3_m);                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
                res4_m, res5_m, res6_m, res7_m);                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
                 res0_m, res1_m, res2_m, res3_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
                 res4_m, res5_m, res6_m, res7_m);                             \
    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
                res7_m, out0, out1, out2, out3);                              \
}
129
/* Packs the even bytes of in0/in1, XORs with 128 to flip the sign bit
 * back to the unsigned pixel range, averages the result with `dst`, and
 * stores 16 bytes at `pdst`. */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)  \
{                                                     \
    v16u8 tmp_m;                                      \
                                                      \
    tmp_m = PCKEV_XORI128_UB(in1, in0);               \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);       \
    ST_UB(tmp_m, (pdst));                             \
}
138
/* Packs the even bytes of in0/in1 (no sign-bit flip), averages with
 * `dst`, and stores 16 bytes at `pdst`. */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
{                                                             \
    v16u8 tmp_m;                                              \
                                                              \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
    ST_UB(tmp_m, (pdst));                                     \
}
147
/* Packs four halfword vectors into two byte vectors, averages them with
 * dst0/dst1, and stores the result as four 8-byte rows at `pdst` with
 * the given row stride. */
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3,  dst0, dst1,   \
                           pdst, stride)                      \
{                                                             \
    v16u8 tmp0_m, tmp1_m;                                     \
    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
                                                              \
    PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);          \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);        \
}
158
/* Horizontal 8-tap filter, 4x4 block.
 * Loads four rows, shifts pixels to the signed range (XOR 128), filters
 * each row, rounds by 7 and saturates, then packs (restoring the sign
 * bit) and stores four 32-bit words at dst with dst_stride. */
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    /* 4-width shuffle masks (second group of mc_filt_mask_arr) */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* back up to the first of the 8 taps */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 7);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
187
/* Horizontal 8-tap filter, 4x8 block.
 * Same scheme as common_hz_8t_4x4_msa, run over two batches of four
 * rows; the two result vectors are stored as two groups of four
 * 32-bit words. */
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    /* 4-width shuffle masks */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* back up to the first of the 8 taps */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* first four rows */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    /* remaining four rows */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 7);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
224
225static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
226                                uint8_t *dst, int32_t dst_stride,
227                                const int8_t *filter, int32_t height)
228{
229    if (4 == height) {
230        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
231    } else if (8 == height) {
232        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
233    }
234}
235
/* Horizontal 8-tap filter, 8x4 block.
 * Uses the 8-width shuffle masks (first group of mc_filt_mask_arr);
 * four rows are filtered in one macro call, rounded/saturated, then
 * packed and stored as four 8-byte rows. */
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up to the first of the 8 taps */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 7);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
}
266
/* Horizontal 8-tap filter, 8-wide, height a multiple of 4 (used for
 * heights >= 8).  Processes four rows per loop iteration with the same
 * pipeline as common_hz_8t_8x4_msa. */
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up to the first of the 8 taps */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* four rows per iteration */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
302
303static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
304                                uint8_t *dst, int32_t dst_stride,
305                                const int8_t *filter, int32_t height)
306{
307    if (4 == height) {
308        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
309    } else {
310        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
311                                 height);
312    }
313}
314
/* Horizontal 8-tap filter, 16-wide, height a multiple of 2.
 * Each output row needs two 16-byte loads (at src and src + 8) so that
 * all tap windows are available; two rows are processed per iteration. */
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up to the first of the 8 taps */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* two rows per iteration: src0/src1 = row 0, src2/src3 = row 1 */
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
353
/* Horizontal 8-tap filter, 32-wide, height a multiple of 2.
 * A row is covered by loads at offsets 0, 16 and 24; the middle vector
 * (bytes 8..23) is synthesized with __msa_sldi_b from the 0/16 loads.
 * The loop is software-pipelined: the second row's loads are issued
 * before the first row's stores (presumably to overlap load latency). */
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up to the first of the 8 taps */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* first row of the pair */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        /* bytes 8..23 assembled from src0/src2 */
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        /* load the second row before storing the first */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        /* second row of the pair */
        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
412
/* Horizontal 8-tap filter, 64-wide.
 * One row per loop iteration, handled as two 32-pixel halves; each half
 * uses the same load/sldi scheme as common_hz_8t_32w_msa. */
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up to the first of the 8 taps */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        /* left 32 pixels */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        /* bytes 8..23 assembled from src0/src2 */
        src1 = __msa_sldi_b(src2, src0, 8);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);

        /* right 32 pixels */
        src0 = LD_SB(src + 32);
        src2 = LD_SB(src + 48);
        src3 = LD_SB(src + 56);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}
469
/* Vertical 8-tap filter, 4-wide, height a multiple of 4.
 * Starts 3 rows above dst's first row to cover the tap window, primes a
 * 7-row history, then filters 4 new rows per iteration.  Adjacent rows
 * are byte-interleaved (ILVR_B*) and two interleaved 4-wide pairs are
 * packed into one vector (ILVR_D*), so each FILT_8TAP_DPADD_S_H call
 * yields two output rows. */
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v16u8 out;
    v8i16 filt, out10, out32;

    /* back up to the first of the 8 vertical taps */
    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prime the 7-row history */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        XORI_B2_128_SB(src8776, src10998);
        /* each call filters two output rows at once */
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        SRARI_H2_SH(out10, out32, 7);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the row history forward by four rows */
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
521
/* Vertical 8-tap filter, 8-wide, height a multiple of 4.
 * Primes a 7-row history, then filters 4 new rows per iteration.  Since
 * rows are 8 bytes wide, only right-half interleaves (ILVR_B*) are
 * needed; one FILT_8TAP_DPADD_S_H call per output row. */
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    /* back up to the first of the 8 vertical taps */
    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prime the 7-row history */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the interleaved row history forward by four rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
576
/* Vertical 8-tap filter, 16-wide, height a multiple of 4.
 * As common_vt_8t_8w_msa, but both right (_r) and left (_l) byte
 * interleaves are maintained so the full 16 columns are filtered; the
 * two halves are recombined with PCKEV before the store. */
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    /* back up to the first of the 8 vertical taps */
    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prime the 7-row history, keeping right and left interleaves */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        /* right (low) 8 columns */
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        /* left (high) 8 columns */
        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide both interleaved histories forward by four rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}
655
/* Vertical 8-tap filter for widths that are a multiple of 16
 * (height a multiple of 4).  Runs the common_vt_8t_16w_msa pipeline
 * once per 16-column strip, advancing src/dst by 16 after each strip. */
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    /* back up to the first of the 8 vertical taps */
    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* one full-height pass per 16-column strip */
    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        /* prime the 7-row history for this strip */
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            /* right (low) 8 columns */
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            /* left (high) 8 columns */
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* slide both interleaved histories forward by four rows */
            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        /* next 16-column strip */
        src += 16;
        dst += 16;
    }
}
744
/* Vertical 8-tap sub-pel filter for 32-pixel-wide blocks: delegates to the
 * generic multiple-of-16 column implementation with width fixed at 32. */
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t width = 32;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, width);
}
752
/* Vertical 8-tap sub-pel filter for 64-pixel-wide blocks: delegates to the
 * generic multiple-of-16 column implementation with width fixed at 64. */
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t width = 64;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, width);
}
760
/* 4-pixel-wide 2-D (horizontal then vertical) 8-tap sub-pel filter.
 * The horizontal 8-tap filter is applied first; its 16-bit outputs feed the
 * vertical 8-tap filter, which produces 4 output rows per loop iteration.
 * Results are rounded (shift 7), saturated and packed back to 8-bit pixels.
 * Requires height to be a multiple of 4. */
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

    /* shuffle mask set for the 4-width case (second table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* back up 3 columns and 3 rows so the 8-tap windows are centred */
    src -= (3 + 3 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* prologue: load and horizontally filter the first 7 rows */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    /* bias pixels to the signed range for the signed dot products */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    /* derive the odd-row vectors by byte-shifting neighbouring results */
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    /* interleave consecutive horizontal outputs for the vertical taps */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* rounding right shift by 7, then saturate before packing */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        /* pack to bytes and undo the signed bias */
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        /* store four 4-byte rows */
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the vertical-filter history down by 4 rows */
        hz_out5 = hz_out9;
        out0 = out2;
        out1 = out3;
        out2 = out4;
    }
}
836
/* 8-pixel-wide 2-D (horizontal then vertical) 8-tap sub-pel filter.
 * Each source row is horizontally filtered on its own; the 16-bit results
 * feed the vertical 8-tap filter, producing 4 output rows per iteration.
 * Results are rounded (shift 7), saturated and packed back to 8-bit pixels.
 * Requires height to be a multiple of 4. */
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

    /* shuffle mask set for the 8-width case (first table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up 3 columns and 3 rows so the 8-tap windows are centred */
    src -= (3 + 3 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* prologue: load and horizontally filter the first 7 rows */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    /* bias pixels to the signed range for the signed dot products */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    /* interleave consecutive horizontal outputs for the vertical taps */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);

        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
                                   filt_vt1, filt_vt2, filt_vt3);

        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* rounding right shift by 7, then saturate before packing */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        /* pack to bytes and undo the signed bias */
        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
        /* store four 8-byte rows */
        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the vertical-filter history down by 4 rows */
        hz_out6 = hz_out10;
        out0 = out2;
        out1 = out3;
        out2 = out8;
        out4 = out6;
        out5 = out7;
        out6 = out9;
    }
}
934
/* 16-pixel-wide 2-D 8-tap filter: processes the block as two independent
 * 8-pixel-wide columns using the 8-wide kernel. */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        common_hv_8ht_8vt_8w_msa(src + 8 * col, src_stride, dst + 8 * col,
                                 dst_stride, filter_horiz, filter_vert,
                                 height);
    }
}
951
/* 32-pixel-wide 2-D 8-tap filter: processes the block as four independent
 * 8-pixel-wide columns using the 8-wide kernel. */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        common_hv_8ht_8vt_8w_msa(src + 8 * col, src_stride, dst + 8 * col,
                                 dst_stride, filter_horiz, filter_vert,
                                 height);
    }
}
968
/* 64-pixel-wide 2-D 8-tap filter: processes the block as eight independent
 * 8-pixel-wide columns using the 8-wide kernel. */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 8; col++) {
        common_hv_8ht_8vt_8w_msa(src + 8 * col, src_stride, dst + 8 * col,
                                 dst_stride, filter_horiz, filter_vert,
                                 height);
    }
}
985
/* 4x4 horizontal 8-tap sub-pel filter with destination averaging
 * (VP9 "avg" motion compensation).  The filtered, rounded and saturated
 * result is averaged with the existing dst pixels before being stored. */
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst0, res;
    v16u8 mask0, mask1, mask2, mask3;
    v8i16 filt, res0, res1;

    /* shuffle mask set for the 4-width case (second table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* back up 3 columns so the 8-tap window is centred */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* bias pixels to the signed range for the signed dot products */
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, res0, res1);
    /* gather the four 4-byte dst rows into one vector for averaging */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    /* rounding right shift by 7, saturate, pack and undo the signed bias */
    SRARI_H2_SH(res0, res1, 7);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    /* average with the existing destination pixels */
    res = (v16u8) __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}
1020
/* 4x8 horizontal 8-tap sub-pel filter with destination averaging.
 * Processes the 8 rows as two batches of 4, then averages the packed
 * results with the existing dst pixels before storing. */
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
    v16u8 dst0, dst1;
    v8i16 filt, vec0, vec1, vec2, vec3;

    /* shuffle mask set for the 4-width case (second table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* back up 3 columns so the 8-tap window is centred */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* bias pixels to the signed range for the signed dot products */
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    /* gather all eight 4-byte dst rows into two vectors for averaging */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    /* first 4 rows */
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
    /* second 4 rows */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
    /* rounding right shift by 7, then saturate before packing */
    SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
    SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                res0, res1, res2, res3);
    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
    /* undo the signed bias, then average with the destination pixels */
    XORI_B2_128_UB(res0, res2);
    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
1065
/* Dispatch for the 4-pixel-wide horizontal 8-tap averaging filter.
 * Only heights of 4 and 8 are handled; other heights are a no-op. */
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                          filter);
        break;
    case 8:
        common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                          filter);
        break;
    default:
        break;
    }
}
1080
/* 8-pixel-wide horizontal 8-tap sub-pel filter with destination averaging.
 * Processes 4 rows per loop iteration; requires height to be a multiple
 * of 4. */
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    int32_t loop_cnt;
    int64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
    v8i16 filt, out0, out1, out2, out3;

    /* shuffle mask set for the 8-width case (first table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up 3 columns so the 8-tap window is centred */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* bias pixels to the signed range for the signed dot products */
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        /* gather the four 8-byte dst rows into two vectors for averaging */
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        /* rounding right shift by 7, then saturate before packing */
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        /* pack, average with dst and store four 8-byte rows */
        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
                                dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
1121
/* 16-pixel-wide horizontal 8-tap sub-pel filter with destination averaging.
 * Processes 2 rows per loop iteration; each row is filtered from two
 * overlapping 16-byte loads (at src and src + 8) so every output pixel has
 * its full 8-tap window available.  Requires height to be even. */
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

    /* shuffle mask set for the 8-width case (first table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up 3 columns so the 8-tap window is centred */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height >> 1; loop_cnt--;) {
        /* src0/src2: left halves of rows 0/1; src1/src3: right halves */
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        /* bias pixels to the signed range for the signed dot products */
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                   vec12);
        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                   vec13);
        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                   vec14);
        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                   vec15);
        /* accumulate taps 0+1 and taps 2+3 separately, then add */
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                    vec1, vec2, vec3);
        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                    vec9, vec10, vec11);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                     vec1, vec2, vec3);
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                     vec8, vec9, vec10, vec11);
        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
                    out1, out2, out3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        /* rounding right shift by 7, then saturate before packing */
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        /* pack, undo the signed bias, average with dst and store each row */
        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
        dst += dst_stride;
        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
        dst += dst_stride;
    }
}
1179
/* 32-pixel-wide horizontal 8-tap sub-pel filter with destination averaging.
 * Processes 1 row per loop iteration: loads bytes 0-15, 16-31 and 24-39 of
 * the (3-column-retreated) row and derives the 8-23 window with a byte
 * shift, so all 32 outputs have their full 8-tap support. */
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

    /* shuffle mask set for the 8-width case (first table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up 3 columns so the 8-tap window is centred */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        /* bytes 8..23, built from the two loads already in registers */
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        /* bias pixels to the signed range for the signed dot products */
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                   vec12);
        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                   vec13);
        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                   vec14);
        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                   vec15);
        /* accumulate taps 0+1 and taps 2+3 separately, then add */
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                    vec1, vec2, vec3);
        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                    vec9, vec10, vec11);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                     vec1, vec2, vec3);
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                     vec8, vec9, vec10, vec11);
        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
                    out1, out2, out3);
        /* rounding right shift by 7, then saturate before packing */
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        /* average each 16-byte half with dst and store */
        LD_UB2(dst, 16, dst1, dst2);
        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
        dst += dst_stride;
    }
}
1238
/* 64-pixel-wide horizontal 8-tap sub-pel filter with destination averaging.
 * Each row is processed as two independent 32-byte halves (inner cnt loop),
 * each handled exactly like the 32-wide version. */
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

    /* shuffle mask set for the 8-width case (first table entry) */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* back up 3 columns so the 8-tap window is centred */
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        /* two 32-byte halves per row: cnt << 5 selects the half */
        for (cnt = 0; cnt < 2; ++cnt) {
            src0 = LD_SB(&src[cnt << 5]);
            src2 = LD_SB(&src[16 + (cnt << 5)]);
            src3 = LD_SB(&src[24 + (cnt << 5)]);
            /* bytes 8..23 of the half, built from the loads in registers */
            src1 = __msa_sldi_b(src2, src0, 8);

            /* bias pixels to the signed range for the signed dot products */
            XORI_B4_128_SB(src0, src1, src2, src3);
            VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                       vec12);
            VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                       vec13);
            VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
                       vec10, vec14);
            VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
                       vec11, vec15);
            /* accumulate taps 0+1 and taps 2+3 separately, then add */
            DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                        vec0, vec1, vec2, vec3);
            DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
                        vec8, vec9, vec10, vec11);
            DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                         vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                         vec8, vec9, vec10, vec11);
            ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
                        out1, out2, out3);
            /* rounding right shift by 7, then saturate before packing */
            SRARI_H4_SH(out0, out1, out2, out3, 7);
            SAT_SH4_SH(out0, out1, out2, out3, 7);
            /* average each 16-byte quarter with dst and store */
            LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
            PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
            PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
        }

        src += src_stride;
        dst += dst_stride;
    }
}
1300
/* 4-pixel-wide vertical 8-tap sub-pel filter with destination averaging.
 * Row pairs are interleaved and packed two-per-vector so a single dot
 * product covers two output rows; 4 rows are produced per loop iteration.
 * Requires height to be a multiple of 4. */
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt;
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 dst0, out;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32;

    /* back up 3 rows so the 8-tap window is centred */
    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: load the first 7 rows and build the interleaved history */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    /* pack two interleaved row pairs per 128-bit vector */
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    /* bias pixels to the signed range for the signed dot products */
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        /* gather the four 4-byte dst rows into one vector for averaging */
        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        XORI_B2_128_SB(src8776, src10998);
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        /* rounding right shift by 7, then saturate before packing */
        SRARI_H2_SH(out10, out32, 7);
        SAT_SH2_SH(out10, out32, 7);
        /* pack, undo the signed bias and average with the dst pixels */
        out = PCKEV_XORI128_UB(out10, out32);
        out = __msa_aver_u_b(out, dst0);

        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the vertical-filter history down by 4 rows */
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
1359
/* 8-pixel-wide vertical 8-tap sub-pel filter with destination averaging.
 * Keeps a sliding window of interleaved row pairs and produces 4 output
 * rows per loop iteration.  Requires height to be a multiple of 4. */
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 dst0, dst1;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    /* back up 3 rows so the 8-tap window is centred */
    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: load the first 7 rows and build the interleaved history */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    /* bias pixels to the signed range for the signed dot products */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        /* gather the four 8-byte dst rows into two vectors for averaging */
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        /* rounding right shift by 7, then saturate before packing */
        SRARI_H4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        /* pack, average with dst and store four 8-byte rows */
        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
                                dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the vertical-filter history down by 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
1420
/* 8-tap vertical FIR filter whose output is averaged with the pixels
 * already in dst (the "avg" MC variant), processed as 16-pixel-wide
 * column strips.  'width' must be a multiple of 16 and 'height' a
 * multiple of 4; four output rows are produced per inner iteration. */
static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter,
                                                   int32_t height,
                                                   int32_t width)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 filt0, filt1, filt2, filt3;
    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;

    /* rewind 3 rows so the 8-tap window covers rows -3 .. +4 of output */
    src -= (3 * src_stride);

    /* broadcast the four 16-bit coefficient pairs of the 8-tap filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* one outer pass per 16-pixel-wide strip */
    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        /* prime the pipeline with the first 7 input rows */
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        /* flip MSBs so unsigned pixels work in the signed dot products */
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        /* interleave adjacent rows byte-wise (right and left vector
         * halves) so each dot product consumes two filter taps at once */
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            /* next 4 input rows plus the 4 dst rows to average with */
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
            XORI_B4_128_SB(src7, src8, src9, src10);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            /* 8-tap vertical FIR for the right/left halves of 4 rows */
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            /* rounding shift by 7 (FILTER_BITS), clamp, pack to bytes */
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            /* undo the sign-bit flip, then average with dst and store */
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                        dst0, dst1, dst2, dst3);
            ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* slide the interleaved 8-row window down by 4 rows */
            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}
1517
/* Vertical 8-tap filter with dst averaging, 16-pixel-wide block. */
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, 16);
}
1527
/* Vertical 8-tap filter with dst averaging, 32-pixel-wide block. */
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, 32);
}
1537
/* Vertical 8-tap filter with dst averaging, 64-pixel-wide block. */
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, 64);
}
1547
/* 2-D 8-tap filter (horizontal then vertical) with dst averaging for
 * 4-pixel-wide blocks; 'height' must be a multiple of 4.  Rows are
 * horizontally filtered first (two 4-wide rows per HORIZ_8TAP_FILT
 * call), then the intermediates feed the vertical 8-tap filter. */
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt;
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 dst0, res, mask0, mask1, mask2, mask3;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

    /* 4-width shuffle mask (second 16-byte block of mc_filt_mask_arr) */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* rewind 3 columns and 3 rows for the two 8-tap windows */
    src -= (3 + 3 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    /* flip MSBs so the pixels work in the signed dot products */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    /* horizontal pass over the 7 priming rows */
    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    /* derive the odd-numbered intermediates by byte-shifting the pairs */
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    /* pair consecutive intermediate rows for the vertical dot products */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        /* load the 4 dst rows (one 32-bit word each) to average with */
        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        /* rounding shift by 7 (FILTER_BITS), clamp, pack back to bytes */
        SRARI_H2_SH(res0, res1, 7);
        SAT_SH2_SH(res0, res1, 7);
        res = PCKEV_XORI128_UB(res0, res1);
        /* average with dst and store four 4-pixel rows */
        res = (v16u8) __msa_aver_u_b(res, dst0);
        ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the sliding window into the next iteration */
        hz_out5 = hz_out9;
        vec0 = vec2;
        vec1 = vec3;
        vec2 = vec4;
    }
}
1630
/* 2-D 8-tap filter (horizontal then vertical) with dst averaging for
 * 8-pixel-wide blocks; 'height' must be a multiple of 4.  One
 * HORIZ_8TAP_FILT call produces one intermediate 8-wide row here. */
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

    /* 8-width shuffle mask (first 16-byte block of mc_filt_mask_arr) */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* rewind 3 columns and 3 rows for the two 8-tap windows */
    src -= (3 + 3 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    /* flip MSBs for the signed dot products, then run the horizontal
     * pass over the 7 priming rows */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    /* pair consecutive intermediate rows for the vertical dot products */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        /* load the 4 dst rows (8 bytes each) to average with */
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);

        /* horizontal pass on each new row, then vertical 8-tap FIR */
        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);

        /* rounding shift by 7 (FILTER_BITS), clamp, then average+store */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1,
                                dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the sliding window into the next iteration */
        hz_out6 = hz_out10;
        out0 = out2;
        out1 = out3;
        out2 = out8;
        out4 = out6;
        out5 = out7;
        out6 = out9;
    }
}
1734
/* 2-D 8-tap filter with dst averaging, 16-pixel-wide block: handled as
 * two side-by-side 8-wide columns. */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + 8 * col, src_stride,
                                              dst + 8 * col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1754
/* 2-D 8-tap filter with dst averaging, 32-pixel-wide block: handled as
 * four side-by-side 8-wide columns. */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + 8 * col, src_stride,
                                              dst + 8 * col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1774
/* 2-D 8-tap filter with dst averaging, 64-pixel-wide block: handled as
 * eight side-by-side 8-wide columns. */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 8; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + 8 * col, src_stride,
                                              dst + 8 * col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1794
/* 2-tap (bilinear) horizontal filter, 4x4 block, plain put.
 * The bilinear coefficients are non-negative (see the table above), so
 * an unsigned dot product is sufficient — no sign-flip trick needed. */
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, vec0, vec1, res0, res1;
    v8u16 vec2, vec3, filt;

    /* 4-width mask: pairs each pixel with its right-hand neighbour */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* gather pixel pairs, dot with the coefficient pair, round by 7 */
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    SRARI_H2_UH(vec2, vec3, 7);
    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
    /* store four 4-byte rows */
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
1817
/* 2-tap (bilinear) horizontal filter, 4x8 block, plain put.  Same
 * scheme as the 4x4 variant but eight rows are processed at once. */
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 vec0, vec1, vec2, vec3, filt0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16i8 res0, res1, res2, res3;
    v8u16 vec4, vec5, vec6, vec7, filt;

    /* 4-width mask: pairs each pixel with its right-hand neighbour */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* gather pixel pairs for all 8 rows, dot product, round by 7, pack */
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec4, vec5, vec6, vec7);
    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                res0, res1, res2, res3);
    /* store eight 4-byte rows */
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
1846
1847void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1848                         const uint8_t *src, ptrdiff_t src_stride,
1849                         int height, int mx, int my)
1850{
1851    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1852
1853    if (4 == height) {
1854        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1855    } else if (8 == height) {
1856        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1857    }
1858}
1859
/* 2-tap (bilinear) horizontal filter, 8x4 block, plain put. */
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 filt0;
    v16i8 src0, src1, src2, src3, mask;
    v8u16 vec0, vec1, vec2, vec3, filt;

    /* 8-width mask: pairs each pixel with its right-hand neighbour */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* gather pixel pairs, unsigned dot product, round by 7, pack */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
    /* store four 8-byte rows */
    ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
}
1883
/* 2-tap (bilinear) horizontal filter, 8-wide, plain put, for height 8
 * or 16.  Rows are processed in groups of 4 with the next group's loads
 * issued before the current group's stores (software pipelining). */
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    v16u8 filt0;
    v16i8 src0, src1, src2, src3, mask, out0, out1;
    v8u16 vec0, vec1, vec2, vec3, filt;

    /* 8-width mask: pairs each pixel with its right-hand neighbour */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* rows 0-3 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    /* preload rows 4-7 before packing/storing rows 0-3 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

    /* rows 4-7 */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    if (16 == height) {
        /* rows 8-15, same pipelined pattern as above */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    vec0, vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    vec0, vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
    }
}
1945
1946void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1947                         const uint8_t *src, ptrdiff_t src_stride,
1948                         int height, int mx, int my)
1949{
1950    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1951
1952    if (4 == height) {
1953        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1954    } else {
1955        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1956                                 height);
1957    }
1958}
1959
/* Public entry: 16-pixel-wide horizontal bilinear put.  'mx' (1..15)
 * selects the coefficient pair; 'height' must be a multiple of 4.  The
 * first group of 4 rows is peeled out of the loop (software pipelining),
 * hence loop_cnt = (height >> 2) - 1. */
void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    /* 8-width mask: pairs each pixel with its right-hand neighbour */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    loop_cnt = (height >> 2) - 1;

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* each row is covered by two loads: bytes 0.. and bytes 8.. */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    /* gather pixel pairs, unsigned dot product, round by 7, pack+store */
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, 7);
    SRARI_H4_UH(out4, out5, out6, out7, 7);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;

    /* remaining groups of 4 rows */
    for (; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        SRARI_H4_UH(out0, out1, out2, out3, 7);
        SRARI_H4_UH(out4, out5, out6, out7, 7);
        PCKEV_ST_SB(out0, out1, dst);
        dst += dst_stride;
        PCKEV_ST_SB(out2, out3, dst);
        dst += dst_stride;
        PCKEV_ST_SB(out4, out5, dst);
        dst += dst_stride;
        PCKEV_ST_SB(out6, out7, dst);
        dst += dst_stride;
    }
}
2026
/* Public entry: 32-pixel-wide horizontal bilinear put.  'mx' (1..15)
 * selects the coefficient pair; two rows are processed per iteration,
 * so 'height' must be even. */
void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    /* 8-width mask: pairs each pixel with its right-hand neighbour */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        /* three loads per row; the bytes-8..23 vector is built by
         * shifting the concatenation of the 0.. and 16.. loads */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;
        src4 = LD_SB(src);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src5 = __msa_sldi_b(src6, src4, 8);
        src += src_stride;

        /* gather pixel pairs, dot product, round by 7, pack and store */
        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        SRARI_H4_UH(out0, out1, out2, out3, 7);
        SRARI_H4_UH(out4, out5, out6, out7, 7);
        PCKEV_ST_SB(out0, out1, dst);
        PCKEV_ST_SB(out2, out3, dst + 16);
        dst += dst_stride;
        PCKEV_ST_SB(out4, out5, dst);
        PCKEV_ST_SB(out6, out7, dst + 16);
        dst += dst_stride;
    }
}
2073
/* Public entry: 64-pixel-wide horizontal bilinear put.  'mx' (1..15)
 * selects the coefficient pair; one row is processed per iteration. */
void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    /* 8-width mask: pairs each pixel with its right-hand neighbour */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    for (loop_cnt = height; loop_cnt--;) {
        /* the odd 16-byte vectors are built by shifting concatenations
         * of the aligned loads; the +56 load reads a few bytes past the
         * 64-pixel row — presumably the MC buffers are padded (standard
         * for these code paths; confirm against the callers) */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src4 = LD_SB(src + 32);
        src6 = LD_SB(src + 48);
        src7 = LD_SB(src + 56);
        SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
        src += src_stride;

        /* gather pixel pairs, dot product, round by 7, pack and store */
        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        SRARI_H4_UH(out0, out1, out2, out3, 7);
        SRARI_H4_UH(out4, out5, out6, out7, 7);
        PCKEV_ST_SB(out0, out1, dst);
        PCKEV_ST_SB(out2, out3, dst + 16);
        PCKEV_ST_SB(out4, out5, dst + 32);
        PCKEV_ST_SB(out6, out7, dst + 48);
        dst += dst_stride;
    }
}
2116
/* 4x4 vertical bilinear (2-tap) "put" filter.
 * Reads 5 source rows to produce 4 output rows; two 4-wide row pairs are
 * packed into each vector so one dot product covers two output rows. */
static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v16u8 filt0;
    v8i16 filt;
    v8u16 tmp0, tmp1;

    /* replicate the (tap0,tap1) halfword in all lanes */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    /* interleave vertically adjacent rows: each byte pair feeds one tap pair */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    /* merge two 4-wide row pairs per vector */
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, 7);  /* taps sum to 128: round-shift by 7 */
    SAT_UH2_UH(tmp0, tmp1, 7);   /* unsigned-saturate before byte packing */
    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
}
2142
/* 4x8 vertical bilinear (2-tap) "put" filter.
 * Reads 9 source rows to produce 8 output rows; as in the 4x4 variant,
 * two 4-wide row pairs share one vector per dot product. */
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v16u8 filt0;
    v8i16 filt;

    /* replicate the (tap0,tap1) halfword in all lanes */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    /* ninth row: second tap of the last output row */
    src8 = LD_SB(src);
    src += src_stride;

    /* interleave vertically adjacent rows for the 2-tap dot products */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    /* merge two 4-wide row pairs per vector */
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src87_r, src76_r, src2110, src4332, src6554, src8776);
    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);  /* taps sum to 128 */
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);   /* clamp before byte packing */
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
2176
2177void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2178                         const uint8_t *src, ptrdiff_t src_stride,
2179                         int height, int mx, int my)
2180{
2181    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2182
2183    if (4 == height) {
2184        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2185    } else if (8 == height) {
2186        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2187    }
2188}
2189
/* 8x4 vertical bilinear (2-tap) "put" filter.
 * Reads 5 source rows to produce 4 output rows. */
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
    v16i8 out0, out1;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    /* interleave vertically adjacent rows; one vector per output row */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);  /* taps sum to 128 */
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);   /* clamp before byte packing */
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
2213
/* 8-wide vertical bilinear (2-tap) "put" filter for heights that are a
 * multiple of 8. The last row of each 8-row batch (src8) is carried over
 * as the first tap of the next batch, so only one extra row is loaded up
 * front. */
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v16i8 out0, out1;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    /* prime the pipeline with the first row */
    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        /* interleave each of the 8 vertically adjacent row pairs */
        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   vec0, vec1, vec2, vec3);
        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
                   vec4, vec5, vec6, vec7);
        /* first four output rows */
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);  /* taps sum to 128 */
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        /* last four output rows */
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
        dst += (8 * dst_stride);

        /* carry the last loaded row into the next iteration */
        src0 = src8;
    }
}
2258
2259void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2260                         const uint8_t *src, ptrdiff_t src_stride,
2261                         int height, int mx, int my)
2262{
2263    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2264
2265    if (4 == height) {
2266        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
2267    } else {
2268        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
2269                                 height);
2270    }
2271}
2272
/* 16-pixel-wide vertical bilinear (2-tap) "put" filter.
 * Four output rows per loop iteration; each 16-wide row needs both the
 * low (ILVR) and high (ILVL) interleave of its row pair. The last loaded
 * row is carried over as the first tap of the next iteration.
 * my selects the 2-tap pair; mx is unused on this vertical-only path. */
void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    /* prime the pipeline with the first row */
    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        /* rows 0/1: (vec0,vec1) low/high halves of pair (src0,src1),
         * (vec2,vec3) of pair (src1,src2) */
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);  /* taps sum to 128 */
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        dst += dst_stride;

        /* rows 2/3 interleaves, then finish row 1 */
        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst);
        dst += dst_stride;

        /* carry the last loaded row into the next iteration */
        src0 = src4;
    }
}
2326
/* 32-pixel-wide vertical bilinear (2-tap) "put" filter.
 * The 32 columns are handled as two independent 16-wide halves
 * (left: src0..src4, right: src5..src9), four rows per iteration.
 * The last row of each half is carried into the next iteration. */
void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    /* prime both halves with their first row */
    src0 = LD_UB(src);
    src5 = LD_UB(src + 16);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);

        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
        src += (4 * src_stride);

        /* left half, rows 0..3 */
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);  /* taps sum to 128 */
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);

        /* right half (columns 16..31), rows 0..3 */
        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 16);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);

        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
        dst += (4 * dst_stride);

        /* carry the last row of each half into the next iteration */
        src0 = src4;
        src5 = src9;
    }
}
2404
/* 64-pixel-wide vertical bilinear (2-tap) "put" filter.
 * The 64 columns are processed as four independent 16-wide stripes
 * (src0, src3, src6, src9 hold each stripe's previous row), two output
 * rows per iteration. */
void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    /* prime each 16-wide stripe with its first row */
    LD_UB4(src, 16, src0, src3, src6, src9);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* next two rows for every stripe */
        LD_UB2(src, src_stride, src1, src2);
        LD_UB2(src + 16, src_stride, src4, src5);
        LD_UB2(src + 32, src_stride, src7, src8);
        LD_UB2(src + 48, src_stride, src10, src11);
        src += (2 * src_stride);

        /* stripe 0 (columns 0..15) */
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);  /* taps sum to 128 */
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

        /* stripe 1 (columns 16..31) */
        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_ST_SB(tmp4, tmp5, dst + 16);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);

        /* stripe 2 (columns 32..47) */
        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_ST_SB(tmp0, tmp1, dst + 32);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);

        /* stripe 3 (columns 48..63) */
        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_ST_SB(tmp4, tmp5, dst + 48);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
        dst += (2 * dst_stride);

        /* carry each stripe's last row into the next iteration */
        src0 = src2;
        src3 = src5;
        src6 = src8;
        src9 = src11;
    }
}
2485
/* 4x4 combined horizontal+vertical bilinear (2-tap each) "put" filter.
 * Horizontal pass first (HORIZ_2TAP_FILT_UH, using the 4-width shuffle
 * mask), then a vertical 2-tap pass on the halfword intermediates. */
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_horiz, const int8_t *filter_vert)
{
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

    /* 4-width shuffle pattern (draws from two source registers) */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* 5 input rows give 5 horizontal results -> 4 vertical outputs */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    /* reconstruct the odd rows from the packed even-row results */
    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);

    /* vertical 2-tap on even bytes of adjacent horizontal results */
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, 7);  /* taps sum to 128 */
    SAT_UH2_UH(tmp0, tmp1, 7);
    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
2518
/* 4x8 combined horizontal+vertical bilinear (2-tap each) "put" filter.
 * Same scheme as the 4x4 variant, extended to 9 input rows -> 8 outputs. */
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_horiz, const int8_t *filter_vert)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
    v16i8 res0, res1, res2, res3;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

    /* 4-width shuffle pattern (draws from two source registers) */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_SB(src);  /* ninth row for the last vertical tap */

    /* horizontal pass: even rows computed directly... */
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
    /* ...odd rows reconstructed from the packed even-row results */
    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
               hz_out3, hz_out5);
    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);

    /* vertical 2-tap on even bytes of adjacent horizontal results */
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
                vec4, vec5, vec6, vec7);
    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);  /* taps sum to 128 */
    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                res0, res1, res2, res3);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
2564
2565void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2566                          const uint8_t *src, ptrdiff_t src_stride,
2567                          int height, int mx, int my)
2568{
2569    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2570    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2571
2572    if (4 == height) {
2573        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2574                                  filter_horiz, filter_vert);
2575    } else if (8 == height) {
2576        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2577                                  filter_horiz, filter_vert);
2578    }
2579}
2580
/* 8x4 combined horizontal+vertical bilinear (2-tap each) "put" filter.
 * Horizontal results are produced row by row and consumed immediately
 * by the vertical pass; hz_out0/hz_out1 ping-pong between iterations. */
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_horiz, const int8_t *filter_vert)
{
    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* 8-width shuffle pattern */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* 5 input rows -> 4 vertical outputs */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp1 = __msa_dotp_u_h(vec1, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp2 = __msa_dotp_u_h(vec2, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp3 = __msa_dotp_u_h(vec3, filt_vt);

    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);  /* taps sum to 128 */
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
2623
/* 8-wide combined horizontal+vertical bilinear (2-tap each) "put" filter
 * for heights that are a multiple of 8. Horizontal results ping-pong
 * between hz_out0/hz_out1; the final hz_out0 of one 8-row batch carries
 * into the next. */
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_horiz, const int8_t *filter_vert,
                                   int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
    v16u8 filt_hz, filt_vt, vec0;
    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    v8i16 filt;

    /* 8-width shuffle pattern */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* prime the vertical pipeline with the first horizontal result */
    src0 = LD_SB(src);
    src += src_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        /* rows 0..3 of this batch */
        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp1 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp2 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H2_UH(tmp1, tmp2, 7);  /* taps sum to 128 */
        SAT_UH2_UH(tmp1, tmp2, 7);

        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp3 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        /* prefetch the next four rows while the last result is finished */
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp4 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H2_UH(tmp3, tmp4, 7);
        SAT_UH2_UH(tmp3, tmp4, 7);
        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        /* rows 4..7 of this batch */
        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp5 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp6 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp7 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp8 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
        dst += (8 * dst_stride);
    }
}
2702
2703void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2704                          const uint8_t *src, ptrdiff_t src_stride,
2705                          int height, int mx, int my)
2706{
2707    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2708    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2709
2710    if (4 == height) {
2711        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2712                                  filter_horiz, filter_vert);
2713    } else {
2714        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2715                                      filter_horiz, filter_vert, height);
2716    }
2717}
2718
/* 16-pixel-wide combined horizontal+vertical bilinear (2-tap) "put".
 * Each 16-wide row is loaded as two 8-byte halves (src+0 and src+8),
 * filtered horizontally, then the halfword intermediates feed a
 * vertical 2-tap pass. hz_out0/hz_out2 and hz_out1/hz_out3 ping-pong
 * as previous/current rows; four output rows per loop iteration. */
void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1;
    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 filt;

    /* 8-width shuffle pattern */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* prime the vertical pipeline with the first row (both halves) */
    LD_SB2(src, 8, src0, src1);
    src += src_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);


    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even-index regs: left halves; odd-index regs: right halves */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);  /* taps sum to 128 */
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;

        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;

        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
        SRARI_H2_UH(tmp1, tmp2, 7);
        SAT_UH2_UH(tmp1, tmp2, 7);
        PCKEV_ST_SB(tmp1, tmp2, dst);
        dst += dst_stride;
    }
}
2789
2790void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2791                           const uint8_t *src, ptrdiff_t src_stride,
2792                           int height, int mx, int my)
2793{
2794    int32_t multiple8_cnt;
2795
2796    for (multiple8_cnt = 2; multiple8_cnt--;) {
2797        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
2798
2799        src += 16;
2800        dst += 16;
2801    }
2802}
2803
2804void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2805                           const uint8_t *src, ptrdiff_t src_stride,
2806                           int height, int mx, int my)
2807{
2808    int32_t multiple8_cnt;
2809
2810    for (multiple8_cnt = 4; multiple8_cnt--;) {
2811        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
2812
2813        src += 16;
2814        dst += 16;
2815    }
2816}
2817
/* 4x4 horizontal bilinear (2-tap) filter with destination averaging:
 * the filtered result is averaged with the existing dst pixels
 * (the "avg" motion-compensation variant). */
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, dst0, vec0, vec1, res;
    v8u16 vec2, vec3, filt;

    /* 4-width shuffle pattern (draws from two source registers) */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* gather the four 4-byte dst rows into one vector for averaging */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    SRARI_H2_UH(vec2, vec3, 7);  /* taps sum to 128 */

    res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
    /* rounded average of filtered result and existing dst pixels */
    res = (v16u8) __msa_aver_u_b(res, dst0);

    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}
2846
/* 4x8 horizontal bilinear (2-tap) filter with destination averaging. */
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v16u8 dst0, dst1;
    v8u16 vec4, vec5, vec6, vec7, filt;

    /* 4-width shuffle pattern (draws from two source registers) */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* gather the eight 4-byte dst rows into two vectors for averaging */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
                vec6, vec7);
    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);  /* taps sum to 128 */
    PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
                res2, res3);
    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
    /* rounded average with the existing dst pixels */
    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
2880
2881void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2882                         const uint8_t *src, ptrdiff_t src_stride,
2883                         int height, int mx, int my)
2884{
2885    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2886
2887    if (4 == height) {
2888        common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
2889                                          filter);
2890    } else if (8 == height) {
2891        common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
2892                                          filter);
2893    }
2894}
2895
/* Horizontal 2-tap (bilinear) filter over an 8x4 block; the rounded result
 * is averaged with the existing destination pixels (VP9 "avg" MC). */
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    int64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, dst0, dst1;
    v8u16 vec0, vec1, vec2, vec3, filt;

    /* shuffle mask for the 8-width case (first 16 bytes of the table) */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    /* round and shift down by FILTER_BITS (7) */
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    /* gather the four 8-byte destination rows into two vectors */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    /* pack to bytes, average with dst and store 8x4 */
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
}
2923
/* Horizontal 2-tap (bilinear) filter with destination averaging for
 * 8-wide blocks of height 8 or 16: two unrolled 4-row passes, plus two
 * more when height == 16. */
static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter,
                                                  int32_t height)
{
    int64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, mask;
    v16u8 filt0, dst0, dst1;
    v8u16 vec0, vec1, vec2, vec3, filt;

    /* shuffle mask for the 8-width case */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* rows 0..3 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    /* round and shift down by FILTER_BITS (7) */
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    /* prefetch the next 4 source rows before storing */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* rows 4..7 */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* rows 8..15 only for 8x16 */
    if (16 == height) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                    vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);

        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                    vec1, vec2, vec3);
        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
    }
}
2995
2996void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2997                         const uint8_t *src, ptrdiff_t src_stride,
2998                         int height, int mx, int my)
2999{
3000    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3001
3002    if (4 == height) {
3003        common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3004                                          filter);
3005    } else {
3006        common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3007                                              filter, height);
3008    }
3009}
3010
/* Horizontal 2-tap (bilinear) filter with destination averaging for
 * 16-wide blocks; processes 4 rows per iteration, each row as two 8-wide
 * halves (the extra load at src + 8 supplies the overlapping pixels the
 * 2-tap filter needs for the right half). */
void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* first 4 rows are peeled out of the loop */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    /* round and shift down by FILTER_BITS (7) */
    SRARI_H4_UH(res0, res1, res2, res3, 7);
    SRARI_H4_UH(res4, res5, res6, res7, 7);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* pack to bytes, average with dst and store, one 16-byte row each */
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
    dst += dst_stride;

    /* remaining height in groups of 4 rows */
    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
                    res5, res6, res7);
        SRARI_H4_UH(res0, res1, res2, res3, 7);
        SRARI_H4_UH(res4, res5, res6, res7, 7);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
        dst += dst_stride;
    }
}
3078
/* Horizontal 2-tap (bilinear) filter with destination averaging for
 * 32-wide blocks; processes 2 rows per iteration, each row as four 8-wide
 * halves.  The middle overlap vector (pixels 8..23) is built with a byte
 * shift instead of an extra unaligned load. */
void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* row 0: pixels 0..15, 16..31 and the unaligned 24..39 slice */
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        /* pixels 8..23, concatenated from src0/src2 */
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;
        /* row 1, same layout */
        src4 = LD_SB(src);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src5 = __msa_sldi_b(src6, src4, 8);
        src += src_stride;

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    res0, res1, res2, res3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    res4, res5, res6, res7);
        /* round and shift down by FILTER_BITS (7) */
        SRARI_H4_UH(res0, res1, res2, res3, 7);
        SRARI_H4_UH(res4, res5, res6, res7, 7);
        LD_UB2(dst, 16, dst0, dst1);
        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
        PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
        dst += dst_stride;
        LD_UB2(dst, 16, dst2, dst3);
        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
        PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
        dst += dst_stride;
    }
}
3128
/* Horizontal 2-tap (bilinear) filter with destination averaging for
 * 64-wide blocks; one row per iteration, processed as eight 8-wide
 * halves.  Odd-indexed source vectors (overlap slices) are built with
 * byte shifts; src7 is loaded unaligned at src + 56. */
void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    for (loop_cnt = height; loop_cnt--;) {
        /* aligned loads of pixels 0..15, 16..31, 32..47, 48..63 */
        LD_SB4(src, 16, src0, src2, src4, src6);
        src7 = LD_SB(src + 56);
        /* build the three intermediate overlap slices (8..23, 24..39, 40..55) */
        SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
        src += src_stride;

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        /* round and shift down by FILTER_BITS (7) */
        SRARI_H4_UH(out0, out1, out2, out3, 7);
        SRARI_H4_UH(out4, out5, out6, out7, 7);
        LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
        PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
        PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
        PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
        PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
        dst += dst_stride;
    }
}
3170
/* Vertical 2-tap (bilinear) filter over a 4x4 block; the rounded result
 * is averaged with the existing destination pixels (VP9 "avg" MC).
 * Needs 5 input rows to produce 4 output rows. */
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 dst0, out, filt0, src2110, src4332;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt;
    v8u16 tmp0, tmp1;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    src4 = LD_SB(src);
    src += src_stride;

    /* gather the four 4-byte destination rows into one vector */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    /* interleave consecutive rows so each byte pair feeds the 2-tap filter */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    /* round and shift down by FILTER_BITS (7), then saturate to 7 bits */
    SRARI_H2_UH(tmp0, tmp1, 7);
    SAT_UH2_UH(tmp0, tmp1, 7);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    out = __msa_aver_u_b(out, dst0);

    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
3206
/* Vertical 2-tap (bilinear) filter over a 4x8 block; the rounded result
 * is averaged with the existing destination pixels (VP9 "avg" MC).
 * Needs 9 input rows to produce 8 output rows. */
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0, dst1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16u8 src2110, src4332, src6554, src8776, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_SB(src);

    /* gather the eight 4-byte destination rows into two vectors */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    /* interleave consecutive rows for the 2-tap dot product */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src87_r, src76_r, src2110, src4332, src6554, src8776);
    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    /* round and shift down by FILTER_BITS (7), then saturate to 7 bits */
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
    AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
3245
3246void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3247                         const uint8_t *src, ptrdiff_t src_stride,
3248                         int height, int mx, int my)
3249{
3250    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3251
3252    if (4 == height) {
3253        common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3254                                          filter);
3255    } else if (8 == height) {
3256        common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3257                                          filter);
3258    }
3259}
3260
/* Vertical 2-tap (bilinear) filter over an 8x4 block; the rounded result
 * is averaged with the existing destination pixels (VP9 "avg" MC).
 * Needs 5 input rows to produce 4 output rows. */
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter)
{
    int64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    /* gather the four 8-byte destination rows into two vectors */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    /* interleave consecutive rows for the 2-tap dot product */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    /* round and shift down by FILTER_BITS (7), then saturate to 7 bits */
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}
3289
/* Vertical 2-tap (bilinear) filter with destination averaging for 8-wide
 * blocks whose height is a multiple of 8; processes 8 rows per iteration,
 * carrying the last source row over as the next iteration's first tap. */
static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter,
                                                  int32_t height)
{
    uint32_t loop_cnt;
    int64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    /* prime the pipeline with the first row */
    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        /* gather the eight 8-byte destination rows into four vectors */
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst2);
        INSERT_D2_UB(tp2, tp3, dst3);

        /* interleave consecutive rows for the 2-tap dot product */
        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   vec0, vec1, vec2, vec3);
        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
                   vec4, vec5, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        /* round by FILTER_BITS (7), saturate, average and store rows 0..3 */
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);

        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* last row of this batch becomes the first tap of the next */
        src0 = src8;
    }
}
3344
3345void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3346                         const uint8_t *src, ptrdiff_t src_stride,
3347                         int height, int mx, int my)
3348{
3349    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3350
3351    if (4 == height) {
3352        common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3353                                          filter);
3354    } else {
3355        common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3356                                              filter, height);
3357    }
3358}
3359
/* Vertical 2-tap (bilinear) filter with destination averaging for 16-wide
 * blocks; processes 4 rows per iteration, carrying the last source row
 * over as the next iteration's first tap. */
void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 tmp0, tmp1, tmp2, tmp3, filt;

    /* rearranging filter_y */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* prime the pipeline with the first row */
    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* interleave rows: low halves (ILVR) and high halves (ILVL) of
         * each 16-byte row feed separate dot products */
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        /* round by FILTER_BITS (7), saturate, average with dst, store */
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
        dst += dst_stride;

        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
        dst += dst_stride;

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
        dst += dst_stride;

        /* last row becomes the first tap of the next batch */
        src0 = src4;
    }
}
3413
/* Vertical 2-tap (bilinear) filter with destination averaging for 32-wide
 * blocks; processes 4 rows per iteration as two 16-wide columns (left
 * column src0..src4, right column src5..src9), carrying the last row of
 * each column over to the next iteration. */
void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3, filt;

    /* rearranging filter_y */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* prime both 16-wide columns with the first row */
    LD_UB2(src, 16, src0, src5);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* left column: next 4 source rows and 4 destination rows */
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);

        /* right column: next 4 source rows and 4 destination rows */
        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
        LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
        src += (4 * src_stride);

        /* left column rows 0..3: dot, round by FILTER_BITS (7),
         * saturate, average with dst, store */
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);

        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);

        /* right column rows 0..3 */
        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);

        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
        dst += (4 * dst_stride);

        /* last row of each column primes the next batch */
        src0 = src4;
        src5 = src9;
    }
}
3493
/* Vertical 2-tap (bilinear) filter with destination averaging for 64-wide
 * blocks; processes 2 rows per iteration as four 16-wide columns
 * (src0/src3/src6/src9 hold each column's carried previous row). */
void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
    v16u8 src0, src1, src2, src3, src4, src5;
    v16u8 src6, src7, src8, src9, src10, src11, filt0;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8u16 filt;

    /* rearranging filter_y */
    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    /* prime the four 16-wide columns with the first row */
    LD_UB4(src, 16, src0, src3, src6, src9);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* load 2 source and 2 destination rows for each column */
        LD_UB2(src, src_stride, src1, src2);
        LD_UB2(dst, dst_stride, dst0, dst1);
        LD_UB2(src + 16, src_stride, src4, src5);
        LD_UB2(dst + 16, dst_stride, dst2, dst3);
        LD_UB2(src + 32, src_stride, src7, src8);
        LD_UB2(dst + 32, dst_stride, dst4, dst5);
        LD_UB2(src + 48, src_stride, src10, src11);
        LD_UB2(dst + 48, dst_stride, dst6, dst7);
        src += (2 * src_stride);

        /* column 0: dot, round by FILTER_BITS (7), saturate,
         * average with dst, store */
        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);

        /* column 1 */
        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);

        /* column 2 */
        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);

        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);

        /* column 3 */
        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        SRARI_H2_UH(tmp4, tmp5, 7);
        SAT_UH2_UH(tmp4, tmp5, 7);
        PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));

        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        SRARI_H2_UH(tmp6, tmp7, 7);
        SAT_UH2_UH(tmp6, tmp7, 7);
        PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
        dst += (2 * dst_stride);

        /* last row of each column primes the next batch */
        src0 = src2;
        src3 = src5;
        src6 = src8;
        src9 = src11;
    }
}
3580
/* Separable 2-tap horizontal + 2-tap vertical (bilinear) filter over a
 * 4x4 block; the rounded result is averaged with the existing destination
 * pixels (VP9 "avg" MC).  Needs 5 input rows for 4 output rows. */
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1;
    v16u8 dst0, out;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

    /* shuffle mask for the 4-width case */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    /* horizontal pass: two rows per call, plus the extra 5th row */
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    /* derive the odd intermediate rows from their neighbours */
    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);

    /* gather the four 4-byte destination rows into one vector */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    /* vertical pass: round by FILTER_BITS (7) and saturate to 7 bits */
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, 7);
    SAT_UH2_UH(tmp0, tmp1, 7);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    out = __msa_aver_u_b(out, dst0);

    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
3624
/* 2-tap horizontal + 2-tap vertical (bilinear) filtering of a 4x8 block,
 * averaged with the existing destination pixels.  Same scheme as the 4x4
 * variant but processes 8 output rows, so 9 source rows are read.
 */
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
    v16u8 dst0, dst1;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* shuffle mask for the 4-width case */
    mask = LD_SB(&mc_filt_mask_arr[16]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* 9 input rows: 8 output rows + 1 extra for the 2-tap vertical */
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_SB(src);

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
    /* odd rows are obtained by byte-shifting/packing the even results */
    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
               hz_out3, hz_out5);
    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);

    /* load 8 destination rows (4 bytes each) into two vectors */
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
3676
3677void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3678                          const uint8_t *src, ptrdiff_t src_stride,
3679                          int height, int mx, int my)
3680{
3681    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3682    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3683
3684    if (4 == height) {
3685        common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3686                                               filter_horiz, filter_vert);
3687    } else if (8 == height) {
3688        common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3689                                               filter_horiz, filter_vert);
3690    }
3691}
3692
/* 2-tap horizontal + 2-tap vertical (bilinear) filtering of an 8x4 block,
 * averaged with the existing destination pixels.  Each source row is
 * filtered horizontally on its own (8-wide mask), then consecutive
 * horizontal results are interleaved and dot-producted for the vertical
 * tap, with round/saturate by 7 before the final average-store.
 */
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* shuffle mask for the 8-width case (first row of mc_filt_mask_arr) */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* 5 input rows for 4 output rows (2-tap vertical needs one extra) */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    /* 4 destination rows (8 bytes each) packed into two vectors */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    /* hz_out0/hz_out1 ping-pong: each new row pairs with the previous one */
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp1 = __msa_dotp_u_h(vec1, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
    tmp2 = __msa_dotp_u_h(vec2, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
    tmp3 = __msa_dotp_u_h(vec3, filt_vt);

    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
}
3742
/* 2-tap horizontal + 2-tap vertical (bilinear) filtering of an 8-wide
 * block of `height` rows (height assumed to be a multiple of 4 by the
 * loop structure), averaged with the destination.  The last horizontal
 * result is carried across loop iterations so each row is filtered
 * horizontally exactly once.
 */
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       const int8_t *filter_horiz,
                                                       const int8_t *filter_vert,
                                                       int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, mask;
    v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt;

    /* shuffle mask for the 8-width case */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* prime the vertical filter with the first row's horizontal result */
    src0 = LD_SB(src);
    src += src_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        /* hz_out0/hz_out1 ping-pong between iterations of the 4 rows */
        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp0 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp1 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);

        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = __msa_dotp_u_h(vec0, filt_vt);

        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
        tmp3 = __msa_dotp_u_h(vec0, filt_vt);

        SRARI_H2_UH(tmp2, tmp3, 7);
        SAT_UH2_UH(tmp2, tmp3, 7);
        /* 4 destination rows for the average */
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3804
3805void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3806                          const uint8_t *src, ptrdiff_t src_stride,
3807                          int height, int mx, int my)
3808{
3809    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3810    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3811
3812    if (4 == height) {
3813        common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3814                                               filter_horiz, filter_vert);
3815    } else {
3816        common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride,
3817                                                   dst, dst_stride,
3818                                                   filter_horiz, filter_vert,
3819                                                   height);
3820    }
3821}
3822
/* 16-wide bilinear H+V "avg" MC.  Each 16-pixel row is filtered as two
 * 8-pixel halves (even/odd source vectors loaded 8 bytes apart); the
 * horizontal results ping-pong between hz_out0/hz_out2 and
 * hz_out1/hz_out3 across rows so every row is filtered once.  The loop
 * processes 4 output rows per iteration; height is assumed to be a
 * multiple of 4 by the loop structure.
 */
void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
    v8i16 filt;

    /* shuffle mask for the 8-width case (used per 8-pixel half) */
    mask = LD_SB(&mc_filt_mask_arr[0]);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);

    /* prime the vertical filter with the first row (both 8-pixel halves) */
    LD_SB2(src, 8, src0, src1);
    src += src_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even vectors: left halves of 4 rows; odd vectors: right halves */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
        dst += dst_stride;

        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
        dst += dst_stride;

        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
        dst += dst_stride;

        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
        SRARI_H2_UH(tmp0, tmp1, 7);
        SAT_UH2_UH(tmp0, tmp1, 7);
        PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
        dst += dst_stride;
    }
}
3893
/* 32-wide bilinear H+V "avg" MC: process the block as two adjacent
 * 16-wide columns using the 16-wide kernel. */
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        ff_avg_bilin_16hv_msa(dst + 16 * col, dst_stride,
                              src + 16 * col, src_stride, height, mx, my);
    }
}
3907
/* 64-wide bilinear H+V "avg" MC: process the block as four adjacent
 * 16-wide columns using the 16-wide kernel. */
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        ff_avg_bilin_16hv_msa(dst + 16 * col, dst_stride,
                              src + 16 * col, src_stride, height, mx, my);
    }
}
3921
/* Plain copy of an 8-byte-wide block.  Rows are moved through 64-bit
 * GPR loads/stores, 8 (or 4) rows per iteration; heights that are not a
 * multiple of 4 are not handled. */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t row;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if ((height % 8) == 0) {
        for (row = height; row > 0; row -= 8) {
            LD4(src, src_stride, out0, out1, out2, out3);
            LD4(src + 4 * src_stride, src_stride, out4, out5, out6, out7);
            src += (8 * src_stride);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            SD4(out4, out5, out6, out7, dst + 4 * dst_stride, dst_stride);
            dst += (8 * dst_stride);
        }
    } else if ((height % 4) == 0) {
        for (row = height; row > 0; row -= 4) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
3951
3952static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
3953                             uint8_t *dst, int32_t dst_stride,
3954                             int32_t height)
3955{
3956    int32_t cnt;
3957    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3958
3959    if (8 == height) {
3960        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3961        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3962    } else if (16 == height) {
3963        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3964        src += (8 * src_stride);
3965        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3966        dst += (8 * dst_stride);
3967        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3968        src += (8 * src_stride);
3969        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3970        dst += (8 * dst_stride);
3971    } else if (32 == height) {
3972        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3973        src += (8 * src_stride);
3974        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3975        dst += (8 * dst_stride);
3976        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3977        src += (8 * src_stride);
3978        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3979        dst += (8 * dst_stride);
3980        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3981        src += (8 * src_stride);
3982        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3983        dst += (8 * dst_stride);
3984        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3985        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3986    } else if (0 == height % 4) {
3987        for (cnt = (height >> 2); cnt--;) {
3988            LD_UB4(src, src_stride, src0, src1, src2, src3);
3989            src += (4 * src_stride);
3990            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
3991            dst += (4 * dst_stride);
3992        }
3993    }
3994}
3995
3996static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
3997                             uint8_t *dst, int32_t dst_stride,
3998                             int32_t height)
3999{
4000    int32_t cnt;
4001    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4002
4003    if (0 == height % 8) {
4004        for (cnt = (height >> 3); cnt--;) {
4005            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4006            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
4007            LD_UB8(src + 16, src_stride, src0, src1, src2, src3, src4, src5, src6,
4008                   src7);
4009            src += (8 * src_stride);
4010            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16,
4011                   dst_stride);
4012            dst += (8 * dst_stride);
4013        }
4014    } else if (0 == height % 4) {
4015        for (cnt = (height >> 2); cnt--;) {
4016            LD_UB4(src, src_stride, src0, src1, src2, src3);
4017            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4018            src += (4 * src_stride);
4019            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4020            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4021            dst += (4 * dst_stride);
4022        }
4023    }
4024}
4025
4026static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
4027                             uint8_t *dst, int32_t dst_stride,
4028                             int32_t height)
4029{
4030    int32_t cnt;
4031    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4032    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4033
4034    for (cnt = (height >> 2); cnt--;) {
4035        LD_UB4(src, 16, src0, src1, src2, src3);
4036        src += src_stride;
4037        LD_UB4(src, 16, src4, src5, src6, src7);
4038        src += src_stride;
4039        LD_UB4(src, 16, src8, src9, src10, src11);
4040        src += src_stride;
4041        LD_UB4(src, 16, src12, src13, src14, src15);
4042        src += src_stride;
4043
4044        ST_UB4(src0, src1, src2, src3, dst, 16);
4045        dst += dst_stride;
4046        ST_UB4(src4, src5, src6, src7, dst, 16);
4047        dst += dst_stride;
4048        ST_UB4(src8, src9, src10, src11, dst, 16);
4049        dst += dst_stride;
4050        ST_UB4(src12, src13, src14, src15, dst, 16);
4051        dst += dst_stride;
4052    }
4053}
4054
4055static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
4056                           uint8_t *dst, int32_t dst_stride,
4057                           int32_t height)
4058{
4059    uint32_t tp0, tp1, tp2, tp3;
4060    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
4061
4062    if (8 == height) {
4063        LW4(src, src_stride, tp0, tp1, tp2, tp3);
4064        src += 4 * src_stride;
4065        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4066        LW4(src, src_stride, tp0, tp1, tp2, tp3);
4067        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
4068        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4069        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4070        LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
4071        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
4072        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4073        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4074    } else if (4 == height) {
4075        LW4(src, src_stride, tp0, tp1, tp2, tp3);
4076        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4077        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4078        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4079        dst0 = __msa_aver_u_b(src0, dst0);
4080        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
4081    }
4082}
4083
4084static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
4085                           uint8_t *dst, int32_t dst_stride,
4086                           int32_t height)
4087{
4088    int32_t cnt;
4089    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
4090    v16u8 src0, src1, src2, src3;
4091    v16u8 dst0, dst1, dst2, dst3;
4092
4093    if (0 == (height % 8)) {
4094        for (cnt = (height >> 3); cnt--;) {
4095            LD4(src, src_stride, tp0, tp1, tp2, tp3);
4096            src += 4 * src_stride;
4097            LD4(src, src_stride, tp4, tp5, tp6, tp7);
4098            src += 4 * src_stride;
4099            INSERT_D2_UB(tp0, tp1, src0);
4100            INSERT_D2_UB(tp2, tp3, src1);
4101            INSERT_D2_UB(tp4, tp5, src2);
4102            INSERT_D2_UB(tp6, tp7, src3);
4103            LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4104            LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
4105            INSERT_D2_UB(tp0, tp1, dst0);
4106            INSERT_D2_UB(tp2, tp3, dst1);
4107            INSERT_D2_UB(tp4, tp5, dst2);
4108            INSERT_D2_UB(tp6, tp7, dst3);
4109            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
4110                        dst1, dst2, dst3);
4111            ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
4112            dst += 8 * dst_stride;
4113        }
4114    } else if (4 == height) {
4115        LD4(src, src_stride, tp0, tp1, tp2, tp3);
4116        INSERT_D2_UB(tp0, tp1, src0);
4117        INSERT_D2_UB(tp2, tp3, src1);
4118        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4119        INSERT_D2_UB(tp0, tp1, dst0);
4120        INSERT_D2_UB(tp2, tp3, dst1);
4121        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4122        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
4123    }
4124}
4125
4126static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
4127                            uint8_t *dst, int32_t dst_stride,
4128                            int32_t height)
4129{
4130    int32_t cnt;
4131    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4132    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4133
4134    if (0 == (height % 8)) {
4135        for (cnt = (height / 8); cnt--;) {
4136            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4137            src += (8 * src_stride);
4138            LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4139
4140            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4141                        dst0, dst1, dst2, dst3);
4142            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4143                        dst4, dst5, dst6, dst7);
4144            ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
4145            dst += (8 * dst_stride);
4146        }
4147    } else if (0 == (height % 4)) {
4148        for (cnt = (height / 4); cnt--;) {
4149            LD_UB4(src, src_stride, src0, src1, src2, src3);
4150            src += (4 * src_stride);
4151            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4152
4153            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4154                        dst0, dst1, dst2, dst3);
4155            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
4156            dst += (4 * dst_stride);
4157        }
4158    }
4159}
4160
/* avg-MC for a 32-byte-wide block.  The destination is walked with a
 * second pointer (dst_dup) for the read side so that the write-back
 * loop at the bottom can reuse `dst`; even-numbered vectors hold the
 * left 16-byte column, odd-numbered ones the right column. */
static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint8_t *dst_dup = dst;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;

    if (0 == (height % 8)) {
        for (cnt = (height / 8); cnt--;) {
            /* first 4 rows: left column in even regs, right in odd regs */
            LD_UB4(src, src_stride, src0, src2, src4, src6);
            LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
            src += (4 * src_stride);
            LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
            LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
            dst_dup += (4 * dst_stride);
            /* second 4 rows */
            LD_UB4(src, src_stride, src8, src10, src12, src14);
            LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
            src += (4 * src_stride);
            LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
            LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
            dst_dup += (4 * dst_stride);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);
            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                        dst4, dst5, dst6, dst7);
            AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
                        dst8, dst9, dst10, dst11);
            AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
                        dst12, dst13, dst14, dst15);

            ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
            ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
            dst += (4 * dst_stride);
            ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
            ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src2, src4, src6);
            LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
            src += (4 * src_stride);
            LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
            LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
            dst_dup += (4 * dst_stride);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);
            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                        dst4, dst5, dst6, dst7);

            ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
            ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
4223
/* avg-MC for a 64-byte-wide block: each iteration averages 4 full rows
 * (four 16-byte vectors per row) against the destination.  dst_dup
 * tracks the read side of the destination; heights that are not a
 * multiple of 4 leave remainder rows untouched. */
static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint8_t *dst_dup = dst;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
        dst_dup += dst_stride;
        LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
        dst_dup += dst_stride;
        LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
        dst_dup += dst_stride;
        LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
        dst_dup += dst_stride;

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
                    dst8, dst9, dst10, dst11);
        AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
                    dst12, dst13, dst14, dst15);

        ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
        dst += dst_stride;
        ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
        dst += dst_stride;
        ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
        dst += dst_stride;
        ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
        dst += dst_stride;
    }
}
4273
/* 8-tap sub-pel interpolation coefficient tables, indexed by filter bank
 * (regular / sharp / smooth) and sub-pel phase 1..15 (phase 0 is the
 * full-pel copy and has no entry; callers index with [mx-1] / [my-1]).
 * Each row holds 8 signed taps; the coefficients match the VP9 spec
 * filter banks — NOTE(review): assumed, confirm against the spec. */
static const int8_t vp9_subpel_filters_msa[3][15][8] = {
    [FILTER_8TAP_REGULAR] = {
         {0, 1, -5, 126, 8, -3, 1, 0},
         {-1, 3, -10, 122, 18, -6, 2, 0},
         {-1, 4, -13, 118, 27, -9, 3, -1},
         {-1, 4, -16, 112, 37, -11, 4, -1},
         {-1, 5, -18, 105, 48, -14, 4, -1},
         {-1, 5, -19, 97, 58, -16, 5, -1},
         {-1, 6, -19, 88, 68, -18, 5, -1},
         {-1, 6, -19, 78, 78, -19, 6, -1},
         {-1, 5, -18, 68, 88, -19, 6, -1},
         {-1, 5, -16, 58, 97, -19, 5, -1},
         {-1, 4, -14, 48, 105, -18, 5, -1},
         {-1, 4, -11, 37, 112, -16, 4, -1},
         {-1, 3, -9, 27, 118, -13, 4, -1},
         {0, 2, -6, 18, 122, -10, 3, -1},
         {0, 1, -3, 8, 126, -5, 1, 0},
    }, [FILTER_8TAP_SHARP] = {
        {-1, 3, -7, 127, 8, -3, 1, 0},
        {-2, 5, -13, 125, 17, -6, 3, -1},
        {-3, 7, -17, 121, 27, -10, 5, -2},
        {-4, 9, -20, 115, 37, -13, 6, -2},
        {-4, 10, -23, 108, 48, -16, 8, -3},
        {-4, 10, -24, 100, 59, -19, 9, -3},
        {-4, 11, -24, 90, 70, -21, 10, -4},
        {-4, 11, -23, 80, 80, -23, 11, -4},
        {-4, 10, -21, 70, 90, -24, 11, -4},
        {-3, 9, -19, 59, 100, -24, 10, -4},
        {-3, 8, -16, 48, 108, -23, 10, -4},
        {-2, 6, -13, 37, 115, -20, 9, -4},
        {-2, 5, -10, 27, 121, -17, 7, -3},
        {-1, 3, -6, 17, 125, -13, 5, -2},
        {0, 1, -3, 8, 127, -7, 3, -1},
    }, [FILTER_8TAP_SMOOTH] = {
        {-3, -1, 32, 64, 38, 1, -3, 0},
        {-2, -2, 29, 63, 41, 2, -3, 0},
        {-2, -2, 26, 63, 43, 4, -4, 0},
        {-2, -3, 24, 62, 46, 5, -4, 0},
        {-2, -3, 21, 60, 49, 7, -4, 0},
        {-1, -4, 18, 59, 51, 9, -4, 0},
        {-1, -4, 16, 57, 53, 12, -4, -1},
        {-1, -4, 14, 55, 55, 14, -4, -1},
        {-1, -4, 12, 53, 57, 16, -4, -1},
        {0, -4, 9, 51, 59, 18, -4, -1},
        {0, -4, 7, 49, 60, 21, -3, -2},
        {0, -4, 5, 46, 62, 24, -3, -2},
        {0, -4, 4, 43, 63, 26, -2, -2},
        {0, -3, 2, 41, 63, 29, -2, -2},
        {0, -3, 1, 38, 64, 32, -1, -3},
    }
};
4325
/* Generates the six 8-tap MC entry points for one block width and filter
 * family: ff_{put,avg}_8tap_<type>_<SIZE>{h,v,hv}_msa.  The mx/my
 * arguments are the sub-pel phases (1..15) and are turned into filter
 * table rows via [phase - 1]; the actual filtering is done by the
 * width-specific common_* helpers defined earlier in this file. */
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                          \
void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filt = vp9_subpel_filters_msa[type_idx][mx - 1];            \
                                                                              \
    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filt, h);      \
}                                                                             \
                                                                              \
void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filt = vp9_subpel_filters_msa[type_idx][my - 1];            \
                                                                              \
    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filt, h);      \
}                                                                             \
                                                                              \
void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,   \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my)               \
{                                                                             \
    const int8_t *filt_h = vp9_subpel_filters_msa[type_idx][mx - 1];          \
    const int8_t *filt_v = vp9_subpel_filters_msa[type_idx][my - 1];          \
                                                                              \
    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, filt_h,   \
                                    filt_v, h);                               \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filt = vp9_subpel_filters_msa[type_idx][mx - 1];            \
                                                                              \
    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,              \
                                            dststride, filt, h);              \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filt = vp9_subpel_filters_msa[type_idx][my - 1];            \
                                                                              \
    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,   \
                                            filt, h);                         \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,   \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my)               \
{                                                                             \
    const int8_t *filt_h = vp9_subpel_filters_msa[type_idx][mx - 1];          \
    const int8_t *filt_v = vp9_subpel_filters_msa[type_idx][my - 1];          \
                                                                              \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,         \
                                                 dststride, filt_h,           \
                                                 filt_v, h);                  \
}
4393
/* Generates the full-pel wrappers ff_copy<SIZE>_msa and ff_avg<SIZE>_msa.
 * mx and my are always the full-pel phase for these entry points, so they
 * are ignored; the width-specific copy/average helpers do the work. */
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                              \
void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,           \
                         const uint8_t *src, ptrdiff_t srcstride,     \
                         int h, int mx, int my)                       \
{                                                                     \
    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);        \
}                                                                     \
                                                                      \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,            \
                        const uint8_t *src, ptrdiff_t srcstride,      \
                        int h, int mx, int my)                        \
{                                                                     \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);         \
}
4410
/* Generates only the ff_avg<SIZE>_msa wrapper, for widths that do not
 * get an MSA copy wrapper.  mx/my are ignored (full-pel case). */
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                                   \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,            \
                        const uint8_t *src, ptrdiff_t srcstride,      \
                        int h, int mx, int my)                        \
{                                                                     \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);         \
}
4419
/* Instantiate the six 8-tap put/avg h/v/hv entry points for every block
 * width (64 down to 4) and each of the three filter families. */
VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR);

VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP);

VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH);

/* Full-pel copy+avg wrappers for widths 64..8.  Width 4 only gets the
 * avg wrapper here — presumably ff_copy4 is provided by a non-MSA path;
 * TODO(review): confirm against the vp9dsp init code. */
VP9_COPY_AVG_MIPS_MSA_FUNC(64);
VP9_COPY_AVG_MIPS_MSA_FUNC(32);
VP9_COPY_AVG_MIPS_MSA_FUNC(16);
VP9_COPY_AVG_MIPS_MSA_FUNC(8);
VP9_AVG_MIPS_MSA_FUNC(4);

/* The generator macros are local to this file; drop them once used. */
#undef VP9_8TAP_MIPS_MSA_FUNC
#undef VP9_COPY_AVG_MIPS_MSA_FUNC
#undef VP9_AVG_MIPS_MSA_FUNC
4447