1/*
2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavcodec/vp8dsp.h"
22#include "libavutil/mips/generic_macros_msa.h"
23#include "vp8dsp_mips.h"
24
/*
 * Byte-shuffle index patterns: three rows of 16 entries.
 * Every row is a list of overlapping index pairs (i, i + 1). Callers load one
 * row as a vector and obtain the masks for the following tap pairs by adding
 * 2 (and 4) to every index.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* row 0 (offset 0): loaded by the 8-pixel-wide routines */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* row 1 (offset 16): loaded by the 4-pixel-wide routines; indices 16..20
     * presumably address the second vector handed to the shuffle */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* row 2 (offset 32): further 4-width pattern built on bytes 8..12 */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33
/*
 * Sub-pixel filter coefficients, one 8-byte row per fractional position.
 * Rows are selected with [mx - 1] / [my - 1]. The non-zero coefficients of
 * every row sum to 128, matching the rounding shift by 7 applied after
 * filtering. Rows 4..6 are the mirror images of rows 2..0.
 *
 * The users fetch a row with a full 16-byte vector load (LD_SH into a v8i16)
 * although only halfwords 0..2 are consumed. With exactly seven rows the load
 * of row 6 would read 8 bytes beyond the table, so an all-zero eighth row is
 * appended purely as padding; it is never selected as a filter.
 */
static const int8_t subpel_filters_msa[7 + 1][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},      /* 4 tap filter */
    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},       /* 4 tap filter */
    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},       /* 4 tap filter */
    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},      /* 4 tap filter */
    {0, 0, 0, 0, 0, 0, 0, 0},           /* padding for 16-byte vector loads */
};
43
/*
 * Two-tap (bilinear) weights for the seven fractional positions.
 * Row i holds {128 - 16 * (i + 1), 16 * (i + 1)}, so each pair sums to 128.
 */
static const int8_t bilinear_filters_msa[7][2] = {
    { 112,  16 }, {  96,  32 }, {  80,  48 },
    {  64,  64 },
    {  48,  80 }, {  32,  96 }, {  16, 112 }
};
53
/*
 * 6-tap horizontal filter on one shuffled vector pair.
 * The three masks each gather the bytes for one tap pair out of src0/src1;
 * the gathered bytes are combined with filt_h0..filt_h2 by DPADD_SH3_SH.
 * The sum is then rounded with a right shift by 7 and saturated (sat_s, 7).
 * Evaluates to the resulting v8i16.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \
                        filt_h0, filt_h1, filt_h2) \
( { \
    v16i8 shf0_m, shf1_m, shf2_m; \
    v8i16 filt_out_m; \
    \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \
               shf0_m, shf1_m, shf2_m); \
    filt_out_m = DPADD_SH3_SH(shf0_m, shf1_m, shf2_m, \
                              filt_h0, filt_h1, filt_h2); \
    filt_out_m = __msa_sat_s_h(__msa_srari_h(filt_out_m, 7), 7); \
    \
    filt_out_m; \
} )
70
/*
 * 6-tap horizontal filter for four 4-pixel-wide rows.
 * Rows are shuffled two at a time (src0 with src1, src2 with src3). For each
 * tap pair the matching mask gathers the bytes, which are multiplied by the
 * matching coefficient vector and accumulated: out0 receives the sums for the
 * first row pair, out1 those for the second. Rounding and saturation are
 * left to the caller.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, mask2, \
                                   filt0, filt1, filt2, \
                                   out0, out1) \
{ \
    v16i8 tap01_a_m, tap01_b_m; \
    v16i8 tap23_a_m, tap23_b_m; \
    v16i8 tap45_a_m, tap45_b_m; \
    \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, tap01_a_m, tap01_b_m); \
    DOTP_SB2_SH(tap01_a_m, tap01_b_m, filt0, filt0, out0, out1); \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, tap23_a_m, tap23_b_m); \
    DPADD_SB2_SH(tap23_a_m, tap23_b_m, filt1, filt1, out0, out1); \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, tap45_a_m, tap45_b_m); \
    DPADD_SB2_SH(tap45_a_m, tap45_b_m, filt2, filt2, out0, out1); \
}
85
/*
 * 6-tap horizontal filter for four source vectors, one output vector each.
 * Every source vector is shuffled against itself. For each tap pair the
 * matching mask gathers the bytes and the matching coefficient vector is
 * applied; out0..out3 accumulate the sums for src0..src3 respectively.
 * Rounding and saturation are left to the caller.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, mask2, \
                                   filt0, filt1, filt2, \
                                   out0, out1, out2, out3) \
{ \
    v16i8 sel0_m, sel1_m, sel2_m, sel3_m; \
    v16i8 sel4_m, sel5_m, sel6_m, sel7_m; \
    \
    /* first tap pair: bytes chosen by mask0, weighted by filt0 */ \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, sel0_m, sel1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, sel2_m, sel3_m); \
    DOTP_SB4_SH(sel0_m, sel1_m, sel2_m, sel3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3); \
    /* second tap pair: mask1 / filt1 */ \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, sel0_m, sel1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, sel2_m, sel3_m); \
    DPADD_SB4_SH(sel0_m, sel1_m, sel2_m, sel3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3); \
    /* third tap pair: mask2 / filt2 */ \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, sel4_m, sel5_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, sel6_m, sel7_m); \
    DPADD_SB4_SH(sel4_m, sel5_m, sel6_m, sel7_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3); \
}
106
/*
 * 4-tap multiply-accumulate: dot product of vec0 with filt0, plus the dot
 * product of vec1 with filt1. Evaluates to the v8i16 sum; no rounding or
 * saturation is applied.
 *
 * The temporary carries the "_m" suffix used by the other macros in this
 * file so that it cannot capture a caller variable (several callers have a
 * local named tmp0), and every argument is parenthesised before it is cast
 * so that expression arguments are cast as a whole.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                    \
( {                                                                      \
    v8i16 tmp0_m;                                                        \
                                                                         \
    tmp0_m = __msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0));            \
    tmp0_m = __msa_dpadd_s_h(tmp0_m, (v16i8) (vec1), (v16i8) (filt1));   \
                                                                         \
    tmp0_m;                                                              \
} )
116
/*
 * 4-tap horizontal filter on one shuffled vector pair.
 * mask0 and mask1 gather the bytes for the two tap pairs out of src0/src1;
 * FILT_4TAP_DPADD_S_H combines them with filt_h0/filt_h1. The sum is rounded
 * with a right shift by 7 and saturated (sat_s, 7).
 * Evaluates to the resulting v8i16.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \
( { \
    v16i8 shf0_m, shf1_m; \
    v8i16 filt_out_m; \
    \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, shf0_m, shf1_m); \
    filt_out_m = FILT_4TAP_DPADD_S_H(shf0_m, shf1_m, filt_h0, filt_h1); \
    filt_out_m = __msa_sat_s_h(__msa_srari_h(filt_out_m, 7), 7); \
    \
    filt_out_m; \
} )
130
/*
 * 4-tap horizontal filter for four 4-pixel-wide rows.
 * Rows are shuffled two at a time (src0 with src1, src2 with src3); out0
 * accumulates the sums for the first row pair and out1 those for the second.
 * Rounding and saturation are left to the caller.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, filt0, filt1, \
                                   out0, out1) \
{ \
    v16i8 tap01_a_m, tap01_b_m; \
    v16i8 tap23_a_m, tap23_b_m; \
    \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, tap01_a_m, tap01_b_m); \
    DOTP_SB2_SH(tap01_a_m, tap01_b_m, filt0, filt0, out0, out1); \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, tap23_a_m, tap23_b_m); \
    DPADD_SB2_SH(tap23_a_m, tap23_b_m, filt1, filt1, out0, out1); \
}
142
/*
 * 4-tap horizontal filter for four source vectors, one output vector each.
 * Every source vector is shuffled against itself; out0..out3 accumulate the
 * sums for src0..src3 respectively. Rounding and saturation are left to the
 * caller.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, filt0, filt1, \
                                   out0, out1, out2, out3) \
{ \
    v16i8 sel0_m, sel1_m, sel2_m, sel3_m; \
    \
    /* first tap pair: bytes chosen by mask0, weighted by filt0 */ \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, sel0_m, sel1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, sel2_m, sel3_m); \
    DOTP_SB4_SH(sel0_m, sel1_m, sel2_m, sel3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3); \
    /* second tap pair: mask1 / filt1 */ \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, sel0_m, sel1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, sel2_m, sel3_m); \
    DPADD_SB4_SH(sel0_m, sel1_m, sel2_m, sel3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3); \
}
158
159static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
160                                 uint8_t *dst, int32_t dst_stride,
161                                 const int8_t *filter)
162{
163    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
164    v16u8 mask0, mask1, mask2, out;
165    v8i16 filt, out0, out1;
166
167    mask0 = LD_UB(&mc_filt_mask_arr[16]);
168    src -= 2;
169
170    /* rearranging filter */
171    filt = LD_SH(filter);
172    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
173
174    mask1 = mask0 + 2;
175    mask2 = mask0 + 4;
176
177    LD_SB4(src, src_stride, src0, src1, src2, src3);
178    XORI_B4_128_SB(src0, src1, src2, src3);
179    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
180                               filt0, filt1, filt2, out0, out1);
181    SRARI_H2_SH(out0, out1, 7);
182    SAT_SH2_SH(out0, out1, 7);
183    out = PCKEV_XORI128_UB(out0, out1);
184    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
185}
186
187static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
188                                 uint8_t *dst, int32_t dst_stride,
189                                 const int8_t *filter)
190{
191    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
192    v16u8 mask0, mask1, mask2, out;
193    v8i16 filt, out0, out1, out2, out3;
194
195    mask0 = LD_UB(&mc_filt_mask_arr[16]);
196    src -= 2;
197
198    /* rearranging filter */
199    filt = LD_SH(filter);
200    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
201
202    mask1 = mask0 + 2;
203    mask2 = mask0 + 4;
204
205    LD_SB4(src, src_stride, src0, src1, src2, src3);
206    XORI_B4_128_SB(src0, src1, src2, src3);
207    src += (4 * src_stride);
208    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
209                               filt0, filt1, filt2, out0, out1);
210    LD_SB4(src, src_stride, src0, src1, src2, src3);
211    XORI_B4_128_SB(src0, src1, src2, src3);
212    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213                               filt0, filt1, filt2, out2, out3);
214    SRARI_H4_SH(out0, out1, out2, out3, 7);
215    SAT_SH4_SH(out0, out1, out2, out3, 7);
216    out = PCKEV_XORI128_UB(out0, out1);
217    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
218    out = PCKEV_XORI128_UB(out2, out3);
219    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
220}
221
222void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
223                             uint8_t *src, ptrdiff_t src_stride,
224                             int height, int mx, int my)
225{
226    const int8_t *filter = subpel_filters_msa[mx - 1];
227
228    if (4 == height) {
229        common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
230    } else if (8 == height) {
231        common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
232    }
233}
234
235void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
236                             uint8_t *src, ptrdiff_t src_stride,
237                             int height, int mx, int my)
238{
239    uint32_t loop_cnt;
240    const int8_t *filter = subpel_filters_msa[mx - 1];
241    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
242    v16u8 mask0, mask1, mask2, tmp0, tmp1;
243    v8i16 filt, out0, out1, out2, out3;
244
245    mask0 = LD_UB(&mc_filt_mask_arr[0]);
246
247    src -= 2;
248
249    /* rearranging filter */
250    filt = LD_SH(filter);
251    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
252
253    mask1 = mask0 + 2;
254    mask2 = mask0 + 4;
255
256    LD_SB4(src, src_stride, src0, src1, src2, src3);
257    XORI_B4_128_SB(src0, src1, src2, src3);
258    src += (4 * src_stride);
259    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
260                               filt0, filt1, filt2, out0, out1, out2, out3);
261    SRARI_H4_SH(out0, out1, out2, out3, 7);
262    SAT_SH4_SH(out0, out1, out2, out3, 7);
263    tmp0 = PCKEV_XORI128_UB(out0, out1);
264    tmp1 = PCKEV_XORI128_UB(out2, out3);
265    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266    dst += (4 * dst_stride);
267
268    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269        LD_SB4(src, src_stride, src0, src1, src2, src3);
270        XORI_B4_128_SB(src0, src1, src2, src3);
271        src += (4 * src_stride);
272        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
273                                   filt0, filt1, filt2, out0, out1, out2, out3);
274        SRARI_H4_SH(out0, out1, out2, out3, 7);
275        SAT_SH4_SH(out0, out1, out2, out3, 7);
276        tmp0 = PCKEV_XORI128_UB(out0, out1);
277        tmp1 = PCKEV_XORI128_UB(out2, out3);
278        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279        dst += (4 * dst_stride);
280    }
281}
282
283void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
284                              uint8_t *src, ptrdiff_t src_stride,
285                              int height, int mx, int my)
286{
287    uint32_t loop_cnt;
288    const int8_t *filter = subpel_filters_msa[mx - 1];
289    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290    v16u8 mask0, mask1, mask2, out;
291    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
292
293    mask0 = LD_UB(&mc_filt_mask_arr[0]);
294    src -= 2;
295
296    /* rearranging filter */
297    filt = LD_SH(filter);
298    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
299
300    mask1 = mask0 + 2;
301    mask2 = mask0 + 4;
302
303    for (loop_cnt = (height >> 2); loop_cnt--;) {
304        LD_SB4(src, src_stride, src0, src2, src4, src6);
305        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
306        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
307        src += (4 * src_stride);
308
309        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
310                                   filt0, filt1, filt2, out0, out1, out2, out3);
311        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
312                                   filt0, filt1, filt2, out4, out5, out6, out7);
313        SRARI_H4_SH(out0, out1, out2, out3, 7);
314        SRARI_H4_SH(out4, out5, out6, out7, 7);
315        SAT_SH4_SH(out0, out1, out2, out3, 7);
316        SAT_SH4_SH(out4, out5, out6, out7, 7);
317        out = PCKEV_XORI128_UB(out0, out1);
318        ST_UB(out, dst);
319        dst += dst_stride;
320        out = PCKEV_XORI128_UB(out2, out3);
321        ST_UB(out, dst);
322        dst += dst_stride;
323        out = PCKEV_XORI128_UB(out4, out5);
324        ST_UB(out, dst);
325        dst += dst_stride;
326        out = PCKEV_XORI128_UB(out6, out7);
327        ST_UB(out, dst);
328        dst += dst_stride;
329    }
330}
331
332void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
333                             uint8_t *src, ptrdiff_t src_stride,
334                             int height, int mx, int my)
335{
336    uint32_t loop_cnt;
337    const int8_t *filter = subpel_filters_msa[my - 1];
338    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
339    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
341    v16u8 out;
342    v8i16 filt, out10, out32;
343
344    src -= (2 * src_stride);
345
346    filt = LD_SH(filter);
347    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
348
349    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350    src += (5 * src_stride);
351
352    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
353               src32_r, src43_r);
354    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
355    XORI_B2_128_SB(src2110, src4332);
356
357    for (loop_cnt = (height >> 2); loop_cnt--;) {
358        LD_SB4(src, src_stride, src5, src6, src7, src8);
359        src += (4 * src_stride);
360
361        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362                   src65_r, src76_r, src87_r);
363        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
364        XORI_B2_128_SB(src6554, src8776);
365        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
367        SRARI_H2_SH(out10, out32, 7);
368        SAT_SH2_SH(out10, out32, 7);
369        out = PCKEV_XORI128_UB(out10, out32);
370        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371        dst += (4 * dst_stride);
372
373        src2110 = src6554;
374        src4332 = src8776;
375        src4 = src8;
376    }
377}
378
379void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
380                             uint8_t *src, ptrdiff_t src_stride,
381                             int height, int mx, int my)
382{
383    uint32_t loop_cnt;
384    const int8_t *filter = subpel_filters_msa[my - 1];
385    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
386    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387    v16i8 src109_r, filt0, filt1, filt2;
388    v16u8 tmp0, tmp1;
389    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
390
391    src -= (2 * src_stride);
392
393    filt = LD_SH(filter);
394    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
395
396    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397    src += (5 * src_stride);
398
399    XORI_B5_128_SB(src0, src1, src2, src3, src4);
400    ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401               src10_r, src32_r, src21_r, src43_r);
402
403    for (loop_cnt = (height >> 2); loop_cnt--;) {
404        LD_SB4(src, src_stride, src7, src8, src9, src10);
405        XORI_B4_128_SB(src7, src8, src9, src10);
406        src += (4 * src_stride);
407
408        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409                   src87_r, src98_r, src109_r);
410        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
414        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
415        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
416        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
417        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
418        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419        dst += (4 * dst_stride);
420
421        src10_r = src76_r;
422        src32_r = src98_r;
423        src21_r = src87_r;
424        src43_r = src109_r;
425        src4 = src10;
426    }
427}
428
429void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
430                              uint8_t *src, ptrdiff_t src_stride,
431                              int height, int mx, int my)
432{
433    uint32_t loop_cnt;
434    const int8_t *filter = subpel_filters_msa[my - 1];
435    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
436    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438    v16i8 src65_l, src87_l, filt0, filt1, filt2;
439    v16u8 tmp0, tmp1, tmp2, tmp3;
440    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
441
442    src -= (2 * src_stride);
443
444    filt = LD_SH(filter);
445    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
446
447    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448    src += (5 * src_stride);
449
450    XORI_B5_128_SB(src0, src1, src2, src3, src4);
451    ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452               src32_r, src43_r, src21_r);
453    ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454               src32_l, src43_l, src21_l);
455
456    for (loop_cnt = (height >> 2); loop_cnt--;) {
457        LD_SB4(src, src_stride, src5, src6, src7, src8);
458        src += (4 * src_stride);
459
460        XORI_B4_128_SB(src5, src6, src7, src8);
461        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462                   src65_r, src76_r, src87_r);
463        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464                   src65_l, src76_l, src87_l);
465        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
466                              filt2);
467        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
468                              filt2);
469        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
470                              filt2);
471        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
472                              filt2);
473        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
474                              filt2);
475        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
476                              filt2);
477        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
478                              filt2);
479        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
480                              filt2);
481        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
482        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
483        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486                    out3_r, tmp0, tmp1, tmp2, tmp3);
487        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
488        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489        dst += (4 * dst_stride);
490
491        src10_r = src54_r;
492        src32_r = src76_r;
493        src21_r = src65_r;
494        src43_r = src87_r;
495        src10_l = src54_l;
496        src32_l = src76_l;
497        src21_l = src65_l;
498        src43_l = src87_l;
499        src4 = src8;
500    }
501}
502
503void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
504                               uint8_t *src, ptrdiff_t src_stride,
505                               int height, int mx, int my)
506{
507    uint32_t loop_cnt;
508    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
509    const int8_t *filter_vert = subpel_filters_msa[my - 1];
510    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
511    v16i8 filt_hz0, filt_hz1, filt_hz2;
512    v16u8 mask0, mask1, mask2, out;
513    v8i16 tmp0, tmp1;
514    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515    v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
516
517    mask0 = LD_UB(&mc_filt_mask_arr[16]);
518    src -= (2 + 2 * src_stride);
519
520    /* rearranging filter */
521    filt = LD_SH(filter_horiz);
522    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
523
524    filt = LD_SH(filter_vert);
525    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
526
527    mask1 = mask0 + 2;
528    mask2 = mask0 + 4;
529
530    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531    src += (5 * src_stride);
532
533    XORI_B5_128_SB(src0, src1, src2, src3, src4);
534    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
535                              filt_hz1, filt_hz2);
536    hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
537                              filt_hz1, filt_hz2);
538    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
539    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
540                              filt_hz1, filt_hz2);
541    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
542
543    for (loop_cnt = (height >> 2); loop_cnt--;) {
544        LD_SB2(src, src_stride, src5, src6);
545        src += (2 * src_stride);
546
547        XORI_B2_128_SB(src5, src6);
548        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
549                                  filt_hz1, filt_hz2);
550        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
551
552        LD_SB2(src, src_stride, src7, src8);
553        src += (2 * src_stride);
554
555        XORI_B2_128_SB(src7, src8);
556        hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
557                                  filt_hz1, filt_hz2);
558        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
559
560        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
562
563        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
565
566        SRARI_H2_SH(tmp0, tmp1, 7);
567        SAT_SH2_SH(tmp0, tmp1, 7);
568        out = PCKEV_XORI128_UB(tmp0, tmp1);
569        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570        dst += (4 * dst_stride);
571
572        hz_out3 = hz_out7;
573        out0 = out2;
574        out1 = out3;
575    }
576}
577
578void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
579                               uint8_t *src, ptrdiff_t src_stride,
580                               int height, int mx, int my)
581{
582    uint32_t loop_cnt;
583    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
584    const int8_t *filter_vert = subpel_filters_msa[my - 1];
585    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
586    v16i8 filt_hz0, filt_hz1, filt_hz2;
587    v16u8 mask0, mask1, mask2, vec0, vec1;
588    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
589    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591    v8i16 tmp0, tmp1, tmp2, tmp3;
592
593    mask0 = LD_UB(&mc_filt_mask_arr[0]);
594    src -= (2 + 2 * src_stride);
595
596    /* rearranging filter */
597    filt = LD_SH(filter_horiz);
598    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
599
600    mask1 = mask0 + 2;
601    mask2 = mask0 + 4;
602
603    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
604    src += (5 * src_stride);
605
606    XORI_B5_128_SB(src0, src1, src2, src3, src4);
607    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
608                              filt_hz1, filt_hz2);
609    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
610                              filt_hz1, filt_hz2);
611    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
612                              filt_hz1, filt_hz2);
613    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
614                              filt_hz1, filt_hz2);
615    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
616                              filt_hz1, filt_hz2);
617
618    filt = LD_SH(filter_vert);
619    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
620
621    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
623
624    for (loop_cnt = (height >> 2); loop_cnt--;) {
625        LD_SB4(src, src_stride, src5, src6, src7, src8);
626        src += (4 * src_stride);
627
628        XORI_B4_128_SB(src5, src6, src7, src8);
629        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
630                                  filt_hz1, filt_hz2);
631        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
633
634        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
635                                  filt_hz1, filt_hz2);
636        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
638
639        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
640                                  filt_hz1, filt_hz2);
641        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
643
644        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
645                                  filt_hz1, filt_hz2);
646        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
648
649        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
650        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
651        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
652        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
653        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654        dst += (4 * dst_stride);
655
656        hz_out4 = hz_out8;
657        out0 = out2;
658        out1 = out7;
659        out3 = out5;
660        out4 = out6;
661    }
662}
663
664
/*
 * 16-pixel-wide prediction with a 6-tap filter in both directions,
 * carried out as two independent 8-pixel-wide passes: first the columns at
 * offset 0, then the columns at offset 8.
 */
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    ff_put_vp8_epel8_h6v6_msa(dst + 8, dst_stride, src + 8, src_stride, height,
                              mx, my);
}
679
680static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
681                                 uint8_t *dst, int32_t dst_stride,
682                                 const int8_t *filter)
683{
684    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
685    v8i16 filt, out0, out1;
686    v16u8 out;
687
688    mask0 = LD_SB(&mc_filt_mask_arr[16]);
689    src -= 1;
690
691    /* rearranging filter */
692    filt = LD_SH(filter);
693    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
694
695    mask1 = mask0 + 2;
696
697    LD_SB4(src, src_stride, src0, src1, src2, src3);
698    XORI_B4_128_SB(src0, src1, src2, src3);
699    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
700                               filt0, filt1, out0, out1);
701    SRARI_H2_SH(out0, out1, 7);
702    SAT_SH2_SH(out0, out1, 7);
703    out = PCKEV_XORI128_UB(out0, out1);
704    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
705}
706
707static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
708                                 uint8_t *dst, int32_t dst_stride,
709                                 const int8_t *filter)
710{
711    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
712    v16u8 out;
713    v8i16 filt, out0, out1, out2, out3;
714
715    mask0 = LD_SB(&mc_filt_mask_arr[16]);
716    src -= 1;
717
718    /* rearranging filter */
719    filt = LD_SH(filter);
720    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
721
722    mask1 = mask0 + 2;
723
724    LD_SB4(src, src_stride, src0, src1, src2, src3);
725    src += (4 * src_stride);
726
727    XORI_B4_128_SB(src0, src1, src2, src3);
728    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
729                               filt0, filt1, out0, out1);
730    LD_SB4(src, src_stride, src0, src1, src2, src3);
731    XORI_B4_128_SB(src0, src1, src2, src3);
732    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
733                               filt0, filt1, out2, out3);
734    SRARI_H4_SH(out0, out1, out2, out3, 7);
735    SAT_SH4_SH(out0, out1, out2, out3, 7);
736    out = PCKEV_XORI128_UB(out0, out1);
737    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
738    out = PCKEV_XORI128_UB(out2, out3);
739    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
740}
741
742static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
743                                  uint8_t *dst, int32_t dst_stride,
744                                  const int8_t *filter)
745{
746    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
747    v16i8 filt0, filt1, mask0, mask1;
748    v16u8 out;
749    v8i16 filt, out0, out1, out2, out3;
750
751    mask0 = LD_SB(&mc_filt_mask_arr[16]);
752    src -= 1;
753
754    /* rearranging filter */
755    filt = LD_SH(filter);
756    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
757
758    mask1 = mask0 + 2;
759
760    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761    src += (8 * src_stride);
762    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
763    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
764                               filt0, filt1, out0, out1);
765    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
766                               filt0, filt1, out2, out3);
767    SRARI_H4_SH(out0, out1, out2, out3, 7);
768    SAT_SH4_SH(out0, out1, out2, out3, 7);
769    out = PCKEV_XORI128_UB(out0, out1);
770    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771    dst += (4 * dst_stride);
772    out = PCKEV_XORI128_UB(out2, out3);
773    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774    dst += (4 * dst_stride);
775
776    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777    src += (8 * src_stride);
778    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
779    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
780                               filt0, filt1, out0, out1);
781    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
782                               filt0, filt1, out2, out3);
783    SRARI_H4_SH(out0, out1, out2, out3, 7);
784    SAT_SH4_SH(out0, out1, out2, out3, 7);
785    out = PCKEV_XORI128_UB(out0, out1);
786    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787    dst += (4 * dst_stride);
788    out = PCKEV_XORI128_UB(out2, out3);
789    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
790}
791
792void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
793                             uint8_t *src, ptrdiff_t src_stride,
794                             int height, int mx, int my)
795{
796    const int8_t *filter = subpel_filters_msa[mx - 1];
797
798    if (4 == height) {
799        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
800    } else if (8 == height) {
801        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
802    } else if (16 == height) {
803        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
804    }
805}
806
807void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
808                             uint8_t *src, ptrdiff_t src_stride,
809                             int height, int mx, int my)
810{
811    uint32_t loop_cnt;
812    const int8_t *filter = subpel_filters_msa[mx - 1];
813    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
814    v16u8 tmp0, tmp1;
815    v8i16 filt, out0, out1, out2, out3;
816
817    mask0 = LD_SB(&mc_filt_mask_arr[0]);
818    src -= 1;
819
820    /* rearranging filter */
821    filt = LD_SH(filter);
822    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
823
824    mask1 = mask0 + 2;
825
826    for (loop_cnt = (height >> 2); loop_cnt--;) {
827        LD_SB4(src, src_stride, src0, src1, src2, src3);
828        src += (4 * src_stride);
829
830        XORI_B4_128_SB(src0, src1, src2, src3);
831        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
832                                   filt1, out0, out1, out2, out3);
833        SRARI_H4_SH(out0, out1, out2, out3, 7);
834        SAT_SH4_SH(out0, out1, out2, out3, 7);
835        tmp0 = PCKEV_XORI128_UB(out0, out1);
836        tmp1 = PCKEV_XORI128_UB(out2, out3);
837        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838        dst += (4 * dst_stride);
839    }
840}
841
842void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
843                              uint8_t *src, ptrdiff_t src_stride,
844                              int height, int mx, int my)
845{
846    uint32_t loop_cnt;
847    const int8_t *filter = subpel_filters_msa[mx - 1];
848    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
849    v16i8 filt0, filt1, mask0, mask1;
850    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
851    v16u8 out;
852
853    mask0 = LD_SB(&mc_filt_mask_arr[0]);
854    src -= 1;
855
856    /* rearranging filter */
857    filt = LD_SH(filter);
858    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
859
860    mask1 = mask0 + 2;
861
862    for (loop_cnt = (height >> 2); loop_cnt--;) {
863        LD_SB4(src, src_stride, src0, src2, src4, src6);
864        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865        src += (4 * src_stride);
866
867        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
868        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
869                                   filt1, out0, out1, out2, out3);
870        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
871                                   filt1, out4, out5, out6, out7);
872        SRARI_H4_SH(out0, out1, out2, out3, 7);
873        SRARI_H4_SH(out4, out5, out6, out7, 7);
874        SAT_SH4_SH(out0, out1, out2, out3, 7);
875        SAT_SH4_SH(out4, out5, out6, out7, 7);
876        out = PCKEV_XORI128_UB(out0, out1);
877        ST_UB(out, dst);
878        dst += dst_stride;
879        out = PCKEV_XORI128_UB(out2, out3);
880        ST_UB(out, dst);
881        dst += dst_stride;
882        out = PCKEV_XORI128_UB(out4, out5);
883        ST_UB(out, dst);
884        dst += dst_stride;
885        out = PCKEV_XORI128_UB(out6, out7);
886        ST_UB(out, dst);
887        dst += dst_stride;
888    }
889}
890
/*
 * Vertical-only 4-tap "put" for a 4-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts one row above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made, so height % 4 trailing rows
 *                 are not written.
 * mx:             not used.
 * my:             selects the filter row subpel_filters_msa[my - 1].
 */
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;
    v16u8 out;

    /* begin reading one row above the first output row */
    src -= src_stride;

    /* splat 16-bit lanes 0 and 1 of the tap row (presumably taps 0..3) */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* prime the window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    /* pair up adjacent rows: (src0,src1) and (src1,src2) */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* put both row pairs into one vector, then XOR every byte with 128 */
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* three new rows; src2 still holds the last row of the window */
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);

        /*
         * Fourth new row goes into src2 so that it is the carried row for
         * the next pass.  src2110 is rebuilt here from the newest pairs and
         * therefore is already the "older" operand of the next pass.
         */
        src2 = LD_SB(src);
        src += (src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
        /* round by 7 bits (taps sum to 128), saturate, pack, store 4 rows */
        SRARI_H2_SH(out10, out32, 7);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
937
/*
 * Vertical-only 4-tap "put" for an 8-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts one row above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             not used.
 * my:             selects the filter row subpel_filters_msa[my - 1].
 */
void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src7, src8, src9, src10;
    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    /* begin reading one row above the first output row */
    src -= src_stride;

    /* splat 16-bit lanes 0 and 1 of the tap row (presumably taps 0..3) */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* prime the window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* pair up adjacent rows: (src0,src1) and (src1,src2) */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* four new rows per pass */
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        /* new adjacent-row pairs; the first one links to carried src2 */
        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
                   src72_r, src87_r, src98_r, src109_r);
        /* each output row combines an older pair with a newer pair */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
        /* round by 7 bits (taps sum to 128), saturate, pack, store 4 rows */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the window: the two newest pairs and the newest row carry */
        src10_r = src98_r;
        src21_r = src109_r;
        src2 = src10;
    }
}
983
/*
 * Vertical-only 4-tap "put" for a 16-pixel-wide block.
 *
 * Same sliding-window scheme as the 8-wide variant, but every row pair is
 * kept twice: the *_r vectors come from ILVR_* and the *_l vectors from
 * ILVL_* (presumably the right and left halves of the 16-byte row).
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts one row above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             not used.
 * my:             selects the filter row subpel_filters_msa[my - 1].
 */
void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                              uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    /* begin reading one row above the first output row */
    src -= src_stride;

    /* splat 16-bit lanes 0 and 1 of the tap row (presumably taps 0..3) */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* prime the window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* pair up adjacent rows (src0,src1) and (src1,src2), for both halves */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* four new rows per pass */
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        /* new adjacent-row pairs; the first one links to carried src2 */
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        /* each output row combines an older pair with a newer pair */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        /* round by 7 bits (taps sum to 128) and saturate */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        /* merge the _l and _r results of each row into one 16-byte vector */
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the window: the two newest pairs and the newest row carry */
        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src2 = src6;
    }
}
1042
/*
 * Horizontal 4-tap followed by vertical 4-tap "put" for a 4-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts one pixel left of
 *                 and one row above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             selects the horizontal filter row subpel_filters_msa[mx - 1].
 * my:             selects the vertical filter row subpel_filters_msa[my - 1].
 *
 * Every HORIZ_4TAP_FILT call here is given two different source rows together
 * with the "4 width cases" mask, so each hz_out* vector presumably carries
 * the horizontally filtered data of two rows.
 */
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    v16u8 mask0, mask1, out;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back one pixel and one row */
    src -= (1 + 1 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    mask1 = mask0 + 2;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);

    /* vertical taps: 16-bit lanes 0 and 1 of the vertical tap row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B2_128_SB(src3, src4);
        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
        /*
         * hz_out2 is not filtered again: it is assembled from the carried
         * hz_out1 and the fresh hz_out3 with an 8-byte slide.
         */
        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        XORI_B2_128_SB(src5, src6);
        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
        /* same stitching, one step further down */
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* round by 7 bits (taps sum to 128), saturate, pack, store 4 rows */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and row pair to the next pass */
        hz_out1 = hz_out5;
        vec0 = vec2;
    }
}
1101
/*
 * Horizontal 4-tap followed by vertical 4-tap "put" for an 8-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts one pixel left of
 *                 and one row above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             selects the horizontal filter row subpel_filters_msa[mx - 1].
 * my:             selects the vertical filter row subpel_filters_msa[my - 1].
 *
 * Here HORIZ_4TAP_FILT is always called with the same row as both inputs and
 * the "8 width cases" mask, i.e. one hz_out* vector per source row.
 */
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    v16u8 mask0, mask1, out0, out1;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 vec0, vec1, vec2, vec3, vec4;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back one pixel and one row */
    src -= (1 + 1 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    mask1 = mask0 + 2;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    /* vec0 pairs hz_out0 with hz_out1, vec2 pairs hz_out1 with hz_out2 */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

    /* vertical taps: 16-bit lanes 0 and 1 of the vertical tap row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        /* output row 0 of this pass: carried vec0 + (hz_out2, hz_out3) */
        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* output row 1: carried vec2 + (hz_out3, new hz_out0) */
        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        /* output row 2: vec1 + (hz_out0, new hz_out1) */
        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);

        /*
         * Output row 3.  vec0 and vec1 are overwritten here; hz_out2 keeps
         * the newest horizontal result for the next pass.
         */
        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* round by 7 bits (taps sum to 128), saturate, pack, store 4 rows */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the two row pairs the first two rows of the next pass need */
        vec0 = vec4;
        vec2 = vec1;
    }
}
1168
/*
 * 16-pixel-wide h4v4 "put".
 *
 * The block is handled as two adjacent 8-pixel-wide columns; each column is
 * delegated to ff_put_vp8_epel8_h4v4_msa() with the same strides, height,
 * mx and my.
 */
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t col;

    /* columns 0..7 first, then columns 8..15 */
    for (col = 0; col < 16; col += 8) {
        ff_put_vp8_epel8_h4v4_msa(dst + col, dst_stride, src + col,
                                  src_stride, height, mx, my);
    }
}
1183
/*
 * Horizontal 6-tap followed by vertical 4-tap "put" for a 4-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts two pixels left of
 *                 and one row above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             selects the horizontal filter row subpel_filters_msa[mx - 1]
 *                 (three 16-bit lanes are splatted, presumably six taps).
 * my:             selects the vertical filter row subpel_filters_msa[my - 1]
 *                 (two 16-bit lanes are splatted).
 *
 * Every HORIZ_6TAP_FILT call here is given two different source rows together
 * with the "4 width cases" mask, so each hz_out* vector presumably carries
 * the horizontally filtered data of two rows.
 */
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 filt_hz0, filt_hz1, filt_hz2;
    v16u8 res0, res1, mask0, mask1, mask2;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back two pixels and one row */
    src -= (2 + 1 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

    /* one shuffle mask per splatted tap lane, each shifted by 2 more */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);

    /* vertical taps: 16-bit lanes 0 and 1 of the vertical tap row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        /*
         * hz_out2 is not filtered again: it is assembled from the carried
         * hz_out1 and the fresh hz_out3 with an 8-byte slide.
         */
        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        /* same stitching, one step further down */
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* round by 7 bits (taps sum to 128), saturate, pack */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
        XORI_B2_128_UB(res0, res1);
        /* two rows from res0, then two rows from res1 */
        ST_W2(res0, 0, 1, dst, dst_stride);
        ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and row pair to the next pass */
        hz_out1 = hz_out5;
        vec0 = vec2;
    }
}
1249
/*
 * Horizontal 6-tap followed by vertical 4-tap "put" for an 8-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts two pixels left of
 *                 and one row above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             selects the horizontal filter row subpel_filters_msa[mx - 1]
 *                 (three 16-bit lanes are splatted, presumably six taps).
 * my:             selects the vertical filter row subpel_filters_msa[my - 1]
 *                 (two 16-bit lanes are splatted).
 *
 * HORIZ_6TAP_FILT is always called with the same row as both inputs and the
 * "8 width cases" mask, i.e. one hz_out* vector per source row.
 *
 * There are no explicit carry assignments at the end of the loop: vec0 and
 * vec2 are overwritten inside the pass with the row pairs the next pass
 * starts from, and hz_out2 holds the newest horizontal result.
 */
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
    v16u8 out0, out1;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    /* step back two pixels and one row */
    src -= (2 + src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

    /* one shuffle mask per splatted tap lane, each shifted by 2 more */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    /* vec0 pairs hz_out0 with hz_out1, vec2 pairs hz_out1 with hz_out2 */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

    /* vertical taps: 16-bit lanes 0 and 1 of the vertical tap row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);

        /* output row 0 of this pass: carried vec0 + (hz_out2, hz_out3) */
        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* output row 1: carried vec2 + (hz_out3, new hz_out0) */
        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        /*
         * Output row 2.  vec0 is overwritten with (hz_out0, hz_out1); this
         * value is also what the next pass reads as its carried vec0.
         */
        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);

        /*
         * Output row 3.  vec2 becomes (hz_out1, hz_out2) and stays live as
         * the carried vec2 of the next pass.
         */
        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* round by 7 bits (taps sum to 128), saturate, pack, store 4 rows */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
1322
/*
 * 16-pixel-wide h6v4 "put".
 *
 * The block is handled as two adjacent 8-pixel-wide columns; each column is
 * delegated to ff_put_vp8_epel8_h6v4_msa() with the same strides, height,
 * mx and my.
 */
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t col;

    /* columns 0..7 first, then columns 8..15 */
    for (col = 0; col < 16; col += 8) {
        ff_put_vp8_epel8_h6v4_msa(dst + col, dst_stride, src + col,
                                  src_stride, height, mx, my);
    }
}
1337
/*
 * Horizontal 4-tap followed by vertical 6-tap "put" for a 4-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts one pixel left of
 *                 and two rows above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             selects the horizontal filter row subpel_filters_msa[mx - 1]
 *                 (two 16-bit lanes are splatted).
 * my:             selects the vertical filter row subpel_filters_msa[my - 1]
 *                 (three 16-bit lanes are splatted, presumably six taps).
 *
 * Every HORIZ_4TAP_FILT call here is given two different source rows together
 * with the "4 width cases" mask, so each hz_out* vector presumably carries
 * the horizontally filtered data of two rows.
 */
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 filt_hz0, filt_hz1, mask0, mask1;
    v16u8 out;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);

    /* step back one pixel and two rows */
    src -= (1 + 2 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    mask1 = mask0 + 2;

    /* prime the vertical window with the first five rows */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
    /* hz_out1 is stitched from hz_out0 and hz_out2 instead of re-filtered */
    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);

    /* vertical taps: 16-bit lanes 0, 1 and 2 of the vertical tap row */
    filt = LD_SH(filter_vert);
    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        XORI_B4_128_SB(src5, src6, src7, src8);
        src += (4 * src_stride);

        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
        /* stitched from the carried hz_out3 and the fresh hz_out5 */
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        /* three row-pair vectors feed the 6-tap vertical sum */
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

        /* round by 7 bits (taps sum to 128), saturate, pack, store 4 rows */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the window: newest horizontal result and two newest pairs */
        hz_out3 = hz_out7;
        out0 = out2;
        out1 = out3;
    }
}
1401
/*
 * Horizontal 4-tap followed by vertical 6-tap "put" for an 8-pixel-wide block.
 *
 * dst/dst_stride: destination and its row pitch.
 * src/src_stride: source and its row pitch; reading starts one pixel left of
 *                 and two rows above src.
 * height:         rows to produce.  Four rows are stored per pass and
 *                 height >> 2 passes are made.
 * mx:             selects the horizontal filter row subpel_filters_msa[mx - 1]
 *                 (two 16-bit lanes are splatted).
 * my:             selects the vertical filter row subpel_filters_msa[my - 1]
 *                 (three 16-bit lanes are splatted, presumably six taps).
 *
 * HORIZ_4TAP_FILT is always called with the same row as both inputs and the
 * "8 width cases" mask, i.e. one hz_out* vector per source row.  Two
 * interleaved chains of row pairs are maintained: out0/out1/out2/out6 and
 * out3/out4/out5/out7, offset from each other by one row.
 */
void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 filt_hz0, filt_hz1, mask0, mask1;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 vec0, vec1;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    /* step back one pixel and two rows */
    src -= (1 + 2 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    mask1 = mask0 + 2;

    /* prime the vertical window with the first five rows */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
    /* chain A: (hz_out0,hz_out1), (hz_out2,hz_out3) */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    /* chain B: (hz_out1,hz_out2), (hz_out3,hz_out4) */
    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

    /* vertical taps: 16-bit lanes 0, 1 and 2 of the vertical tap row */
    filt = LD_SH(filter_vert);
    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        src += (4 * src_stride);

        XORI_B4_128_SB(src5, src6, src7, src8);

        /* output row 0 of this pass (chain A) */
        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        /* output row 1 (chain B) */
        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        /* output row 2 (chain A, shifted by one pair) */
        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);

        /* output row 3 (chain B, shifted by one pair) */
        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);

        /* round by 7 bits (taps sum to 128), saturate, pack, store 4 rows */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide both chains down by four rows for the next pass */
        hz_out4 = hz_out8;
        out0 = out2;
        out1 = out6;
        out3 = out5;
        out4 = out7;
    }
}
1476
/*
 * 16-pixel-wide h4v6 subpel prediction.
 *
 * The block is handled as two side-by-side 8-pixel-wide columns, each one
 * delegated to ff_put_vp8_epel8_h4v6_msa() with identical height/mx/my.
 */
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 16; col += 8) {
        ff_put_vp8_epel8_h4v6_msa(dst + col, dst_stride, src + col, src_stride,
                                  height, mx, my);
    }
}
1491
1492static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1493                                 uint8_t *dst, int32_t dst_stride,
1494                                 const int8_t *filter)
1495{
1496    v16i8 src0, src1, src2, src3, mask;
1497    v16u8 filt0, vec0, vec1, res0, res1;
1498    v8u16 vec2, vec3, filt;
1499
1500    mask = LD_SB(&mc_filt_mask_arr[16]);
1501
1502    /* rearranging filter */
1503    filt = LD_UH(filter);
1504    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1505
1506    LD_SB4(src, src_stride, src0, src1, src2, src3);
1507    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1509    SRARI_H2_UH(vec2, vec3, 7);
1510    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1511    ST_W2(res0, 0, 1, dst, dst_stride);
1512    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1513}
1514
1515static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1516                                 uint8_t *dst, int32_t dst_stride,
1517                                 const int8_t *filter)
1518{
1519    v16u8 vec0, vec1, vec2, vec3, filt0;
1520    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1521    v16i8 res0, res1, res2, res3;
1522    v8u16 vec4, vec5, vec6, vec7, filt;
1523
1524    mask = LD_SB(&mc_filt_mask_arr[16]);
1525
1526    /* rearranging filter */
1527    filt = LD_UH(filter);
1528    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1529
1530    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534                vec4, vec5, vec6, vec7);
1535    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1536    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537                res0, res1, res2, res3);
1538    ST_W2(res0, 0, 1, dst, dst_stride);
1539    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1542}
1543
1544void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1545                                uint8_t *src, ptrdiff_t src_stride,
1546                                int height, int mx, int my)
1547{
1548    const int8_t *filter = bilinear_filters_msa[mx - 1];
1549
1550    if (4 == height) {
1551        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1552    } else if (8 == height) {
1553        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1554    }
1555}
1556
1557static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1558                                 uint8_t *dst, int32_t dst_stride,
1559                                 const int8_t *filter)
1560{
1561    v16u8 filt0;
1562    v16i8 src0, src1, src2, src3, mask;
1563    v8u16 vec0, vec1, vec2, vec3, filt;
1564
1565    mask = LD_SB(&mc_filt_mask_arr[0]);
1566
1567    /* rearranging filter */
1568    filt = LD_UH(filter);
1569    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1570
1571    LD_SB4(src, src_stride, src0, src1, src2, src3);
1572    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1574    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575                vec0, vec1, vec2, vec3);
1576    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1577    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1578    ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1579}
1580
1581static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1582                                     uint8_t *dst, int32_t dst_stride,
1583                                     const int8_t *filter, int32_t height)
1584{
1585    v16u8 filt0;
1586    v16i8 src0, src1, src2, src3, mask, out0, out1;
1587    v8u16 vec0, vec1, vec2, vec3, filt;
1588
1589    mask = LD_SB(&mc_filt_mask_arr[0]);
1590
1591    /* rearranging filter */
1592    filt = LD_UH(filter);
1593    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1594
1595    LD_SB4(src, src_stride, src0, src1, src2, src3);
1596    src += (4 * src_stride);
1597
1598    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601                vec0, vec1, vec2, vec3);
1602    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1603
1604    LD_SB4(src, src_stride, src0, src1, src2, src3);
1605    src += (4 * src_stride);
1606
1607    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1608    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1609
1610    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613                vec0, vec1, vec2, vec3);
1614    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1615    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1616    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617    dst += (8 * dst_stride);
1618
1619    if (16 == height) {
1620        LD_SB4(src, src_stride, src0, src1, src2, src3);
1621        src += (4 * src_stride);
1622
1623        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626                    vec0, vec1, vec2, vec3);
1627        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1628        LD_SB4(src, src_stride, src0, src1, src2, src3);
1629        src += (4 * src_stride);
1630
1631        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1632        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1633
1634        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637                    vec0, vec1, vec2, vec3);
1638        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1639        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1640        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1641    }
1642}
1643
1644void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1645                                uint8_t *src, ptrdiff_t src_stride,
1646                                int height, int mx, int my)
1647{
1648    const int8_t *filter = bilinear_filters_msa[mx - 1];
1649
1650    if (4 == height) {
1651        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1652    } else {
1653        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1654                                 height);
1655    }
1656}
1657
1658void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1659                                 uint8_t *src, ptrdiff_t src_stride,
1660                                 int height, int mx, int my)
1661{
1662    uint32_t loop_cnt;
1663    const int8_t *filter = bilinear_filters_msa[mx - 1];
1664    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1665    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1667
1668    mask = LD_SB(&mc_filt_mask_arr[0]);
1669
1670    loop_cnt = (height >> 2) - 1;
1671
1672    /* rearranging filter */
1673    filt = LD_UH(filter);
1674    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1675
1676    LD_SB4(src, src_stride, src0, src2, src4, src6);
1677    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678    src += (4 * src_stride);
1679
1680    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685                out0, out1, out2, out3);
1686    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687                out4, out5, out6, out7);
1688    SRARI_H4_UH(out0, out1, out2, out3, 7);
1689    SRARI_H4_UH(out4, out5, out6, out7, 7);
1690    PCKEV_ST_SB(out0, out1, dst);
1691    dst += dst_stride;
1692    PCKEV_ST_SB(out2, out3, dst);
1693    dst += dst_stride;
1694    PCKEV_ST_SB(out4, out5, dst);
1695    dst += dst_stride;
1696    PCKEV_ST_SB(out6, out7, dst);
1697    dst += dst_stride;
1698
1699    for (; loop_cnt--;) {
1700        LD_SB4(src, src_stride, src0, src2, src4, src6);
1701        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702        src += (4 * src_stride);
1703
1704        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709                    out0, out1, out2, out3);
1710        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711                    out4, out5, out6, out7);
1712        SRARI_H4_UH(out0, out1, out2, out3, 7);
1713        SRARI_H4_UH(out4, out5, out6, out7, 7);
1714        PCKEV_ST_SB(out0, out1, dst);
1715        dst += dst_stride;
1716        PCKEV_ST_SB(out2, out3, dst);
1717        dst += dst_stride;
1718        PCKEV_ST_SB(out4, out5, dst);
1719        dst += dst_stride;
1720        PCKEV_ST_SB(out6, out7, dst);
1721        dst += dst_stride;
1722    }
1723}
1724
1725static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1726                                 uint8_t *dst, int32_t dst_stride,
1727                                 const int8_t *filter)
1728{
1729    v16i8 src0, src1, src2, src3, src4;
1730    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1731    v16u8 filt0;
1732    v8i16 filt;
1733    v8u16 tmp0, tmp1;
1734
1735    filt = LD_SH(filter);
1736    filt0 = (v16u8) __msa_splati_h(filt, 0);
1737
1738    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739    src += (5 * src_stride);
1740
1741    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742               src10_r, src21_r, src32_r, src43_r);
1743    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1745    SRARI_H2_UH(tmp0, tmp1, 7);
1746    SAT_UH2_UH(tmp0, tmp1, 7);
1747    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748    ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1749}
1750
1751static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1752                                 uint8_t *dst, int32_t dst_stride,
1753                                 const int8_t *filter)
1754{
1755    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1756    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758    v8u16 tmp0, tmp1, tmp2, tmp3;
1759    v16u8 filt0;
1760    v8i16 filt;
1761
1762    filt = LD_SH(filter);
1763    filt0 = (v16u8) __msa_splati_h(filt, 0);
1764
1765    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766    src += (8 * src_stride);
1767
1768    src8 = LD_SB(src);
1769    src += src_stride;
1770
1771    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1772               src32_r, src43_r);
1773    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1774               src76_r, src87_r);
1775    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776               src87_r, src76_r, src2110, src4332, src6554, src8776);
1777    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778                tmp0, tmp1, tmp2, tmp3);
1779    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1780    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1781    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1783}
1784
1785void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1786                                uint8_t *src, ptrdiff_t src_stride,
1787                                int height, int mx, int my)
1788{
1789    const int8_t *filter = bilinear_filters_msa[my - 1];
1790
1791    if (4 == height) {
1792        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1793    } else if (8 == height) {
1794        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1795    }
1796}
1797
1798static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1799                                 uint8_t *dst, int32_t dst_stride,
1800                                 const int8_t *filter)
1801{
1802    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1803    v16i8 out0, out1;
1804    v8u16 tmp0, tmp1, tmp2, tmp3;
1805    v8i16 filt;
1806
1807    /* rearranging filter_y */
1808    filt = LD_SH(filter);
1809    filt0 = (v16u8) __msa_splati_h(filt, 0);
1810
1811    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815                tmp0, tmp1, tmp2, tmp3);
1816    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1817    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1818    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1819    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1820}
1821
1822static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1823                                     uint8_t *dst, int32_t dst_stride,
1824                                     const int8_t *filter, int32_t height)
1825{
1826    uint32_t loop_cnt;
1827    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1828    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1829    v16i8 out0, out1;
1830    v8u16 tmp0, tmp1, tmp2, tmp3;
1831    v8i16 filt;
1832
1833    /* rearranging filter_y */
1834    filt = LD_SH(filter);
1835    filt0 = (v16u8) __msa_splati_h(filt, 0);
1836
1837    src0 = LD_UB(src);
1838    src += src_stride;
1839
1840    for (loop_cnt = (height >> 3); loop_cnt--;) {
1841        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842        src += (8 * src_stride);
1843
1844        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845                   vec0, vec1, vec2, vec3);
1846        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847                   vec4, vec5, vec6, vec7);
1848        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849                    tmp0, tmp1, tmp2, tmp3);
1850        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1851        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1852        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1853        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1854
1855        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856                    tmp0, tmp1, tmp2, tmp3);
1857        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1858        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1859        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1860        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861        dst += (8 * dst_stride);
1862
1863        src0 = src8;
1864    }
1865}
1866
1867void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1868                                uint8_t *src, ptrdiff_t src_stride,
1869                                int height, int mx, int my)
1870{
1871    const int8_t *filter = bilinear_filters_msa[my - 1];
1872
1873    if (4 == height) {
1874        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1875    } else {
1876        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1877                                 height);
1878    }
1879}
1880
1881void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1882                                 uint8_t *src, ptrdiff_t src_stride,
1883                                 int height, int mx, int my)
1884{
1885    uint32_t loop_cnt;
1886    const int8_t *filter = bilinear_filters_msa[my - 1];
1887    v16u8 src0, src1, src2, src3, src4;
1888    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889    v8u16 tmp0, tmp1, tmp2, tmp3;
1890    v8i16 filt;
1891
1892    /* rearranging filter_y */
1893    filt = LD_SH(filter);
1894    filt0 = (v16u8) __msa_splati_h(filt, 0);
1895
1896    src0 = LD_UB(src);
1897    src += src_stride;
1898
1899    for (loop_cnt = (height >> 2); loop_cnt--;) {
1900        LD_UB4(src, src_stride, src1, src2, src3, src4);
1901        src += (4 * src_stride);
1902
1903        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1906        SRARI_H2_UH(tmp0, tmp1, 7);
1907        SAT_UH2_UH(tmp0, tmp1, 7);
1908        PCKEV_ST_SB(tmp0, tmp1, dst);
1909        dst += dst_stride;
1910
1911        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1914        SRARI_H2_UH(tmp2, tmp3, 7);
1915        SAT_UH2_UH(tmp2, tmp3, 7);
1916        PCKEV_ST_SB(tmp2, tmp3, dst);
1917        dst += dst_stride;
1918
1919        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1920        SRARI_H2_UH(tmp0, tmp1, 7);
1921        SAT_UH2_UH(tmp0, tmp1, 7);
1922        PCKEV_ST_SB(tmp0, tmp1, dst);
1923        dst += dst_stride;
1924
1925        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1926        SRARI_H2_UH(tmp2, tmp3, 7);
1927        SAT_UH2_UH(tmp2, tmp3, 7);
1928        PCKEV_ST_SB(tmp2, tmp3, dst);
1929        dst += dst_stride;
1930
1931        src0 = src4;
1932    }
1933}
1934
1935static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
1936                                      uint8_t *dst, int32_t dst_stride,
1937                                      const int8_t *filter_horiz,
1938                                      const int8_t *filter_vert)
1939{
1940    v16i8 src0, src1, src2, src3, src4, mask;
1941    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
1943
1944    mask = LD_SB(&mc_filt_mask_arr[16]);
1945
1946    /* rearranging filter */
1947    filt = LD_UH(filter_horiz);
1948    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1949
1950    filt = LD_UH(filter_vert);
1951    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1952
1953    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1954    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1955    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1956    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
1957    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1959
1960    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1962    SRARI_H2_UH(tmp0, tmp1, 7);
1963    SAT_UH2_UH(tmp0, tmp1, 7);
1964    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1965    ST_W2(res0, 0, 1, dst, dst_stride);
1966    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1967}
1968
1969static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
1970                                      uint8_t *dst, int32_t dst_stride,
1971                                      const int8_t *filter_horiz,
1972                                      const int8_t *filter_vert)
1973{
1974    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
1975    v16i8 res0, res1, res2, res3;
1976    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
1979
1980    mask = LD_SB(&mc_filt_mask_arr[16]);
1981
1982    /* rearranging filter */
1983    filt = LD_UH(filter_horiz);
1984    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1985
1986    filt = LD_UH(filter_vert);
1987    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1988
1989    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990    src += (8 * src_stride);
1991    src8 = LD_SB(src);
1992
1993    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1994    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1995    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
1996    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
1997    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
1998    SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
1999               hz_out3, hz_out5);
2000    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2001
2002    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005                vec4, vec5, vec6, vec7);
2006    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2007    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2008    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009                res0, res1, res2, res3);
2010    ST_W2(res0, 0, 1, dst, dst_stride);
2011    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2014}
2015
2016void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2017                                 uint8_t *src, ptrdiff_t src_stride,
2018                                 int height, int mx, int my)
2019{
2020    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2021    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2022
2023    if (4 == height) {
2024        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2025                                  filter_horiz, filter_vert);
2026    } else if (8 == height) {
2027        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2028                                  filter_horiz, filter_vert);
2029    }
2030}
2031
2032static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
2033                                      uint8_t *dst, int32_t dst_stride,
2034                                      const int8_t *filter_horiz,
2035                                      const int8_t *filter_vert)
2036{
2037    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2038    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2040    v8i16 filt;
2041
2042    mask = LD_SB(&mc_filt_mask_arr[0]);
2043
2044    /* rearranging filter */
2045    filt = LD_SH(filter_horiz);
2046    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2047
2048    filt = LD_SH(filter_vert);
2049    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2050
2051    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2052
2053    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2054    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2055    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2057
2058    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2059    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2061
2062    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2063    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2065
2066    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2067    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2069
2070    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2071    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2072    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2073    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2074}
2075
2076static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
2077                                          uint8_t *dst, int32_t dst_stride,
2078                                          const int8_t *filter_horiz,
2079                                          const int8_t *filter_vert,
2080                                          int32_t height)
2081{
2082    uint32_t loop_cnt;
2083    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2084    v16u8 filt_hz, filt_vt, vec0;
2085    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2086    v8i16 filt;
2087
2088    mask = LD_SB(&mc_filt_mask_arr[0]);
2089
2090    /* rearranging filter */
2091    filt = LD_SH(filter_horiz);
2092    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2093
2094    filt = LD_SH(filter_vert);
2095    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2096
2097    src0 = LD_SB(src);
2098    src += src_stride;
2099
2100    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2101
2102    for (loop_cnt = (height >> 3); loop_cnt--;) {
2103        LD_SB4(src, src_stride, src1, src2, src3, src4);
2104        src += (4 * src_stride);
2105
2106        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2107        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2109
2110        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2111        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2113
2114        SRARI_H2_UH(tmp1, tmp2, 7);
2115        SAT_UH2_UH(tmp1, tmp2, 7);
2116
2117        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2118        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2120
2121        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2122        LD_SB4(src, src_stride, src1, src2, src3, src4);
2123        src += (4 * src_stride);
2124        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2126
2127        SRARI_H2_UH(tmp3, tmp4, 7);
2128        SAT_UH2_UH(tmp3, tmp4, 7);
2129        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2130        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2131
2132        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2133        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2135
2136        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2137        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2139
2140        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2141        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2143
2144        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2145        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2147
2148        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2149        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2150        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2151        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152        dst += (8 * dst_stride);
2153    }
2154}
2155
2156void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2157                                 uint8_t *src, ptrdiff_t src_stride,
2158                                 int height, int mx, int my)
2159{
2160    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2161    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2162
2163    if (4 == height) {
2164        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2165                                  filter_horiz, filter_vert);
2166    } else {
2167        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2168                                      filter_horiz, filter_vert, height);
2169    }
2170}
2171
2172void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2173                                  uint8_t *src, ptrdiff_t src_stride,
2174                                  int height, int mx, int my)
2175{
2176    uint32_t loop_cnt;
2177    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2178    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2179    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2180    v16u8 filt_hz, filt_vt, vec0, vec1;
2181    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2182    v8i16 filt;
2183
2184    mask = LD_SB(&mc_filt_mask_arr[0]);
2185
2186    /* rearranging filter */
2187    filt = LD_SH(filter_horiz);
2188    filt_hz = (v16u8) __msa_splati_h(filt, 0);
2189
2190    filt = LD_SH(filter_vert);
2191    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2192
2193    LD_SB2(src, 8, src0, src1);
2194    src += src_stride;
2195
2196    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2197    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2198
2199
2200    for (loop_cnt = (height >> 2); loop_cnt--;) {
2201        LD_SB4(src, src_stride, src0, src2, src4, src6);
2202        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203        src += (4 * src_stride);
2204
2205        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2206        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2207        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2209        SRARI_H2_UH(tmp1, tmp2, 7);
2210        SAT_UH2_UH(tmp1, tmp2, 7);
2211        PCKEV_ST_SB(tmp1, tmp2, dst);
2212        dst += dst_stride;
2213
2214        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2215        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2216        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2218        SRARI_H2_UH(tmp1, tmp2, 7);
2219        SAT_UH2_UH(tmp1, tmp2, 7);
2220        PCKEV_ST_SB(tmp1, tmp2, dst);
2221        dst += dst_stride;
2222
2223        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2224        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2225        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2227        SRARI_H2_UH(tmp1, tmp2, 7);
2228        SAT_UH2_UH(tmp1, tmp2, 7);
2229        PCKEV_ST_SB(tmp1, tmp2, dst);
2230        dst += dst_stride;
2231
2232        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2233        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2234        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2236        SRARI_H2_UH(tmp1, tmp2, 7);
2237        SAT_UH2_UH(tmp1, tmp2, 7);
2238        PCKEV_ST_SB(tmp1, tmp2, dst);
2239        dst += dst_stride;
2240    }
2241}
2242
2243void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
2244                            uint8_t *src, ptrdiff_t src_stride,
2245                            int height, int mx, int my)
2246{
2247    int32_t cnt;
2248    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2250
2251    if (0 == height % 8) {
2252        for (cnt = height >> 3; cnt--;) {
2253            LD_UB8(src, src_stride,
2254                   src0, src1, src2, src3, src4, src5, src6, src7);
2255            src += (8 * src_stride);
2256
2257            out0 = __msa_copy_u_d((v2i64) src0, 0);
2258            out1 = __msa_copy_u_d((v2i64) src1, 0);
2259            out2 = __msa_copy_u_d((v2i64) src2, 0);
2260            out3 = __msa_copy_u_d((v2i64) src3, 0);
2261            out4 = __msa_copy_u_d((v2i64) src4, 0);
2262            out5 = __msa_copy_u_d((v2i64) src5, 0);
2263            out6 = __msa_copy_u_d((v2i64) src6, 0);
2264            out7 = __msa_copy_u_d((v2i64) src7, 0);
2265
2266            SD4(out0, out1, out2, out3, dst, dst_stride);
2267            dst += (4 * dst_stride);
2268            SD4(out4, out5, out6, out7, dst, dst_stride);
2269            dst += (4 * dst_stride);
2270        }
2271    } else if (0 == height % 4) {
2272        for (cnt = (height / 4); cnt--;) {
2273            LD_UB4(src, src_stride, src0, src1, src2, src3);
2274            src += (4 * src_stride);
2275            out0 = __msa_copy_u_d((v2i64) src0, 0);
2276            out1 = __msa_copy_u_d((v2i64) src1, 0);
2277            out2 = __msa_copy_u_d((v2i64) src2, 0);
2278            out3 = __msa_copy_u_d((v2i64) src3, 0);
2279
2280            SD4(out0, out1, out2, out3, dst, dst_stride);
2281            dst += (4 * dst_stride);
2282        }
2283    }
2284}
2285
2286static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
2287                                  uint8_t *dst, int32_t dst_stride,
2288                                  int32_t height, int32_t width)
2289{
2290    int32_t cnt, loop_cnt;
2291    uint8_t *src_tmp, *dst_tmp;
2292    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2293
2294    for (cnt = (width >> 4); cnt--;) {
2295        src_tmp = src;
2296        dst_tmp = dst;
2297
2298        for (loop_cnt = (height >> 3); loop_cnt--;) {
2299            LD_UB8(src_tmp, src_stride,
2300                   src0, src1, src2, src3, src4, src5, src6, src7);
2301            src_tmp += (8 * src_stride);
2302
2303            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304                   dst_tmp, dst_stride);
2305            dst_tmp += (8 * dst_stride);
2306        }
2307
2308        src += 16;
2309        dst += 16;
2310    }
2311}
2312
2313void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
2314                            uint8_t *src, ptrdiff_t src_stride,
2315                            int height, int mx, int my)
2316{
2317    int32_t cnt;
2318    v16u8 src0, src1, src2, src3;
2319
2320    if (0 == height % 8) {
2321        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2322    } else if (0 == height % 4) {
2323        for (cnt = (height >> 2); cnt--;) {
2324            LD_UB4(src, src_stride, src0, src1, src2, src3);
2325            src += (4 * src_stride);
2326
2327            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328            dst += (4 * dst_stride);
2329        }
2330    }
2331}
2332