/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"

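/* Shuffle masks for the horizontal 6-tap luma filter (1, -5, 20, 20, -5, 1).
 * Each mask pairs bytes that share a coefficient: row 0 pairs the outermost
 * taps (offsets 0/5), row 1 the -5 taps (1/4), row 2 the +20 taps (2/3), so
 * one byte dot product per mask accumulates a symmetric tap pair. The first
 * three rows serve the 8-wide cases (both shuffle operands from one vector);
 * the last three serve the 4-wide cases, where indices 16..31 select from
 * the second source vector. */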
static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,

    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
};

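/* Six-tap filter over six pixel vectors: the outermost (1, 1) taps via a
 * widening add of the interleaved vec5/vec0 pair, then dot-product
 * accumulation of the -5 taps on vec4/vec1 and the +20 taps on vec3/vec2,
 * yielding two halfword accumulators. */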
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
                                        out1, out2)                          \
{                                                                            \
    v16i8 tmp0_m, tmp1_m;                                                    \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
                                                                             \
    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \
    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);                                 \
    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \
    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);          \
}

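/* One horizontal filter row: the three shuffle masks gather the tap pairs,
 * accumulating (a + f) - 5 * (b + e) + 20 * (c + d) as signed halfwords;
 * rounding and clamping are left to the caller. */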
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \
( {                                                        \
    v8i16 out0_m;                                          \
    v16i8 tmp0_m;                                          \
    v16i8 minus5b = __msa_ldi_b(-5);                       \
    v16i8 plus20b = __msa_ldi_b(20);                       \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0);        \
    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);               \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0);        \
    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);     \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0);        \
    out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m);     \
                                                           \
    out0_m;                                                \
} )

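/* Vertical filter step: three byte dot products over interleaved row pairs
 * against the packed (1, -5), (20, 20), (-5, 1) coefficient vectors. */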
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
( {                                                                 \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
                                                                    \
    out0_m;                                                         \
} )

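/* Word-precision variant: the (x + 512) >> 10 rounding matches samples that
 * have already been through one 6-tap pass, i.e. the paths that filter both
 * axes. None of the functions in this part of the file use it. */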
#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)       \
( {                                                                 \
    v4i32 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0);           \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1);  \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2);  \
    out0_m = __msa_srari_w(out0_m, 10);                             \
    out0_m = __msa_sat_s_w(out0_m, 7);                              \
    out0_m;                                                         \
} )

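/* 4x4 "hv quarter" case: the horizontal half-pel result (from src_x) and the
 * vertical half-pel result (from src_y) are each rounded with (x + 16) >> 5
 * and saturated, then combined with a rounding average, (h + v + 1) >> 1, to
 * form the diagonal quarter-pel sample. filt_const0/1/2 pack the coefficient
 * byte pairs (1, -5), (20, 20) and (-5, 1) into halfwords for the dot
 * products; the XORI-by-128 steps move unsigned pixels into signed range so
 * signed byte multiplies can be used (PCKEV_XORI128_UB flips back while
 * packing). */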
static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
                                    uint8_t *dst, int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);

    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);

    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    SRARI_H2_SH(vt_out0, vt_out1, 5);
    SAT_SH2_SH(vt_out0, vt_out1, 7);

    out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    out1 = __msa_srari_h((hz_out1 + vt_out1), 1);

    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

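/* 8x8 variant of the above: rows are filtered in two batches of four, with
 * the interleaved vertical history (src_vt54_r..src_vt87_r) carried into the
 * second batch. */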
static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
                                    uint8_t *dst, int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out0, out1;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    src_x += (4 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
    src_y += (4 * stride);
    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);

    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
                             filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
               src_vt1211_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
                             filt1, filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

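/* 16x16 variant: two 8-pixel-wide columns, each filtered as four 4-row
 * batches while the five-row vertical window slides (src_vt0..4 take over
 * src_vt4..8 after every batch). */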
static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
                                      const uint8_t *src_y, uint8_t *dst,
                                      int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    const uint8_t *src_x_tmp = src_x;
    const uint8_t *src_y_tmp = src_y;
    uint8_t *dst_tmp = dst;
    uint32_t multiple8_cnt, loop_cnt;
    v16u8 tmp0, tmp1;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt7, src_vt8;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, out0, out1, out2, out3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src_x = src_x_tmp;
        src_y = src_y_tmp;
        dst = dst_tmp;

        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
        src_y += (5 * stride);

        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
            src_x += (4 * stride);

            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
            src_y += (4 * stride);

            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
                       src_vt43_r);
            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
                       src_vt87_r);
            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
                                     filt1, filt2);
            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
                                     filt1, filt2);
            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
                                     filt1, filt2);
            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
                                     filt1, filt2);
            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);

            SAT_SH4_SH(out0, out1, out2, out3, 7);
            tmp0 = PCKEV_XORI128_UB(out0, out1);
            tmp1 = PCKEV_XORI128_UB(out2, out3);
            ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            src_vt0 = src_vt4;
            src_vt1 = src_vt5;
            src_vt2 = src_vt6;
            src_vt3 = src_vt7;
            src_vt4 = src_vt8;
        }

        src_x_tmp += 8;
        src_y_tmp += 8;
        dst_tmp += 8;
    }
}

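/* The _and_aver_dst_ variants below repeat the hv quarter-pel computation
 * and then average the result with the bytes already in dst, implementing
 * the "avg" motion-compensation flavour. */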
static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
                                                 const uint8_t *src_y,
                                                 uint8_t *dst,
                                                 int32_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 res, dst0 = { 0 };
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);

    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);

    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    SRARI_H2_SH(vt_out0, vt_out1, 5);
    SAT_SH2_SH(vt_out0, vt_out1, 7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    res0 = __msa_srari_h((hz_out0 + vt_out0), 1);

    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    dst0 = __msa_aver_u_b(res, dst0);

    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
                                                 const uint8_t *src_y,
                                                 uint8_t *dst,
                                                 int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
    v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    src_x += (4 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
    src_y += (4 * stride);
    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);

    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
                             filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
               src_vt1211_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
                             filt1, filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}

static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
                                                   const uint8_t *src_y,
                                                   uint8_t *dst,
                                                   int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    const uint8_t *src_x_tmp = src_x;
    const uint8_t *src_y_tmp = src_y;
    uint8_t *dst_tmp = dst;
    uint32_t multiple8_cnt, loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt7, src_vt8;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, out0, out1, out2, out3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src_x = src_x_tmp;
        src_y = src_y_tmp;
        dst = dst_tmp;

        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
        src_y += (5 * stride);

        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
            src_x += (4 * stride);

            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
            src_y += (4 * stride);

            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
                       src_vt43_r);
            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
                       src_vt87_r);
            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
                                     filt1, filt2);
            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
                                     filt1, filt2);
            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
                                     filt1, filt2);
            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
                                     filt1, filt2);
            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);

            LD4(dst, stride, tp0, tp1, tp2, tp3);
            INSERT_D2_UB(tp0, tp1, dst0);
            INSERT_D2_UB(tp2, tp3, dst1);

            SAT_SH4_SH(out0, out1, out2, out3, 7);
            tmp0 = PCKEV_XORI128_UB(out0, out1);
            tmp1 = PCKEV_XORI128_UB(out2, out3);
            AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
            ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            src_vt0 = src_vt4;
            src_vt1 = src_vt5;
            src_vt2 = src_vt6;
            src_vt3 = src_vt7;
            src_vt4 = src_vt8;
        }

        src_x_tmp += 8;
        src_y_tmp += 8;
        dst_tmp += 8;
    }
}

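/* Full-pel positions. The mcXY suffix encodes the fractional (x, y) motion
 * offset in quarter-pel units, so mc00 is a straight block copy. */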
void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
    dst += (8 * stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
}

void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    LD4(src, stride, src0, src1, src2, src3);
    src += 4 * stride;
    LD4(src, stride, src4, src5, src6, src7);
    SD4(src0, src1, src2, src3, dst, stride);
    dst += 4 * stride;
    SD4(src4, src5, src6, src7, dst, stride);
}

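/* avg mc00: per-byte rounding average, (src + dst + 1) >> 1, of the copied
 * block with the existing destination. */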
void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    dst += (8 * stride);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
}

void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };

    LD4(src, stride, tp0, tp1, tp2, tp3);
    src += 4 * stride;
    LD4(src, stride, tp4, tp5, tp6, tp7);
    INSERT_D2_UB(tp0, tp1, src0);
    INSERT_D2_UB(tp2, tp3, src1);
    INSERT_D2_UB(tp4, tp5, src2);
    INSERT_D2_UB(tp6, tp7, src3);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    INSERT_D2_UB(tp4, tp5, dst2);
    INSERT_D2_UB(tp6, tp7, dst3);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);

    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, dst0 = { 0 };

    LW4(src, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    dst0 = __msa_aver_u_b(src0, dst0);

    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

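/* mc10/mc30: horizontal quarter-pel. The half-pel filter result is averaged
 * with the nearest full-pel pixel, which SLDI slides into place by 2 (mc10)
 * or 3 (mc30) bytes relative to the src - 2 filter window. */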
void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

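/* mc20: pure horizontal half-pel; the rounded filter output is stored
 * without any averaging step. */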
1051void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
1052                                 ptrdiff_t stride)
1053{
1054    uint32_t loop_cnt;
1055    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1056    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1057    v16i8 vec11;
1058    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1059    v16i8 minus5b = __msa_ldi_b(-5);
1060    v16i8 plus20b = __msa_ldi_b(20);
1061
1062    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1063    src -= 2;
1064
1065    for (loop_cnt = 4; loop_cnt--;) {
1066        LD_SB2(src, 8, src0, src1);
1067        src += stride;
1068        LD_SB2(src, 8, src2, src3);
1069        src += stride;
1070        LD_SB2(src, 8, src4, src5);
1071        src += stride;
1072        LD_SB2(src, 8, src6, src7);
1073        src += stride;
1074
1075        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1076        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1077        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1078        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1079        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1080        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1081        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1082        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1083        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1084                     minus5b, res0, res1, res2, res3);
1085        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1086                     plus20b, res0, res1, res2, res3);
1087        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1088        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1089        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1090        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1091        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1092        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1093        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1094        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1095                     minus5b, res4, res5, res6, res7);
1096        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1097                     plus20b, res4, res5, res6, res7);
1098        SRARI_H4_SH(res0, res1, res2, res3, 5);
1099        SRARI_H4_SH(res4, res5, res6, res7, 5);
1100        SAT_SH4_SH(res0, res1, res2, res3, 7);
1101        SAT_SH4_SH(res4, res5, res6, res7, 7);
1102        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
1103                    vec2, vec3);
1104        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
1105        ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
1106        dst += (4 * stride);
1107    }
1108}
1109
1110void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
1111                                ptrdiff_t stride)
1112{
1113    v16u8 out0, out1, out2, out3;
1114    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1115    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1116    v16i8 vec11;
1117    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1118    v16i8 minus5b = __msa_ldi_b(-5);
1119    v16i8 plus20b = __msa_ldi_b(20);
1120
1121    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1122    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1123    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1124    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1125    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1126    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1127    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1128    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1129    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1130                 res0, res1, res2, res3);
1131    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1132    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1133    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1134                 plus20b, res0, res1, res2, res3);
1135    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1136    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1137    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1138    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1139    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1140    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1141                 res4, res5, res6, res7);
1142    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1143    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1144    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1145                 plus20b, res4, res5, res6, res7);
1146    SRARI_H4_SH(res0, res1, res2, res3, 5);
1147    SRARI_H4_SH(res4, res5, res6, res7, 5);
1148    SAT_SH4_SH(res0, res1, res2, res3, 7);
1149    SAT_SH4_SH(res4, res5, res6, res7, 7);
1150    out0 = PCKEV_XORI128_UB(res0, res1);
1151    out1 = PCKEV_XORI128_UB(res2, res3);
1152    out2 = PCKEV_XORI128_UB(res4, res5);
1153    out3 = PCKEV_XORI128_UB(res6, res7);
1154    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1155}
1156
void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    out = PCKEV_XORI128_UB(res0, res1);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

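/* mc01, 16x16: vertical quarter sample above the half-sample point.  The
 * filt_const values pack adjacent kernel taps pairwise as signed bytes
 * (0xfb01 = {1, -5}, 0x1414 = {20, 20}, 0x1fb = {-5, 1}), so the column
 * filter is three dot products over byte-interleaved row pairs; the
 * half-sample result is then rounding-averaged with the nearest integer
 * rows (src2..src5), still in the signed domain. */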
void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src2 = src6;
        src3 = src7;
        src4 = src8;
    }
}

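/* mc03, 16x16: as mc01 above, but the rounding average is taken with the
 * integer rows below the half-sample point (src3..src6). */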
void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src3 = src7;
        src4 = src8;
    }
}

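/* mc01, 8x8: fully unrolled; all 13 source rows are loaded up front, and
 * the integer rows used for the final average are packed two per register
 * with PCKEV_D2 to match the packed filter output. */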
void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
    PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

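/* mc03, 8x8: as mc01 above, averaging one row lower (rows 3..10 instead
 * of rows 2..9). */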
void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
    PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

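/* mc01/mc03, 4x4: interleaved row pairs are folded into doublewords
 * (src2110 etc.) so each dot product covers two output rows; the
 * integer-position pixels for the average are gathered with insve.
 * mc01 (here) averages with rows 2..5, mc03 (below) with rows 3..6. */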
void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

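/* The four diagonal quarter-sample cases (mc11, mc31, mc13, mc33) are the
 * rounded average of a horizontal and a vertical half-sample
 * interpolation; the two source pointers passed to the helpers are
 * pre-offset by row (+stride) and column (+1) as each case requires. */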
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride);
}

void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst,
                              stride);
}

void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst,
                              stride);
}

void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2) + 1, dst, stride);
}

void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst,
                            stride);
}

void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride);
}

void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst,
                            stride);
}

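/* mc21, 16x16: horizontal 6-tap filtering first (AVC_HORZ_FILTER_SH,
 * 16-bit intermediates), then the same kernel vertically with 32-bit
 * accumulation, the taps now packed pairwise as halfwords (0xfffb0001 =
 * {1, -5}, 0x140014 = {20, 20}, 0x1fffb = {-5, 1}).  The centre result is
 * rounding-averaged with the horizontal half-sample rows hz_out2..hz_out5
 * after (x + 16) >> 5; the 16-wide block is handled as two independent
 * 8-wide columns. */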
void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        dst = dst_tmp;
        src = src_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);
            src += (4 * stride);

            XORI_B4_128_SB(src5, src6, src7, src8);

            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out2, 5);
            dst3 = __msa_srari_h(hz_out3, 5);
            dst5 = __msa_srari_h(hz_out4, 5);
            dst7 = __msa_srari_h(hz_out5, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

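/* mc23, 16x16: as mc21 above, but averaged with the horizontal
 * half-sample rows one line lower (hz_out3..hz_out6). */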
void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        dst = dst_tmp;
        src = src_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);
            src += (4 * stride);

            XORI_B4_128_SB(src5, src6, src7, src8);

            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out3, 5);
            dst3 = __msa_srari_h(hz_out4, 5);
            dst5 = __msa_srari_h(hz_out5, 5);
            dst7 = __msa_srari_h(hz_out6, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

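/* mc21/mc23, 8x8: unrolled over the 13 input rows, producing two batches
 * of four output rows; the two cases differ only in which horizontal
 * half-sample rows enter the final average. */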
void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out3);
    dst2 = __msa_aver_s_h(dst2, hz_out4);
    dst3 = __msa_aver_s_h(dst3, hz_out5);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out6);
    dst1 = __msa_aver_s_h(dst1, hz_out7);
    dst2 = __msa_aver_s_h(dst2, hz_out8);
    dst3 = __msa_aver_s_h(dst3, hz_out9);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out3);
    dst1 = __msa_aver_s_h(dst1, hz_out4);
    dst2 = __msa_aver_s_h(dst2, hz_out5);
    dst3 = __msa_aver_s_h(dst3, hz_out6);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out7);
    dst1 = __msa_aver_s_h(dst1, hz_out8);
    dst2 = __msa_aver_s_h(dst2, hz_out9);
    dst3 = __msa_aver_s_h(dst3, hz_out10);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

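/* mc21/mc23, 4x4: the two-row mask set filters a pair of source rows per
 * vector, and PCKOD_D2 splits the packed results back into per-row
 * intermediates before the vertical pass. */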
void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H2_SH(hz_out2, hz_out4, 5);
    SAT_SH2_SH(hz_out2, hz_out4, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out4);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out0);
    dst1 = __msa_aver_s_h(dst1, hz_out1);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

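/* mc02, 16x16: vertical half-sample position; a plain 6-tap column filter
 * with the byte-packed taps and no averaging step. */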
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }
}

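/* mc02, 8x8: fully unrolled, with all 13 rows loaded before filtering.
 * Note that several srcNN_r register names do not match the row pairs
 * they actually hold here (e.g. src76_r is the rows 4/5 interleave). */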
void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
    v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
    v16i8 filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_SB5(src, stride, src8, src9, src10, src11, src12);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
               src98_r, src109_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
               src910_r, src1110_r, src1211_r);
    XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
    XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
    XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

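/* put, 4x4 luma, quarter-sample position (0, 2): vertical half-sample
 * filter; pairs of 4-pixel rows are interleaved into one vector so the
 * MSA lanes stay full. */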
void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
               src76_r, src2110, src4332, src6554, src8776);
    XORI_B4_128_SB(src2110, src4332, src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

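/* put, 16x16 luma, quarter-sample position (1, 2): each output pixel is
 * the rounded average of the centre half-sample value (vertical then
 * horizontal 6-tap, (+512) >> 10) and the vertical half-sample value of
 * the same column ((+16) >> 5). */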
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src7, src8, src9, src10, src11);

    for (row = 16; row--;) {
        LD_SB2(src, 8, src5, src6);
        src += stride;
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_UB(out, dst);
        dst += stride;

        src0 = src1;
        src1 = src2;
        src2 = src3;
        src3 = src4;
        src4 = src5;
        src7 = src8;
        src8 = src9;
        src9 = src10;
        src10 = src11;
        src11 = src6;
    }
}

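/* put, 16x16 luma, quarter-sample position (3, 2): as mc12, but paired
 * with the vertical half-sample value of the next column (odd pack
 * instead of even). */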
void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src7, src8, src9, src10, src11);

    for (row = 16; row--;) {
        LD_SB2(src, 8, src5, src6);
        src += stride;
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        dst0 = __msa_pckod_h(dst2, dst0);
        dst1 = __msa_pckod_h(dst3, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_UB(out, dst);
        dst += stride;

        src0 = src1;
        src1 = src2;
        src2 = src3;
        src3 = src4;
        src4 = src5;
        src7 = src8;
        src8 = src9;
        src9 = src10;
        src10 = src11;
        src11 = src6;
    }
}

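/* put, 8x8 luma, quarter-sample position (1, 2): 8x8 variant of the
 * mc12 case above, two output rows per loop iteration. */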
void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
    v8i16 mask3, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = 4; row--;) {
        LD_SB2(src, stride, src5, src6);
        src += (2 * stride);
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

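/* put, 8x8 luma, quarter-sample position (3, 2): 8x8 variant of the
 * mc32 case, two output rows per loop iteration. */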
void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
    v8i16 mask3, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = 4; row--;) {
        LD_SB2(src, stride, src5, src6);
        src += (2 * stride);
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        dst0 = __msa_pckod_h(dst2, dst0);
        dst1 = __msa_pckod_h(dst3, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

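/* put, 4x4 luma, quarter-sample position (1, 2): 4x4 variant of the
 * mc12 case; the whole block is filtered in one pass and the final
 * average is done on 32-bit lanes. */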
void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);
    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);
    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

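/* put, 4x4 luma, quarter-sample position (3, 2): 4x4 variant of the
 * mc32 case; the odd interleave selects the next-column vertical
 * half-sample value before the 32-bit average. */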
void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);

    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);

    dst0 = __msa_ilvod_h(zeros, dst0);
    dst1 = __msa_ilvod_h(zeros, dst1);
    dst2 = __msa_ilvod_h(zeros, dst2);
    dst3 = __msa_ilvod_h(zeros, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

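/* put, 16x16 luma, half-sample position (2, 2): separable 2D filter.
 * The horizontal 6-tap pass produces 16-bit intermediates, the vertical
 * 6-tap pass accumulates in 32 bits, and the result is rounded with
 * (+512) >> 10; the block is processed as two 8-wide halves. */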
void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint8_t *dst_tmp = dst;
    uint32_t multiple8_cnt, loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src = src_tmp;
        dst = dst_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src0, src1, src2, src3);
            XORI_B4_128_SB(src0, src1, src2, src3);
            src += (4 * stride);

            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

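/* put, 8x8 luma, half-sample position (2, 2): 8x8 variant of the mc22
 * case; the 13 required input rows are filtered horizontally once and
 * the intermediates are reused by the vertical pass. */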
void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    src -= ((2 * stride) + 2);
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * stride);
    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

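/* put, 4x4 luma, half-sample position (2, 2): 4x4 variant of the mc22
 * case; two input rows are packed per horizontal filter call using the
 * 4-width masks at luma_mask_arr[48]. */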
void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    res = PCKEV_XORI128_UB(dst0, dst1);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

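/* avg, 16x16 luma, quarter-sample position (1, 0): the horizontal
 * half-sample result is averaged with the nearest integer sample
 * (2-byte SLDI alignment) and then with the existing dst pixels. */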
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);
        out2 = __msa_aver_s_b(out2, src4);
        out3 = __msa_aver_s_b(out3, src6);
        XORI_B4_128_SB(out0, out1, out2, out3);
        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

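/* avg, 16x16 luma, quarter-sample position (3, 0): same as mc10 but
 * averaged with the integer sample to the right (3-byte SLDI
 * alignment). */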
void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);
        out2 = __msa_aver_s_b(out2, src4);
        out3 = __msa_aver_s_b(out3, src6);
        XORI_B4_128_SB(out0, out1, out2, out3);
        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

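/* avg, 8x8 luma, quarter-sample position (1, 0): 8x8 variant of the
 * mc10 case; dst rows are gathered with LD4/INSERT before the final
 * average. */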
void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

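/* avg, 8x8 luma, quarter-sample position (3, 0): 8x8 variant of the
 * mc30 case (3-byte SLDI alignment). */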
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

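/* avg, 4x4 luma, quarter-sample position (1, 0): 4x4 variant of the
 * mc10 case. */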
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 };
    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 out0, out1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    SRARI_H2_SH(out0, out1, 5);
    SAT_SH2_SH(out0, out1, 7);
    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

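/* avg, 4x4 luma, quarter-sample position (3, 0): 4x4 variant of the
 * mc30 case. */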
3455void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
3456                                ptrdiff_t stride)
3457{
3458    uint32_t tp0, tp1, tp2, tp3;
3459    v16u8 dst0 = { 0 };
3460    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3461    v16i8 mask0, mask1, mask2;
3462    v8i16 out0, out1;
3463    v16i8 minus5b = __msa_ldi_b(-5);
3464    v16i8 plus20b = __msa_ldi_b(20);
3465
3466    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3467    LD_SB4(src - 2, stride, src0, src1, src2, src3);
3468    XORI_B4_128_SB(src0, src1, src2, src3);
3469    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3470    HADD_SB2_SH(vec0, vec1, out0, out1);
3471    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3472    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3473    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3474    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
3475    SRARI_H2_SH(out0, out1, 5);
3476    SAT_SH2_SH(out0, out1, 7);
3477    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
3478    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
3479               src0, src1, src2, src3);
3480    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3481    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3482    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
3483    res = __msa_aver_s_b(res, src0);
3484    res = (v16i8) __msa_xori_b((v16u8) res, 128);
3485    LW4(dst, stride, tp0, tp1, tp2, tp3);
3486    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3487    dst0 = __msa_aver_u_b((v16u8) res, dst0);
3488    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3489}
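
/* mc10/mc30 are the horizontal quarter-sample positions: the half-sample
 * result is averaged with the integer sample to its left (SLDI offset 2) or
 * right (SLDI offset 3) before the average with dst. A scalar sketch under
 * those assumptions (illustrative names, not compiled): */
#if 0
static uint8_t h264_qpel_a_c_ref(const uint8_t *p, int right)
{
    int sum = p[-2] - 5 * p[-1] + 20 * p[0] + 20 * p[1] - 5 * p[2] + p[3];
    uint8_t h;

    sum = (sum + 16) >> 5;
    h = sum < 0 ? 0 : sum > 255 ? 255 : sum;
    return (uint8_t) ((h + p[right ? 1 : 0] + 1) >> 1); /* mc30: right px */
}
#endif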

void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        src += stride;
        LD_SB2(src, 8, src2, src3);
        src += stride;
        LD_SB2(src, 8, src4, src5);
        src += stride;
        LD_SB2(src, 8, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
                    vec2, vec3);
        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
        AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
    v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    out4 = PCKEV_XORI128_UB(res4, res5);
    out5 = PCKEV_XORI128_UB(res6, res7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out2);
    INSERT_D2_UB(tp2, tp3, out3);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out6);
    INSERT_D2_UB(tp2, tp3, out7);
    AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
    AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
    ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}
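
/* The three mc20 variants above are the pure horizontal half-sample case.
 * The recurring XORI_B*_128 / PCKEV_XORI128_UB pattern is the usual signed
 * trick: flipping bit 7 re-centres unsigned pixels on zero so the signed
 * byte dot-product and average instructions can be used, and a second flip
 * converts back. Scalar equivalent (illustrative, not compiled): */
#if 0
static int8_t  px_to_signed(uint8_t px) { return (int8_t) (px ^ 0x80); }
static uint8_t px_to_unsigned(int8_t v) { return (uint8_t) (v ^ 0x80); }
#endif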

void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src2 = src6;
        src3 = src7;
        src4 = src8;
    }
}
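
/* The vertical paths build row pairs with ILVR/ILVL interleaves and then
 * take byte-pair dot products, so the 6 taps are packed two at a time into
 * each 16-bit filter constant: 0xfb01 -> (+1, -5), 0x1414 -> (+20, +20),
 * 0x01fb -> (-5, +1). Each output lane therefore accumulates the same
 * (1, -5, 20, 20, -5, 1) kernel, applied vertically across six rows.
 * A scalar sketch of one column sum (illustrative, not compiled): */
#if 0
static int h264_vpel_sum_ref(const uint8_t *p, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    int i, sum = 0;

    for (i = 0; i < 6; i++)
        sum += taps[i] * p[(i - 2) * (int) stride];
    return sum;  /* rounded and clipped as in the earlier scalar sketch */
}
#endif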

void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src3 = src7;
        src4 = src8;
    }
}

void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1);
    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
               src21_r, src32_r, src43_r);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);

    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
                dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1);
    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
               src21_r, src32_r, src43_r);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);

    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
                dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = PCKEV_XORI128_UB(out10, out32);
    res = __msa_aver_u_b(res, (v16u8) src32_r);
    dst0 = __msa_aver_u_b(res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    res = __msa_aver_u_b(res, (v16u8) src32_r);
    dst0 = __msa_aver_u_b(res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
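
/* mc01/mc03 mirror mc10/mc30 vertically: the vertical half-sample is
 * averaged with the nearest integer row (the row above for mc01, the row
 * below for mc03), which is why the unfiltered source rows are repacked
 * with insve/pckev before the __msa_aver_s_b / __msa_aver_u_b step. */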

void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
                                           src - (stride * 2),
                                           dst, stride);
}

void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
                                           src - (stride * 2) +
                                           sizeof(uint8_t),
                                           dst, stride);
}

void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
                                           src - (stride * 2),
                                           dst, stride);
}

void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
                                           src - (stride * 2) +
                                           sizeof(uint8_t),
                                           dst, stride);
}

void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
                                         src - (stride * 2),
                                         dst, stride);
}

void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), dst, stride);
}

void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
                                         src - (stride * 2),
                                         dst, stride);
}

void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), dst, stride);
}

void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
                                         src - (stride * 2),
                                         dst, stride);
}

void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), dst, stride);
}

void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
                                         src - (stride * 2),
                                         dst, stride);
}

void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), dst, stride);
}
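
/* The mc11/mc31/mc13/mc33 wrappers cover the four diagonal quarter-sample
 * positions, each the average of one horizontal and one vertical half-sample
 * interpolation; the src offsets select which neighbouring half-sample pair
 * feeds avc_luma_hv_qrt_and_aver_dst_*. Roughly, per pixel (illustrative,
 * not compiled):
 *
 *     e = (b + h + 1) >> 1;   // mc11: top-left
 *     g = (b + m + 1) >> 1;   // mc31: top-right
 *     p = (h + s + 1) >> 1;   // mc13: bottom-left
 *     r = (m + s + 1) >> 1;   // mc33: bottom-right
 *
 * where b/s are the horizontal half-samples of the rows above/below and h/m
 * the vertical half-samples of the columns left/right, following the H.264
 * spec's fractional-sample grid. */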

void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0_w, tmp1_w;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        dst = dst_tmp;
        src = src_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB2(src, stride, src5, src6);
            src += (2 * stride);

            XORI_B2_128_SB(src5, src6);
            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
                       hz_out65_r);
            ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
                       hz_out65_l);
            tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                    filt1, filt2);
            tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
            tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                    filt1, filt2);
            tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

            tmp1 = __msa_srari_h(hz_out2, 5);
            tmp3 = __msa_srari_h(hz_out3, 5);
            SAT_SH2_SH(tmp1, tmp3, 7);

            tmp0 = __msa_aver_s_h(tmp0, tmp1);
            tmp1 = __msa_aver_s_h(tmp2, tmp3);

            LD2(dst, stride, tp0, tp1);
            INSERT_D2_UB(tp0, tp1, dst0);

            out0 = PCKEV_XORI128_UB(tmp0, tmp1);
            dst0 = __msa_aver_u_b(out0, dst0);
            ST_D2(dst0, 0, 1, dst, stride);
            dst += (2 * stride);

            LD_SB2(src, stride, src7, src8);
            src += (2 * stride);

            XORI_B2_128_SB(src7, src8);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
            ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
                       hz_out87_r);
            ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
                       hz_out87_l);
            tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                    filt1, filt2);
            tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
            tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                    filt1, filt2);
            tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

            tmp5 = __msa_srari_h(hz_out4, 5);
            tmp7 = __msa_srari_h(hz_out5, 5);
            SAT_SH2_SH(tmp5, tmp7, 7);

            tmp2 = __msa_aver_s_h(tmp4, tmp5);
            tmp3 = __msa_aver_s_h(tmp6, tmp7);

            LD2(dst, stride, tp2, tp3);
            INSERT_D2_UB(tp2, tp3, dst1);

            out1 = PCKEV_XORI128_UB(tmp2, tmp3);
            dst1 = __msa_aver_u_b(out1, dst1);
            ST_D2(dst1, 0, 1, dst, stride);
            dst += (2 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}
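
/* mc21/mc23 average the centre half-sample j (horizontal filter, then a
 * vertical filter over the 16-bit intermediates, hence the 32-bit
 * AVC_DOT_SW3_SW accumulations with word-packed tap pairs: 0xfffb0001 ->
 * (+1, -5), 0x140014 -> (+20, +20), 0x1fffb -> (-5, +1)) with the rounded
 * horizontal half-sample of the adjacent row -- the spec's f and q
 * positions. The average is taken with __msa_aver_s_h on the saturated
 * 16-bit values before the final pack and xor back to unsigned. */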

void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0_w, tmp1_w;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        dst = dst_tmp;
        src = src_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB2(src, stride, src5, src6);
            src += (2 * stride);

            XORI_B2_128_SB(src5, src6);
            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
                       hz_out65_r);
            ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
                       hz_out65_l);

            tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                    filt1, filt2);
            tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
            tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                    filt1, filt2);
            tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

            tmp1 = __msa_srari_h(hz_out3, 5);
            tmp3 = __msa_srari_h(hz_out4, 5);
            SAT_SH2_SH(tmp1, tmp3, 7);

            tmp0 = __msa_aver_s_h(tmp0, tmp1);
            tmp1 = __msa_aver_s_h(tmp2, tmp3);

            LD2(dst, stride, tp0, tp1);
            INSERT_D2_UB(tp0, tp1, dst0);
            out0 = PCKEV_XORI128_UB(tmp0, tmp1);
            dst0 = __msa_aver_u_b(out0, dst0);
            ST_D2(dst0, 0, 1, dst, stride);
            dst += (2 * stride);

            LD_SB2(src, stride, src7, src8);
            src += (2 * stride);

            XORI_B2_128_SB(src7, src8);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
            ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
                       hz_out87_r);
            ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
                       hz_out87_l);
            tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                    filt1, filt2);
            tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
            tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                    filt1, filt2);
            tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                    filt1, filt2);
            tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

            tmp5 = __msa_srari_h(hz_out5, 5);
            tmp7 = __msa_srari_h(hz_out6, 5);
            SAT_SH2_SH(tmp5, tmp7, 7);

            tmp2 = __msa_aver_s_h(tmp4, tmp5);
            tmp3 = __msa_aver_s_h(tmp6, tmp7);

            LD2(dst, stride, tp2, tp3);
            INSERT_D2_UB(tp2, tp3, dst1);
            out1 = PCKEV_XORI128_UB(tmp2, tmp3);
            dst1 = __msa_aver_u_b(out1, dst1);
            ST_D2(dst1, 0, 1, dst, stride);
            dst += (2 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0_w, tmp1_w;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                            filt2);
    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                            filt2);
    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                            filt2);
    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                            filt2);
    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    tmp0 = __msa_aver_s_h(tmp0, hz_out2);
    tmp1 = __msa_aver_s_h(tmp1, hz_out3);
    tmp2 = __msa_aver_s_h(tmp2, hz_out4);
    tmp3 = __msa_aver_s_h(tmp3, hz_out5);

    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                            filt2);
    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                            filt2);
    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                            filt2);
    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                            filt2);
    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    tmp0 = __msa_aver_s_h(tmp0, hz_out6);
    tmp1 = __msa_aver_s_h(tmp1, hz_out7);
    tmp2 = __msa_aver_s_h(tmp2, hz_out8);
    tmp3 = __msa_aver_s_h(tmp3, hz_out9);

    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}

void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0_w, tmp1_w;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                            filt2);
    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                            filt2);
    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                            filt2);
    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                            filt2);
    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    tmp0 = __msa_aver_s_h(tmp0, hz_out3);
    tmp1 = __msa_aver_s_h(tmp1, hz_out4);
    tmp2 = __msa_aver_s_h(tmp2, hz_out5);
    tmp3 = __msa_aver_s_h(tmp3, hz_out6);

    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                            filt2);
    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                            filt2);
    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                            filt2);
    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
    tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                            filt2);
    tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                            filt2);
    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);

    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    tmp0 = __msa_aver_s_h(tmp0, hz_out7);
    tmp1 = __msa_aver_s_h(tmp1, hz_out8);
    tmp2 = __msa_aver_s_h(tmp2, hz_out9);
    tmp3 = __msa_aver_s_h(tmp3, hz_out10);

    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
4668
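/* ff_avg_h264_qpel4_mc21_msa: 4x4 'avg' MC at quarter-pel offset (1/2, 1/4).
 * The 2D half-sample (6-tap filter applied horizontally, then vertically on
 * the 16-bit intermediates, rounded with >> 10) is averaged with the
 * horizontal half-sample row above it (rounded with >> 5), and the result
 * is then bit-averaged with the pixels already in dst. */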
void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res, out = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H2_SH(hz_out2, hz_out4, 5);
    SAT_SH2_SH(hz_out2, hz_out4, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
    res = PCKEV_XORI128_UB(dst0, dst1);
    res = __msa_aver_u_b(res, out);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

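/* ff_avg_h264_qpel4_mc23_msa: 4x4 'avg' MC at quarter-pel offset (1/2, 3/4).
 * Same scheme as mc21, except the 2D half-sample (>> 10 rounding) is
 * averaged with the horizontal half-sample row one line below it
 * (>> 5 rounding) before the final average with dst. */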
void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 res, out = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out0);
    dst1 = __msa_aver_s_h(dst1, hz_out1);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
    res = PCKEV_XORI128_UB(dst0, dst1);
    res = __msa_aver_u_b(res, out);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

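/* ff_avg_h264_qpel16_mc02_msa: 16x16 'avg' MC at the vertical half-sample
 * position (0, 1/2). Each output row is the 6-tap (1, -5, 20, 20, -5, 1)
 * filter over six source rows, rounded with >> 5, clipped to 8 bits and
 * averaged with dst; four rows are produced per loop iteration while the
 * interleaved row pairs slide down through the registers. */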
void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
        AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }
}

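/* ff_avg_h264_qpel8_mc02_msa: 8x8 variant of the vertical half-sample 'avg'
 * MC, fully unrolled: all eight output rows are filtered, rounded (>> 5),
 * packed to bytes and averaged with the destination in a single pass. */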
void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);

    LD_SB4(src, stride, src7, src8, src9, src10);
    src += (4 * stride);
    XORI_B4_128_SB(src7, src8, src9, src10);
    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
               src21_r, src32_r, src43_r);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);

    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
                dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

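/* ff_avg_h264_qpel4_mc02_msa: 4x4 vertical half-sample 'avg' MC. Row pairs
 * are packed into single vectors (src2110, src4332, ...) so that each 6-tap
 * dot product yields two output rows at once. */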
void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = PCKEV_XORI128_UB(out10, out32);
    dst0 = __msa_aver_u_b(res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

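/* ff_avg_h264_qpel16_mc12_msa: 16x16 'avg' MC at quarter-pel offset
 * (1/4, 1/2). Per output row: filter vertically first
 * (AVC_CALC_DPADD_B_6PIX_2COEFF_SH), then horizontally on the 16-bit
 * intermediates to form the 2D half-sample (>> 10 rounding); that value is
 * averaged with the vertical half-sample column to its left (>> 5 rounding,
 * selected via the even-lane pack), then averaged with dst. */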
void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out, dst0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src7, src8, src9, src10, src11);

    for (row = 16; row--;) {
        LD_SB2(src, 8, src5, src6);
        src += stride;
        XORI_B2_128_SB(src5, src6);
        dst0 = LD_UB(dst);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        tmp0 = __msa_srari_h(shf_vec2, 5);
        tmp1 = __msa_srari_h(shf_vec5, 5);
        tmp2 = __msa_srari_h(shf_vec8, 5);
        tmp3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
        tmp0 = __msa_aver_s_h(tmp2, tmp0);
        tmp1 = __msa_aver_s_h(tmp3, tmp1);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        out = __msa_aver_u_b(out, dst0);
        ST_UB(out, dst);
        dst += stride;

        src0 = src1;
        src1 = src2;
        src2 = src3;
        src3 = src4;
        src4 = src5;
        src7 = src8;
        src8 = src9;
        src9 = src10;
        src10 = src11;
        src11 = src6;
    }
}

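/* ff_avg_h264_qpel16_mc32_msa: 16x16 'avg' MC at quarter-pel offset
 * (3/4, 1/2). Identical to mc12 except the 2D half-sample is averaged with
 * the vertical half-sample column to its right, selected via the odd-lane
 * pack (__msa_pckod_h). */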
void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out, dst0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src7, src8, src9, src10, src11);

    for (row = 16; row--;) {
        LD_SB2(src, 8, src5, src6);
        src += stride;
        XORI_B2_128_SB(src5, src6);
        dst0 = LD_UB(dst);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        tmp0 = __msa_srari_h(shf_vec2, 5);
        tmp1 = __msa_srari_h(shf_vec5, 5);
        tmp2 = __msa_srari_h(shf_vec8, 5);
        tmp3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        tmp0 = __msa_pckod_h(tmp2, tmp0);
        tmp1 = __msa_pckod_h(tmp3, tmp1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
        tmp0 = __msa_aver_s_h(tmp2, tmp0);
        tmp1 = __msa_aver_s_h(tmp3, tmp1);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        out = __msa_aver_u_b(out, dst0);
        ST_UB(out, dst);
        dst += stride;

        src0 = src1;
        src1 = src2;
        src2 = src3;
        src3 = src4;
        src4 = src5;
        src7 = src8;
        src8 = src9;
        src9 = src10;
        src10 = src11;
        src11 = src6;
    }
}

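/* ff_avg_h264_qpel8_mc12_msa: 8x8 variant of the (1/4, 1/2) 'avg' MC;
 * same vertical-then-horizontal scheme as the 16x16 version, producing
 * two output rows per loop iteration. */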
void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t row;
    uint64_t tp0, tp1;
    v16u8 out, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
    v8i16 mask3, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = 4; row--;) {
        LD_SB2(src, stride, src5, src6);
        src += (2 * stride);
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        tmp0 = __msa_srari_h(shf_vec2, 5);
        tmp1 = __msa_srari_h(shf_vec5, 5);
        tmp2 = __msa_srari_h(shf_vec8, 5);
        tmp3 = __msa_srari_h(shf_vec11, 5);
        LD2(dst, stride, tp0, tp1);
        INSERT_D2_UB(tp0, tp1, dst0);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
        tmp0 = __msa_aver_s_h(tmp2, tmp0);
        tmp1 = __msa_aver_s_h(tmp3, tmp1);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        out = __msa_aver_u_b(out, dst0);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

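/* ff_avg_h264_qpel8_mc32_msa: 8x8 variant of the (3/4, 1/2) 'avg' MC;
 * mirrors mc12 but picks the right-hand vertical half-sample column via
 * __msa_pckod_h before the averages. */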
void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t row;
    uint64_t tp0, tp1;
    v16u8 out, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
    v8i16 mask3, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = 4; row--;) {
        LD_SB2(src, stride, src5, src6);
        src += (2 * stride);
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        tmp0 = __msa_srari_h(shf_vec2, 5);
        tmp1 = __msa_srari_h(shf_vec5, 5);
        tmp2 = __msa_srari_h(shf_vec8, 5);
        tmp3 = __msa_srari_h(shf_vec11, 5);
        LD2(dst, stride, tp0, tp1);
        INSERT_D2_UB(tp0, tp1, dst0);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        tmp0 = __msa_pckod_h(tmp2, tmp0);
        tmp1 = __msa_pckod_h(tmp3, tmp1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
        tmp0 = __msa_aver_s_h(tmp2, tmp0);
        tmp1 = __msa_aver_s_h(tmp3, tmp1);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        out = __msa_aver_u_b(out, dst0);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

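/* ff_avg_h264_qpel4_mc12_msa: 4x4 variant of the (1/4, 1/2) 'avg' MC,
 * fully unrolled. Here the average of the 2D half-sample with the vertical
 * half-sample is taken at 32-bit precision (__msa_aver_s_w) before packing;
 * ILVEV_H2_SH widens the 16-bit column values against zeros for that. */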
void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out, dstv = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);
    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);
    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    out = __msa_aver_u_b(out, dstv);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

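/* ff_avg_h264_qpel4_mc32_msa: 4x4 variant of the (3/4, 1/2) 'avg' MC.
 * Same 32-bit averaging scheme as qpel4_mc12, but __msa_ilvod_h against
 * zeros selects the odd lanes, i.e. the right-hand vertical half-sample
 * column. */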
void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out, dstv = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);
    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);

    dst0 = __msa_ilvod_h(zeros, dst0);
    dst1 = __msa_ilvod_h(zeros, dst1);
    dst2 = __msa_ilvod_h(zeros, dst2);
    dst3 = __msa_ilvod_h(zeros, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    out = __msa_aver_u_b(out, dstv);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

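/* ff_avg_h264_qpel16_mc22_msa: 16x16 'avg' MC at the centre half-sample
 * position (1/2, 1/2). The 6-tap filter is applied horizontally per row,
 * then vertically across the 16-bit intermediates via 32-bit dot products
 * (>> 10 rounding); the block is processed as two 8-wide halves, four rows
 * per inner iteration, with the result averaged against dst. */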
void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint8_t *dst_tmp = dst;
    uint64_t tp0, tp1, tp2, tp3;
    uint32_t multiple8_cnt, loop_cnt;
    v16u8 dst0, dst1, out0, out1;
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src = src_tmp;
        dst = dst_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src0, src1, src2, src3);
            XORI_B4_128_SB(src0, src1, src2, src3);
            src += (4 * stride);

            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            LD4(dst, stride, tp0, tp1, tp2, tp3);
            INSERT_D2_UB(tp0, tp1, dst0);
            INSERT_D2_UB(tp2, tp3, dst1);
            out0 = PCKEV_XORI128_UB(res0, res1);
            out1 = PCKEV_XORI128_UB(res2, res3);
            AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

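/* ff_avg_h264_qpel8_mc22_msa: 8x8 centre half-sample (1/2, 1/2) 'avg' MC,
 * unrolled as two 4-row passes over the 13 horizontally filtered
 * intermediate rows (hz_out0..hz_out12). */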
void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    src -= ((2 * stride) + 2);
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * stride);
    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}

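/* ff_avg_h264_qpel4_mc22_msa: 4x4 centre half-sample (1/2, 1/2) 'avg' MC.
 * Uses the 4-width masks (luma_mask_arr[48]) so each horizontal filter call
 * covers two input rows packed into one vector. */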
void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = PCKEV_XORI128_UB(res0, res1);
    res = __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}
