1/*
2 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "qpeldsp_mips.h"
23
/**
 * 16-byte horizontal qpel filter with rounding.
 *
 * Four groups of byte pairs are formed from inp0/inp1:
 *   tap0: inp0 interleaved with inp1
 *   tap1: sldi(inp0, shuffled inp0, 15) with sldi(shuffled inp1, inp1, 1)
 *   tap2: the same with slide amounts 14 and 2
 *   tap3: the same with slide amounts 13 and 3
 * where both shuffles are done by VSHF_B2_UB with mask.
 *
 * Writing sumN for the per-pair byte sums, the value computed is
 *     (coef0 * sum0 + coef2 * sum2) - (coef1 * sum1 + sum3)
 * (assuming coef1 and coef2 each hold one replicated byte value), which
 * then goes through SRARI_H2_SH(.., 5), CLIP_SH2_0_255 and an even-byte
 * pack.  coef0 is multiplied in as a v8u16, coef1/coef2 are handed to
 * DPADD_UB2_UH.
 *
 * Evaluates to a v16u8.  The arguments are expanded more than once, so
 * pass plain variables.
 */
#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)  \
( {                                                                     \
    v16u8 flt, shf0, shf1;                                              \
    v16u8 sl_15, sl_14, sl_13, sl_01, sl_02, sl_03;                     \
    v8u16 tap0_r, tap1_r, tap2_r, tap3_r;                               \
    v8u16 tap0_l, tap1_l, tap2_l, tap3_l;                               \
    v8i16 dif_r, dif_l;                                                 \
                                                                        \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, shf0, shf1);         \
    sl_15 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) shf0, 15);       \
    sl_14 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) shf0, 14);       \
    sl_13 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) shf0, 13);       \
    sl_01 = (v16u8) __msa_sldi_b((v16i8) shf1, (v16i8) inp1, 1);        \
    sl_02 = (v16u8) __msa_sldi_b((v16i8) shf1, (v16i8) inp1, 2);        \
    sl_03 = (v16u8) __msa_sldi_b((v16i8) shf1, (v16i8) inp1, 3);        \
                                                                        \
    ILVRL_B2_UH(inp1, inp0, tap0_r, tap0_l);                            \
    ILVRL_B2_UH(sl_01, sl_15, tap1_r, tap1_l);                          \
    ILVRL_B2_UH(sl_02, sl_14, tap2_r, tap2_l);                          \
    ILVRL_B2_UH(sl_03, sl_13, tap3_r, tap3_l);                          \
                                                                        \
    HADD_UB2_UH(tap0_r, tap0_l, tap0_r, tap0_l);                        \
    HADD_UB2_UH(tap3_r, tap3_l, tap3_r, tap3_l);                        \
    tap0_r *= (v8u16) (coef0);                                          \
    tap0_l *= (v8u16) (coef0);                                          \
    DPADD_UB2_UH(tap2_r, tap2_l, coef2, coef2, tap0_r, tap0_l);         \
    DPADD_UB2_UH(tap1_r, tap1_l, coef1, coef1, tap3_r, tap3_l);         \
                                                                        \
    dif_r = (v8i16) (tap0_r - tap3_r);                                  \
    dif_l = (v8i16) (tap0_l - tap3_l);                                  \
    SRARI_H2_SH(dif_r, dif_l, 5);                                       \
    CLIP_SH2_0_255(dif_r, dif_l);                                       \
    flt = (v16u8) __msa_pckev_b((v16i8) dif_l, (v16i8) dif_r);          \
                                                                        \
    flt;                                                                \
} )
57
/**
 * Horizontal qpel filter with rounding for two inputs at once.
 *
 * inp0 and inp1 are each shuffled with mask0..mask3 (VSHF_B2_UH) to get
 * four tap vectors per input (suffix _a for inp0, _b for inp1).  For each
 * input the value computed is
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * using DOTP_UB2_UH / DPADD_UB2_UH / HADD_UB2_UH, followed by
 * SRARI_H2_SH(.., 5) and CLIP_SH2_0_255.
 *
 * Evaluates to a v16u8 produced by an even-byte pack of the two results,
 * with the inp0 result passed as the last pack operand.
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,                       \
                                      mask0, mask1, mask2, mask3,       \
                                      coef0, coef1, coef2)              \
( {                                                                     \
    v16u8 flt;                                                          \
    v8i16 diff_a, diff_b;                                               \
    v8u16 tap0_a, tap1_a, tap2_a, tap3_a;                               \
    v8u16 tap0_b, tap1_b, tap2_b, tap3_b;                               \
                                                                        \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, tap0_a, tap0_b);   \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, tap1_a, tap1_b);   \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, tap2_a, tap2_b);   \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, tap3_a, tap3_b);   \
                                                                        \
    DOTP_UB2_UH(tap0_a, tap0_b, coef0, coef0, tap0_a, tap0_b);          \
    HADD_UB2_UH(tap3_a, tap3_b, tap3_a, tap3_b);                        \
    DPADD_UB2_UH(tap2_a, tap2_b, coef2, coef2, tap0_a, tap0_b);         \
    DPADD_UB2_UH(tap1_a, tap1_b, coef1, coef1, tap3_a, tap3_b);         \
                                                                        \
    diff_a = (v8i16) (tap0_a - tap3_a);                                 \
    diff_b = (v8i16) (tap0_b - tap3_b);                                 \
    SRARI_H2_SH(diff_a, diff_b, 5);                                     \
    CLIP_SH2_0_255(diff_a, diff_b);                                     \
    flt = (v16u8) __msa_pckev_b((v16i8) diff_b, (v16i8) diff_a);        \
                                                                        \
    flt;                                                                \
} )
83
/**
 * Single-input variant of the rounded horizontal qpel filter.
 *
 * inp0 is shuffled with mask0..mask3 into four tap vectors; the value
 * computed is
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * followed by __msa_srari_h(.., 5) and CLIP_SH_0_255.
 *
 * Evaluates to a v16u8 in which the same halfword result vector is used
 * for both operands of the even-byte pack.
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,                        \
                                           mask0, mask1, mask2, mask3,  \
                                           coef0, coef1, coef2)         \
( {                                                                     \
    v16u8 flt;                                                          \
    v8u16 tap0_r, tap1_r, tap2_r, tap3_r;                               \
    v8i16 diff_r;                                                       \
                                                                        \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask1, tap0_r, tap1_r);   \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask3, tap2_r, tap3_r);   \
    tap0_r = __msa_dotp_u_h((v16u8) tap0_r, (v16u8) coef0);             \
    tap3_r = __msa_hadd_u_h((v16u8) tap3_r, (v16u8) tap3_r);            \
    DPADD_UB2_UH(tap2_r, tap1_r, coef2, coef1, tap0_r, tap3_r);         \
    diff_r = (v8i16) (tap0_r - tap3_r);                                 \
    diff_r = __msa_srari_h(diff_r, 5);                                  \
    CLIP_SH_0_255(diff_r);                                              \
    flt = (v16u8) __msa_pckev_b((v16i8) diff_r, (v16i8) diff_r);        \
                                                                        \
    flt;                                                                \
} )
104
/**
 * Single-input horizontal qpel filter, "no round" flavour.
 *
 * Same tap construction and weighting as the rounded single-input macro:
 * inp0 is shuffled with mask0..mask3 and the value computed is
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * The difference here is the scaling step: 15 is added and the result is
 * shifted right by 5 with plain vector operators instead of using the
 * rounding shift intrinsic.  CLIP_SH_0_255 is applied afterwards.
 *
 * Evaluates to a v16u8 in which the same halfword result vector is used
 * for both operands of the even-byte pack.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,   \
                                                    mask2, mask3, coef0,  \
                                                    coef1, coef2)         \
( {                                                                       \
    v16u8 flt;                                                            \
    v8u16 tap0_r, tap1_r, tap2_r, tap3_r;                                 \
    v8i16 diff_r;                                                         \
                                                                          \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask1, tap0_r, tap1_r);     \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask3, tap2_r, tap3_r);     \
    tap0_r = __msa_dotp_u_h((v16u8) tap0_r, (v16u8) coef0);               \
    tap3_r = __msa_hadd_u_h((v16u8) tap3_r, (v16u8) tap3_r);              \
    DPADD_UB2_UH(tap2_r, tap1_r, coef2, coef1, tap0_r, tap3_r);           \
    diff_r = (v8i16) (tap0_r - tap3_r);                                   \
    diff_r += 15;                                                         \
    diff_r >>= 5;                                                         \
    CLIP_SH_0_255(diff_r);                                                \
    flt = (v16u8) __msa_pckev_b((v16i8) diff_r, (v16i8) diff_r);          \
                                                                          \
    flt;                                                                  \
} )
126
/**
 * 16-byte horizontal qpel filter, "no round" flavour.
 *
 * Four groups of byte pairs are formed from inp0/inp1:
 *   tap0: inp0 interleaved with inp1
 *   tap1: sldi(inp0, shuffled inp0, 15) with sldi(shuffled inp1, inp1, 1)
 *   tap2: the same with slide amounts 14 and 2
 *   tap3: the same with slide amounts 13 and 3
 * where both shuffles are done by VSHF_B2_UB with mask.
 *
 * Writing sumN for the per-pair byte sums, the value computed is
 *     (coef0 * sum0 + coef2 * sum2) - (coef1 * sum1 + sum3)
 * (assuming coef1 and coef2 each hold one replicated byte value).  It is
 * scaled by adding 15 and shifting right by 5, then clipped with
 * CLIP_SH2_0_255 and packed to bytes.  coef0 is multiplied in as a v8u16.
 *
 * Evaluates to a v16u8.  The arguments are expanded more than once, so
 * pass plain variables.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,              \
                                         coef0, coef1, coef2)           \
( {                                                                     \
    v16u8 flt, shf0, shf1;                                              \
    v16u8 sl_15, sl_14, sl_13, sl_01, sl_02, sl_03;                     \
    v8u16 tap0_r, tap1_r, tap2_r, tap3_r;                               \
    v8u16 tap0_l, tap1_l, tap2_l, tap3_l;                               \
    v8i16 dif_r, dif_l;                                                 \
                                                                        \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, shf0, shf1);         \
    sl_15 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) shf0, 15);       \
    sl_14 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) shf0, 14);       \
    sl_13 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) shf0, 13);       \
    sl_01 = (v16u8) __msa_sldi_b((v16i8) shf1, (v16i8) inp1, 1);        \
    sl_02 = (v16u8) __msa_sldi_b((v16i8) shf1, (v16i8) inp1, 2);        \
    sl_03 = (v16u8) __msa_sldi_b((v16i8) shf1, (v16i8) inp1, 3);        \
                                                                        \
    ILVRL_B2_UH(inp1, inp0, tap0_r, tap0_l);                            \
    ILVRL_B2_UH(sl_01, sl_15, tap1_r, tap1_l);                          \
    ILVRL_B2_UH(sl_02, sl_14, tap2_r, tap2_l);                          \
    ILVRL_B2_UH(sl_03, sl_13, tap3_r, tap3_l);                          \
                                                                        \
    HADD_UB2_UH(tap0_r, tap0_l, tap0_r, tap0_l);                        \
    HADD_UB2_UH(tap3_r, tap3_l, tap3_r, tap3_l);                        \
    tap0_r *= (v8u16) (coef0);                                          \
    tap0_l *= (v8u16) (coef0);                                          \
    DPADD_UB2_UH(tap2_r, tap2_l, coef2, coef2, tap0_r, tap0_l);         \
    DPADD_UB2_UH(tap1_r, tap1_l, coef1, coef1, tap3_r, tap3_l);         \
                                                                        \
    dif_r = (v8i16) (tap0_r - tap3_r);                                  \
    dif_l = (v8i16) (tap0_l - tap3_l);                                  \
    dif_r += 15;                                                        \
    dif_l += 15;                                                        \
    dif_r >>= 5;                                                        \
    dif_l >>= 5;                                                        \
    CLIP_SH2_0_255(dif_r, dif_l);                                       \
    flt = (v16u8) __msa_pckev_b((v16i8) dif_l, (v16i8) dif_r);          \
                                                                        \
    flt;                                                                \
} )
164
/**
 * Horizontal qpel filter for two inputs at once, "no round" flavour.
 *
 * inp0 and inp1 are each shuffled with mask0..mask3 (VSHF_B2_UH) to get
 * four tap vectors per input (suffix _a for inp0, _b for inp1).  For each
 * input the value computed is
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * which is then scaled by adding 15 and shifting right by 5, and clipped
 * with CLIP_SH2_0_255.
 *
 * Evaluates to a v16u8 produced by an even-byte pack of the two results,
 * with the inp0 result passed as the last pack operand.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1,                  \
                                               mask0, mask1, mask2, mask3,  \
                                               coef0, coef1, coef2)         \
( {                                                                         \
    v16u8 flt;                                                              \
    v8u16 tap0_a, tap1_a, tap2_a, tap3_a;                                   \
    v8u16 tap0_b, tap1_b, tap2_b, tap3_b;                                   \
    v8i16 diff_a, diff_b;                                                   \
                                                                            \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, tap0_a, tap0_b);       \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, tap1_a, tap1_b);       \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, tap2_a, tap2_b);       \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, tap3_a, tap3_b);       \
                                                                            \
    DOTP_UB2_UH(tap0_a, tap0_b, coef0, coef0, tap0_a, tap0_b);              \
    HADD_UB2_UH(tap3_a, tap3_b, tap3_a, tap3_b);                            \
    DPADD_UB2_UH(tap2_a, tap2_b, coef2, coef2, tap0_a, tap0_b);             \
    DPADD_UB2_UH(tap1_a, tap1_b, coef1, coef1, tap3_a, tap3_b);             \
                                                                            \
    diff_a = (v8i16) (tap0_a - tap3_a);                                     \
    diff_b = (v8i16) (tap0_b - tap3_b);                                     \
    diff_a += 15;                                                           \
    diff_b += 15;                                                           \
    diff_a >>= 5;                                                           \
    diff_b >>= 5;                                                           \
    CLIP_SH2_0_255(diff_a, diff_b);                                         \
    flt = (v16u8) __msa_pckev_b((v16i8) diff_b, (v16i8) diff_a);            \
                                                                            \
    flt;                                                                    \
} )
193
/**
 * 16-byte vertical qpel filter with rounding.
 *
 * The eight input vectors are paired by interleaving:
 *   tap0: (inp4, inp0)   tap1: (inp5, inp1)
 *   tap2: (inp6, inp2)   tap3: (inp7, inp3)
 * and combined as
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * for both the right (_r) and left (_l) interleave halves.  The result
 * goes through SRARI_H2_SH(.., 5), CLIP_SH2_0_255 and an even-byte pack.
 *
 * Evaluates to a v16u8.
 */
#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3,                  \
                               inp4, inp5, inp6, inp7,                  \
                               coef0, coef1, coef2)                     \
( {                                                                     \
    v16u8 flt;                                                          \
    v8u16 tap0_r, tap1_r, tap2_r, tap3_r;                               \
    v8u16 tap0_l, tap1_l, tap2_l, tap3_l;                               \
    v8i16 dif_r, dif_l;                                                 \
                                                                        \
    ILVRL_B2_UH(inp4, inp0, tap0_r, tap0_l);                            \
    ILVRL_B2_UH(inp5, inp1, tap1_r, tap1_l);                            \
    ILVRL_B2_UH(inp6, inp2, tap2_r, tap2_l);                            \
    ILVRL_B2_UH(inp7, inp3, tap3_r, tap3_l);                            \
                                                                        \
    DOTP_UB2_UH(tap0_r, tap0_l, coef0, coef0, tap0_r, tap0_l);          \
    HADD_UB2_UH(tap3_r, tap3_l, tap3_r, tap3_l);                        \
    DPADD_UB2_UH(tap2_r, tap2_l, coef2, coef2, tap0_r, tap0_l);         \
    DPADD_UB2_UH(tap1_r, tap1_l, coef1, coef1, tap3_r, tap3_l);         \
                                                                        \
    dif_r = (v8i16) (tap0_r - tap3_r);                                  \
    dif_l = (v8i16) (tap0_l - tap3_l);                                  \
    SRARI_H2_SH(dif_r, dif_l, 5);                                       \
    CLIP_SH2_0_255(dif_r, dif_l);                                       \
    flt = (v16u8) __msa_pckev_b((v16i8) dif_l, (v16i8) dif_r);          \
                                                                        \
    flt;                                                                \
} )
219
/**
 * Vertical qpel filter with rounding for two sets of eight inputs.
 *
 * Set "a" is inp00..inp07 and set "b" is inp10..inp17.  Within each set
 * the inputs are paired with ILVR_B4_UH as (4,0), (5,1), (6,2) and (7,3)
 * to give tap0..tap3, which are combined as
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * followed by SRARI_H2_SH(.., 5) and CLIP_SH2_0_255.
 *
 * Evaluates to a v16u8 produced by an even-byte pack of the two results,
 * with the set "a" result passed as the last pack operand.
 */
#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03,        \
                                     inp04, inp05, inp06, inp07,        \
                                     inp10, inp11, inp12, inp13,        \
                                     inp14, inp15, inp16, inp17,        \
                                     coef0, coef1, coef2)               \
( {                                                                     \
    v16u8 flt;                                                          \
    v8u16 tp0_a, tp1_a, tp2_a, tp3_a;                                   \
    v8u16 tp0_b, tp1_b, tp2_b, tp3_b;                                   \
    v8i16 dif0, dif1;                                                   \
                                                                        \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp05, inp01, inp15, inp11,  \
               tp0_a, tp0_b, tp1_a, tp1_b);                             \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp07, inp03, inp17, inp13,  \
               tp2_a, tp2_b, tp3_a, tp3_b);                             \
                                                                        \
    DOTP_UB2_UH(tp0_a, tp0_b, coef0, coef0, tp0_a, tp0_b);              \
    HADD_UB2_UH(tp3_a, tp3_b, tp3_a, tp3_b);                            \
    DPADD_UB2_UH(tp2_a, tp2_b, coef2, coef2, tp0_a, tp0_b);             \
    DPADD_UB2_UH(tp1_a, tp1_b, coef1, coef1, tp3_a, tp3_b);             \
                                                                        \
    dif0 = (v8i16) (tp0_a - tp3_a);                                     \
    dif1 = (v8i16) (tp0_b - tp3_b);                                     \
    SRARI_H2_SH(dif0, dif1, 5);                                         \
    CLIP_SH2_0_255(dif0, dif1);                                         \
    flt = (v16u8) __msa_pckev_b((v16i8) dif1, (v16i8) dif0);            \
                                                                        \
    flt;                                                                \
} )
247
/**
 * 16-byte vertical qpel filter, "no round" flavour.
 *
 * The eight input vectors are paired by interleaving:
 *   tap0: (inp4, inp0)   tap1: (inp5, inp1)
 *   tap2: (inp6, inp2)   tap3: (inp7, inp3)
 * and combined as
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * for both the right (_r) and left (_l) interleave halves.  The result is
 * scaled by adding 15 and shifting right by 5, clipped with
 * CLIP_SH2_0_255 and packed to bytes.
 *
 * Evaluates to a v16u8.
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3,         \
                                        inp4, inp5, inp6, inp7,         \
                                        coef0, coef1, coef2)            \
( {                                                                     \
    v16u8 flt;                                                          \
    v8u16 tap0_r, tap1_r, tap2_r, tap3_r;                               \
    v8u16 tap0_l, tap1_l, tap2_l, tap3_l;                               \
    v8i16 dif_r, dif_l;                                                 \
                                                                        \
    ILVRL_B2_UH(inp4, inp0, tap0_r, tap0_l);                            \
    ILVRL_B2_UH(inp5, inp1, tap1_r, tap1_l);                            \
    ILVRL_B2_UH(inp6, inp2, tap2_r, tap2_l);                            \
    ILVRL_B2_UH(inp7, inp3, tap3_r, tap3_l);                            \
                                                                        \
    DOTP_UB2_UH(tap0_r, tap0_l, coef0, coef0, tap0_r, tap0_l);          \
    HADD_UB2_UH(tap3_r, tap3_l, tap3_r, tap3_l);                        \
    DPADD_UB2_UH(tap2_r, tap2_l, coef2, coef2, tap0_r, tap0_l);         \
    DPADD_UB2_UH(tap1_r, tap1_l, coef1, coef1, tap3_r, tap3_l);         \
                                                                        \
    dif_r = (v8i16) (tap0_r - tap3_r);                                  \
    dif_l = (v8i16) (tap0_l - tap3_l);                                  \
    dif_r += 15;                                                        \
    dif_l += 15;                                                        \
    dif_r >>= 5;                                                        \
    dif_l >>= 5;                                                        \
    CLIP_SH2_0_255(dif_r, dif_l);                                       \
    flt = (v16u8) __msa_pckev_b((v16i8) dif_l, (v16i8) dif_r);          \
                                                                        \
    flt;                                                                \
} )
276
/**
 * Vertical qpel filter for two sets of eight inputs, "no round" flavour.
 *
 * Set "a" is inp00..inp07 and set "b" is inp10..inp17.  Within each set
 * the inputs are paired with ILVR_B4_UH as (4,0), (5,1), (6,2) and (7,3)
 * to give tap0..tap3, which are combined as
 *     (dot(tap0, coef0) + dot(tap2, coef2)) - (dot(tap1, coef1) + hadd(tap3))
 * The result is scaled by adding 15 and shifting right by 5, then clipped
 * with CLIP_SH2_0_255.
 *
 * Evaluates to a v16u8 produced by an even-byte pack of the two results,
 * with the set "a" result passed as the last pack operand.
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03,  \
                                              inp04, inp05, inp06, inp07,  \
                                              inp10, inp11, inp12, inp13,  \
                                              inp14, inp15, inp16, inp17,  \
                                              coef0, coef1, coef2)         \
( {                                                                        \
    v16u8 flt;                                                             \
    v8u16 tp0_a, tp1_a, tp2_a, tp3_a;                                      \
    v8u16 tp0_b, tp1_b, tp2_b, tp3_b;                                      \
    v8i16 dif0, dif1;                                                      \
                                                                           \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp05, inp01, inp15, inp11,     \
               tp0_a, tp0_b, tp1_a, tp1_b);                                \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp07, inp03, inp17, inp13,     \
               tp2_a, tp2_b, tp3_a, tp3_b);                                \
                                                                           \
    DOTP_UB2_UH(tp0_a, tp0_b, coef0, coef0, tp0_a, tp0_b);                 \
    HADD_UB2_UH(tp3_a, tp3_b, tp3_a, tp3_b);                               \
    DPADD_UB2_UH(tp2_a, tp2_b, coef2, coef2, tp0_a, tp0_b);                \
    DPADD_UB2_UH(tp1_a, tp1_b, coef1, coef1, tp3_a, tp3_b);                \
                                                                           \
    dif0 = (v8i16) (tp0_a - tp3_a);                                        \
    dif1 = (v8i16) (tp0_b - tp3_b);                                        \
    dif0 += 15;                                                            \
    dif1 += 15;                                                            \
    dif0 >>= 5;                                                            \
    dif1 >>= 5;                                                            \
    CLIP_SH2_0_255(dif0, dif1);                                            \
    flt = (v16u8) __msa_pckev_b((v16i8) dif1, (v16i8) dif0);               \
                                                                           \
    flt;                                                                   \
} )
307
308static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
309                                               int32_t src_stride,
310                                               uint8_t *dst,
311                                               int32_t dst_stride,
312                                               int32_t height)
313{
314    uint8_t loop_count;
315    v16u8 inp0, inp1, inp2, inp3;
316    v16u8 res0, res1;
317    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
318    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
319    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
320    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
321    v16u8 const20 = (v16u8) __msa_ldi_b(20);
322    v16u8 const6 = (v16u8) __msa_ldi_b(6);
323    v16u8 const3 = (v16u8) __msa_ldi_b(3);
324
325    for (loop_count = (height >> 2); loop_count--;) {
326        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
327        src += (4 * src_stride);
328        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
329                                             mask0, mask1, mask2, mask3,
330                                             const20, const6, const3);
331        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
332                                             mask0, mask1, mask2, mask3,
333                                             const20, const6, const3);
334        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
335        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
336        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
337        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
338        dst += (4 * dst_stride);
339    }
340}
341
342static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
343                                                int32_t src_stride,
344                                                uint8_t *dst,
345                                                int32_t dst_stride,
346                                                int32_t height)
347{
348    uint8_t loop_count;
349    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
350    v16u8 res;
351    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
352    v16u8 const6 = (v16u8) __msa_ldi_b(6);
353    v16u8 const3 = (v16u8) __msa_ldi_b(3);
354    v8u16 const20 = (v8u16) __msa_ldi_h(20);
355
356    for (loop_count = (height >> 2); loop_count--;) {
357        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
358        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
359        src += (4 * src_stride);
360        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
361                                      const20, const6, const3);
362        res = __msa_aver_u_b(inp0, res);
363        ST_UB(res, dst);
364        dst += dst_stride;
365
366        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
367                                      const20, const6, const3);
368        res = __msa_aver_u_b(inp2, res);
369        ST_UB(res, dst);
370        dst += dst_stride;
371
372        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
373                                      const20, const6, const3);
374        res = __msa_aver_u_b(inp4, res);
375        ST_UB(res, dst);
376        dst += dst_stride;
377
378        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
379                                      const20, const6, const3);
380        res = __msa_aver_u_b(inp6, res);
381        ST_UB(res, dst);
382        dst += dst_stride;
383    }
384}
385
386static void horiz_mc_qpel_8width_msa(const uint8_t *src,
387                                     int32_t src_stride,
388                                     uint8_t *dst,
389                                     int32_t dst_stride,
390                                     int32_t height)
391{
392    uint8_t loop_count;
393    v16u8 inp0, inp1, inp2, inp3;
394    v16u8 res0, res1;
395    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
396    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
397    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
398    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
399    v16u8 const20 = (v16u8) __msa_ldi_b(20);
400    v16u8 const6 = (v16u8) __msa_ldi_b(6);
401    v16u8 const3 = (v16u8) __msa_ldi_b(3);
402
403    for (loop_count = (height >> 2); loop_count--;) {
404        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
405        src += (4 * src_stride);
406        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
407                                             mask0, mask1, mask2, mask3,
408                                             const20, const6, const3);
409        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
410                                             mask0, mask1, mask2, mask3,
411                                             const20, const6, const3);
412        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
413        dst += (4 * dst_stride);
414    }
415}
416
417static void horiz_mc_qpel_16width_msa(const uint8_t *src,
418                                      int32_t src_stride,
419                                      uint8_t *dst,
420                                      int32_t dst_stride,
421                                      int32_t height)
422{
423    uint8_t loop_count;
424    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
425    v16u8 res;
426    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
427    v8u16 const20 = (v8u16) __msa_ldi_h(20);
428    v16u8 const6 = (v16u8) __msa_ldi_b(6);
429    v16u8 const3 = (v16u8) __msa_ldi_b(3);
430
431    for (loop_count = (height >> 2); loop_count--;) {
432        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
433        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
434        src += (4 * src_stride);
435        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
436                                      const20, const6, const3);
437        ST_UB(res, dst);
438        dst += dst_stride;
439
440        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
441                                      const20, const6, const3);
442        ST_UB(res, dst);
443        dst += dst_stride;
444
445        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
446                                      const20, const6, const3);
447        ST_UB(res, dst);
448        dst += dst_stride;
449
450        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
451                                      const20, const6, const3);
452        ST_UB(res, dst);
453        dst += dst_stride;
454    }
455}
456
457static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
458                                               int32_t src_stride,
459                                               uint8_t *dst,
460                                               int32_t dst_stride,
461                                               int32_t height)
462{
463    uint8_t loop_count;
464    v16u8 inp0, inp1, inp2, inp3;
465    v16u8 res0, res1;
466    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
467    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
468    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
469    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
470    v16u8 const20 = (v16u8) __msa_ldi_b(20);
471    v16u8 const6 = (v16u8) __msa_ldi_b(6);
472    v16u8 const3 = (v16u8) __msa_ldi_b(3);
473
474    for (loop_count = (height >> 2); loop_count--;) {
475        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
476        src += (4 * src_stride);
477        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
478                                             mask0, mask1, mask2, mask3,
479                                             const20, const6, const3);
480        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
481                                             mask0, mask1, mask2, mask3,
482                                             const20, const6, const3);
483        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
484                   inp0, inp1, inp2, inp3);
485        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
486        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
487        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
488        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
489        dst += (4 * dst_stride);
490    }
491}
492
493static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
494                                                int32_t src_stride,
495                                                uint8_t *dst,
496                                                int32_t dst_stride,
497                                                int32_t height)
498{
499    uint8_t loop_count;
500    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
501    v16u8 res;
502    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
503    v8u16 const20 = (v8u16) __msa_ldi_h(20);
504    v16u8 const6 = (v16u8) __msa_ldi_b(6);
505    v16u8 const3 = (v16u8) __msa_ldi_b(3);
506
507    for (loop_count = (height >> 2); loop_count--;) {
508        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
509        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
510        src += (4 * src_stride);
511        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
512                                      const20, const6, const3);
513        res = __msa_aver_u_b(res, inp1);
514        ST_UB(res, dst);
515        dst += dst_stride;
516
517        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
518                                      const20, const6, const3);
519        res = __msa_aver_u_b(res, inp3);
520        ST_UB(res, dst);
521        dst += dst_stride;
522
523        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
524                                      const20, const6, const3);
525        res = __msa_aver_u_b(res, inp5);
526        ST_UB(res, dst);
527        dst += dst_stride;
528
529        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
530                                      const20, const6, const3);
531        res = __msa_aver_u_b(res, inp7);
532        ST_UB(res, dst);
533        dst += dst_stride;
534    }
535}
536
537static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
538                                                      int32_t src_stride,
539                                                      uint8_t *dst,
540                                                      int32_t dst_stride,
541                                                      int32_t height)
542{
543    uint8_t loop_count;
544    v16u8 inp0, inp1, inp2, inp3;
545    v16u8 res0, res1;
546    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
547    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
548    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
549    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
550    v16u8 const20 = (v16u8) __msa_ldi_b(20);
551    v16u8 const6 = (v16u8) __msa_ldi_b(6);
552    v16u8 const3 = (v16u8) __msa_ldi_b(3);
553
554    for (loop_count = (height >> 2); loop_count--;) {
555        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
556        src += (4 * src_stride);
557        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
558                                                      mask2, mask3, const20,
559                                                      const6, const3);
560        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
561                                                      mask2, mask3, const20,
562                                                      const6, const3);
563        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
564        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
565        res0 = __msa_ave_u_b(inp0, res0);
566        res1 = __msa_ave_u_b(inp2, res1);
567        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
568        dst += (4 * dst_stride);
569    }
570}
571
572static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
573                                                       int32_t src_stride,
574                                                       uint8_t *dst,
575                                                       int32_t dst_stride,
576                                                       int32_t height)
577{
578    uint8_t loop_count;
579    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
580    v16u8 res;
581    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
582    v8u16 const20 = (v8u16) __msa_ldi_h(20);
583    v16u8 const6 = (v16u8) __msa_ldi_b(6);
584    v16u8 const3 = (v16u8) __msa_ldi_b(3);
585
586    for (loop_count = (height >> 2); loop_count--;) {
587        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
588        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
589        src += (4 * src_stride);
590        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
591                                               const20, const6, const3);
592        res = __msa_ave_u_b(inp0, res);
593        ST_UB(res, dst);
594        dst += dst_stride;
595
596        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
597                                               const20, const6, const3);
598        res = __msa_ave_u_b(inp2, res);
599        ST_UB(res, dst);
600        dst += dst_stride;
601
602        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
603                                               const20, const6, const3);
604        res = __msa_ave_u_b(inp4, res);
605        ST_UB(res, dst);
606        dst += dst_stride;
607
608        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
609                                               const20, const6, const3);
610        res = __msa_ave_u_b(inp6, res);
611        ST_UB(res, dst);
612        dst += dst_stride;
613    }
614}
615
616static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
617                                            int32_t src_stride,
618                                            uint8_t *dst,
619                                            int32_t dst_stride,
620                                            int32_t height)
621{
622    uint8_t loop_count;
623    v16u8 inp0, inp1, inp2, inp3;
624    v16u8 res0, res1;
625    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
626    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
627    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
628    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
629    v16u8 const20 = (v16u8) __msa_ldi_b(20);
630    v16u8 const6 = (v16u8) __msa_ldi_b(6);
631    v16u8 const3 = (v16u8) __msa_ldi_b(3);
632
633    for (loop_count = (height >> 2); loop_count--;) {
634        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
635        src += (4 * src_stride);
636        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
637                                                      mask2, mask3, const20,
638                                                      const6, const3);
639        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
640                                                      mask2, mask3, const20,
641                                                      const6, const3);
642        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
643        dst += (4 * dst_stride);
644    }
645}
646
647static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
648                                             int32_t src_stride,
649                                             uint8_t *dst,
650                                             int32_t dst_stride,
651                                             int32_t height)
652{
653    uint8_t loop_count;
654    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
655    v16u8 res;
656    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
657    v16u8 const6 = (v16u8) __msa_ldi_b(6);
658    v16u8 const3 = (v16u8) __msa_ldi_b(3);
659    v8u16 const20 = (v8u16) __msa_ldi_h(20);
660
661    for (loop_count = (height >> 2); loop_count--;) {
662        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
663        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
664        src += (4 * src_stride);
665        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
666                                               const20, const6, const3);
667        ST_UB(res, dst);
668        dst += dst_stride;
669
670        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
671                                               const20, const6, const3);
672        ST_UB(res, dst);
673        dst += dst_stride;
674
675        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
676                                               const20, const6, const3);
677        ST_UB(res, dst);
678        dst += dst_stride;
679
680        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
681                                               const20, const6, const3);
682        ST_UB(res, dst);
683        dst += dst_stride;
684    }
685}
686
687static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
688                                                      int32_t src_stride,
689                                                      uint8_t *dst,
690                                                      int32_t dst_stride,
691                                                      int32_t height)
692{
693    uint8_t loop_count;
694    v16u8 inp0, inp1, inp2, inp3;
695    v16u8 res0, res1;
696    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
697    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
698    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
699    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
700    v16u8 const20 = (v16u8) __msa_ldi_b(20);
701    v16u8 const6 = (v16u8) __msa_ldi_b(6);
702    v16u8 const3 = (v16u8) __msa_ldi_b(3);
703
704    for (loop_count = (height >> 2); loop_count--;) {
705        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
706        src += (4 * src_stride);
707        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
708                                                      mask2, mask3, const20,
709                                                      const6, const3);
710        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
711                                                      mask2, mask3, const20,
712                                                      const6, const3);
713        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
714                   inp0, inp1, inp2, inp3);
715        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
716        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
717        res0 = __msa_ave_u_b(inp0, res0);
718        res1 = __msa_ave_u_b(inp2, res1);
719        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
720        dst += (4 * dst_stride);
721    }
722}
723
724static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
725                                                       int32_t src_stride,
726                                                       uint8_t *dst,
727                                                       int32_t dst_stride,
728                                                       int32_t height)
729{
730    uint8_t loop_count;
731    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
732    v16u8 res;
733    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
734    v16u8 const6 = (v16u8) __msa_ldi_b(6);
735    v16u8 const3 = (v16u8) __msa_ldi_b(3);
736    v8u16 const20 = (v8u16) __msa_ldi_h(20);
737
738    for (loop_count = (height >> 2); loop_count--;) {
739        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
740        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
741        src += (4 * src_stride);
742        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
743                                               const20, const6, const3);
744        res = __msa_ave_u_b(res, inp1);
745        ST_UB(res, dst);
746        dst += dst_stride;
747
748        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
749                                               const20, const6, const3);
750        res = __msa_ave_u_b(res, inp3);
751        ST_UB(res, dst);
752        dst += dst_stride;
753
754        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
755                                               const20, const6, const3);
756        res = __msa_ave_u_b(res, inp5);
757        ST_UB(res, dst);
758        dst += dst_stride;
759
760        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
761                                               const20, const6, const3);
762        res = __msa_ave_u_b(res, inp7);
763        ST_UB(res, dst);
764        dst += dst_stride;
765    }
766}
767
768static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
769                                                       int32_t src_stride,
770                                                       uint8_t *dst,
771                                                       int32_t dst_stride,
772                                                       int32_t height)
773{
774    uint8_t loop_count;
775    v16u8 inp0, inp1, inp2, inp3;
776    v16u8 dst0, dst1, dst2, dst3;
777    v16u8 res0, res1;
778    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
779    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
780    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
781    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
782    v16u8 const20 = (v16u8) __msa_ldi_b(20);
783    v16u8 const6 = (v16u8) __msa_ldi_b(6);
784    v16u8 const3 = (v16u8) __msa_ldi_b(3);
785
786    for (loop_count = (height >> 2); loop_count--;) {
787        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
788        src += (4 * src_stride);
789        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
790                                             mask0, mask1, mask2, mask3,
791                                             const20, const6, const3);
792        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
793                                             mask0, mask1, mask2, mask3,
794                                             const20, const6, const3);
795        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
796        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
797        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
798        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
799        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
800        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
801        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
802        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
803        dst += (4 * dst_stride);
804    }
805}
806
807static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
808                                                        int32_t src_stride,
809                                                        uint8_t *dst,
810                                                        int32_t dst_stride,
811                                                        int32_t height)
812{
813    uint8_t loop_count;
814    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
815    v16u8 res0, res1;
816    v16u8 dst0, dst1;
817    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
818    v16u8 const6 = (v16u8) __msa_ldi_b(6);
819    v16u8 const3 = (v16u8) __msa_ldi_b(3);
820    v8u16 const20 = (v8u16) __msa_ldi_h(20);
821
822    for (loop_count = (height >> 2); loop_count--;) {
823        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
824        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
825        src += (4 * src_stride);
826        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
827                                       const20, const6, const3);
828        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
829                                       const20, const6, const3);
830        LD_UB2(dst, dst_stride, dst0, dst1);
831        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
832        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
833        ST_UB2(res0, res1, dst, dst_stride);
834        dst += (2 * dst_stride);
835
836        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
837                                       const20, const6, const3);
838        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
839                                       const20, const6, const3);
840        LD_UB2(dst, dst_stride, dst0, dst1);
841        AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
842        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
843        ST_UB2(res0, res1, dst, dst_stride);
844        dst += (2 * dst_stride);
845    }
846}
847
848static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
849                                             int32_t src_stride,
850                                             uint8_t *dst,
851                                             int32_t dst_stride,
852                                             int32_t height)
853{
854    uint8_t loop_count;
855    v16u8 inp0, inp1, inp2, inp3;
856    v16u8 dst0, dst1, dst2, dst3;
857    v16u8 res0, res1;
858    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
859    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
860    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
861    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
862    v16u8 const20 = (v16u8) __msa_ldi_b(20);
863    v16u8 const6 = (v16u8) __msa_ldi_b(6);
864    v16u8 const3 = (v16u8) __msa_ldi_b(3);
865
866    for (loop_count = (height >> 2); loop_count--;) {
867        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
868        src += (4 * src_stride);
869        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
870                                             mask0, mask1, mask2, mask3,
871                                             const20, const6, const3);
872        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
873                                             mask0, mask1, mask2, mask3,
874                                             const20, const6, const3);
875        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
876        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
877        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
878        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
879        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
880        dst += (4 * dst_stride);
881    }
882}
883
884static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
885                                              int32_t src_stride,
886                                              uint8_t *dst,
887                                              int32_t dst_stride,
888                                              int32_t height)
889{
890    uint8_t loop_count;
891    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
892    v16u8 res0, res1;
893    v16u8 dst0, dst1;
894    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
895    v16u8 const6 = (v16u8) __msa_ldi_b(6);
896    v16u8 const3 = (v16u8) __msa_ldi_b(3);
897    v8u16 const20 = (v8u16) __msa_ldi_h(20);
898
899    for (loop_count = (height >> 2); loop_count--;) {
900        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
901        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
902        src += (4 * src_stride);
903        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
904                                       const20, const6, const3);
905        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
906                                       const20, const6, const3);
907        LD_UB2(dst, dst_stride, dst0, dst1);
908        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
909        ST_UB2(res0, res1, dst, dst_stride);
910        dst += (2 * dst_stride);
911
912        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
913                                       const20, const6, const3);
914        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
915                                       const20, const6, const3);
916        LD_UB2(dst, dst_stride, dst0, dst1);
917        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
918        ST_UB2(res0, res1, dst, dst_stride);
919        dst += (2 * dst_stride);
920    }
921}
922
923static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
924                                                       int32_t src_stride,
925                                                       uint8_t *dst,
926                                                       int32_t dst_stride,
927                                                       int32_t height)
928{
929    uint8_t loop_count;
930    v16u8 inp0, inp1, inp2, inp3;
931    v16u8 dst0, dst1, dst2, dst3;
932    v16u8 res0, res1;
933    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
934    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
935    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
936    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
937    v16u8 const20 = (v16u8) __msa_ldi_b(20);
938    v16u8 const6 = (v16u8) __msa_ldi_b(6);
939    v16u8 const3 = (v16u8) __msa_ldi_b(3);
940
941    for (loop_count = (height >> 2); loop_count--;) {
942        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
943        src += (4 * src_stride);
944        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
945                                             mask0, mask1, mask2, mask3,
946                                             const20, const6, const3);
947        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
948                                             mask0, mask1, mask2, mask3,
949                                             const20, const6, const3);
950        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
951        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
952                   inp0, inp1, inp2, inp3);
953        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
954        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
955        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
956        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
957        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
958        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
959        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
960        dst += (4 * dst_stride);
961    }
962}
963
964static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
965                                                        int32_t src_stride,
966                                                        uint8_t *dst,
967                                                        int32_t dst_stride,
968                                                        int32_t height)
969{
970    uint8_t loop_count;
971    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
972    v16u8 res0, res1, dst0, dst1;
973    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
974    v16u8 const6 = (v16u8) __msa_ldi_b(6);
975    v16u8 const3 = (v16u8) __msa_ldi_b(3);
976    v8u16 const20 = (v8u16) __msa_ldi_h(20);
977
978    for (loop_count = (height >> 2); loop_count--;) {
979        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
980        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
981        src += (4 * src_stride);
982        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
983                                       const20, const6, const3);
984        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
985                                       const20, const6, const3);
986        LD_UB2(dst, dst_stride, dst0, dst1);
987        AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
988        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
989        ST_UB2(res0, res1, dst, dst_stride);
990        dst += (2 * dst_stride);
991        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
992                                       const20, const6, const3);
993        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
994                                       const20, const6, const3);
995        LD_UB2(dst, dst_stride, dst0, dst1);
996        AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
997        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
998        ST_UB2(res0, res1, dst, dst_stride);
999        dst += (2 * dst_stride);
1000    }
1001}
1002
1003
1004static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
1005                                           int32_t src_stride,
1006                                           uint8_t *dst,
1007                                           int32_t dst_stride)
1008{
1009    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1010    v16u8 tmp0, tmp1, res0, res1;
1011    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1012    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1013    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1014
1015    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1016    src += (4 * src_stride);
1017    LD_UB2(src, src_stride, inp4, inp5);
1018    src += (2 * src_stride);
1019    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1020                                        inp1, inp2, inp3, inp4,
1021                                        inp1, inp0, inp0, inp1,
1022                                        inp2, inp3, inp4, inp5,
1023                                        const20, const6, const3);
1024    LD_UB2(src, src_stride, inp6, inp7);
1025    src += (2 * src_stride);
1026    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1027                                        inp3, inp4, inp5, inp6,
1028                                        inp3, inp2, inp1, inp0,
1029                                        inp4, inp5, inp6, inp7,
1030                                        const20, const6, const3);
1031    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1032    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1033    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1034    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1035
1036    inp8 = LD_UB(src);
1037    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1038                                        inp5, inp6, inp7, inp8,
1039                                        inp5, inp4, inp3, inp2,
1040                                        inp6, inp7, inp8, inp8,
1041                                        const20, const6, const3);
1042    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1043                                        inp7, inp8, inp8, inp7,
1044                                        inp7, inp6, inp5, inp4,
1045                                        inp8, inp8, inp7, inp6,
1046                                        const20, const6, const3);
1047    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1048    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1049    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1050    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1051}
1052
1053static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
1054                                             int32_t src_stride,
1055                                             uint8_t *dst,
1056                                             int32_t dst_stride)
1057{
1058    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1059    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1060    v16u8 res0;
1061    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1062    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1063    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1064
1065    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1066    src += (5 * src_stride);
1067    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1068                                  inp1, inp2, inp3, inp4,
1069                                  const20, const6, const3);
1070    res0 = __msa_aver_u_b(res0, inp0);
1071    ST_UB(res0, dst);
1072    dst += dst_stride;
1073
1074    inp5 = LD_UB(src);
1075    src += src_stride;
1076    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1077                                  inp2, inp3, inp4, inp5,
1078                                  const20, const6, const3);
1079    res0 = __msa_aver_u_b(res0, inp1);
1080    ST_UB(res0, dst);
1081    dst += dst_stride;
1082
1083    inp6 = LD_UB(src);
1084    src += src_stride;
1085    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1086                                  inp3, inp4, inp5, inp6,
1087                                  const20, const6, const3);
1088    res0 = __msa_aver_u_b(res0, inp2);
1089    ST_UB(res0, dst);
1090    dst += dst_stride;
1091
1092    inp7 = LD_UB(src);
1093    src += src_stride;
1094    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1095                                  inp4, inp5, inp6, inp7,
1096                                  const20, const6, const3);
1097    res0 = __msa_aver_u_b(res0, inp3);
1098    ST_UB(res0, dst);
1099    dst += dst_stride;
1100
1101    LD_UB2(src, src_stride, inp8, inp9);
1102    src += (2 * src_stride);
1103    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1104                                  inp5, inp6, inp7, inp8,
1105                                  const20, const6, const3);
1106    res0 = __msa_aver_u_b(res0, inp4);
1107    ST_UB(res0, dst);
1108    dst += dst_stride;
1109
1110    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1111                                  inp6, inp7, inp8, inp9,
1112                                  const20, const6, const3);
1113    res0 = __msa_aver_u_b(res0, inp5);
1114    ST_UB(res0, dst);
1115    dst += dst_stride;
1116
1117    LD_UB2(src, src_stride, inp10, inp11);
1118    src += (2 * src_stride);
1119    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1120                                  inp7, inp8, inp9, inp10,
1121                                  const20, const6, const3);
1122    res0 = __msa_aver_u_b(res0, inp6);
1123    ST_UB(res0, dst);
1124    dst += dst_stride;
1125
1126    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1127                                  inp8, inp9, inp10, inp11,
1128                                  const20, const6, const3);
1129    res0 = __msa_aver_u_b(res0, inp7);
1130    ST_UB(res0, dst);
1131    dst += dst_stride;
1132
1133    LD_UB2(src, src_stride, inp12, inp13);
1134    src += (2 * src_stride);
1135    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1136                                  inp9, inp10, inp11, inp12,
1137                                  const20, const6, const3);
1138    res0 = __msa_aver_u_b(res0, inp8);
1139    ST_UB(res0, dst);
1140    dst += dst_stride;
1141
1142    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1143                                  inp10, inp11, inp12, inp13,
1144                                  const20, const6, const3);
1145    res0 = __msa_aver_u_b(res0, inp9);
1146    ST_UB(res0, dst);
1147    dst += dst_stride;
1148
1149    LD_UB2(src, src_stride, inp14, inp15);
1150    src += (2 * src_stride);
1151    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1152                                  inp11, inp12, inp13, inp14,
1153                                  const20, const6, const3);
1154    res0 = __msa_aver_u_b(res0, inp10);
1155    ST_UB(res0, dst);
1156    dst += dst_stride;
1157
1158    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1159                                  inp12, inp13, inp14, inp15,
1160                                  const20, const6, const3);
1161    res0 = __msa_aver_u_b(res0, inp11);
1162    ST_UB(res0, dst);
1163    dst += dst_stride;
1164
1165    inp16 = LD_UB(src);
1166    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1167                                  inp13, inp14, inp15, inp16,
1168                                  const20, const6, const3);
1169    res0 = __msa_aver_u_b(res0, inp12);
1170    ST_UB(res0, dst);
1171    dst += dst_stride;
1172
1173    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1174                                  inp14, inp15, inp16, inp16,
1175                                  const20, const6, const3);
1176    res0 = __msa_aver_u_b(res0, inp13);
1177    ST_UB(res0, dst);
1178    dst += dst_stride;
1179
1180    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1181                                  inp15, inp16, inp16, inp15,
1182                                  const20, const6, const3);
1183    res0 = __msa_aver_u_b(res0, inp14);
1184    ST_UB(res0, dst);
1185    dst += dst_stride;
1186
1187    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1188                                  inp16, inp16, inp15, inp14,
1189                                  const20, const6, const3);
1190    res0 = __msa_aver_u_b(res0, inp15);
1191    ST_UB(res0, dst);
1192}
1193
1194static void vert_mc_qpel_8x8_msa(const uint8_t *src,
1195                                 int32_t src_stride,
1196                                 uint8_t *dst,
1197                                 int32_t dst_stride)
1198{
1199    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1200    v16u8 res0, res1;
1201    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1202    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1203    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1204
1205    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1206    src += (4 * src_stride);
1207    LD_UB2(src, src_stride, inp4, inp5);
1208    src += (2 * src_stride);
1209    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1210                                        inp1, inp2, inp3, inp4,
1211                                        inp1, inp0, inp0, inp1,
1212                                        inp2, inp3, inp4, inp5,
1213                                        const20, const6, const3);
1214    LD_UB2(src, src_stride, inp6, inp7);
1215    src += (2 * src_stride);
1216    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1217                                        inp3, inp4, inp5, inp6,
1218                                        inp3, inp2, inp1, inp0,
1219                                        inp4, inp5, inp6, inp7,
1220                                        const20, const6, const3);
1221    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1222
1223    inp8 = LD_UB(src);
1224    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1225                                        inp5, inp6, inp7, inp8,
1226                                        inp5, inp4, inp3, inp2,
1227                                        inp6, inp7, inp8, inp8,
1228                                        const20, const6, const3);
1229    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1230                                        inp7, inp8, inp8, inp7,
1231                                        inp7, inp6, inp5, inp4,
1232                                        inp8, inp8, inp7, inp6,
1233                                        const20, const6, const3);
1234    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1235}
1236
1237static void vert_mc_qpel_16x16_msa(const uint8_t *src,
1238                                   int32_t src_stride,
1239                                   uint8_t *dst,
1240                                   int32_t dst_stride)
1241{
1242    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1243    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1244    v16u8 res0;
1245    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1246    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1247    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1248
1249    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1250    src += (4 * src_stride);
1251    inp4 = LD_UB(src);
1252    src += src_stride;
1253    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1254                                  inp1, inp2, inp3, inp4,
1255                                  const20, const6, const3);
1256    ST_UB(res0, dst);
1257    dst += dst_stride;
1258
1259    inp5 = LD_UB(src);
1260    src += src_stride;
1261    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1262                                  inp2, inp3, inp4, inp5,
1263                                  const20, const6, const3);
1264    ST_UB(res0, dst);
1265    dst += dst_stride;
1266
1267    inp6 = LD_UB(src);
1268    src += src_stride;
1269    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1270                                  inp3, inp4, inp5, inp6,
1271                                  const20, const6, const3);
1272    ST_UB(res0, dst);
1273    dst += dst_stride;
1274
1275    inp7 = LD_UB(src);
1276    src += src_stride;
1277    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1278                                  inp4, inp5, inp6, inp7,
1279                                  const20, const6, const3);
1280    ST_UB(res0, dst);
1281    dst += dst_stride;
1282
1283    inp8 = LD_UB(src);
1284    src += src_stride;
1285    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1286                                  inp5, inp6, inp7, inp8,
1287                                  const20, const6, const3);
1288    ST_UB(res0, dst);
1289    dst += dst_stride;
1290
1291    inp9 = LD_UB(src);
1292    src += src_stride;
1293    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1294                                  inp6, inp7, inp8, inp9,
1295                                  const20, const6, const3);
1296    ST_UB(res0, dst);
1297    dst += dst_stride;
1298
1299    inp10 = LD_UB(src);
1300    src += src_stride;
1301    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1302                                  inp7, inp8, inp9, inp10,
1303                                  const20, const6, const3);
1304    ST_UB(res0, dst);
1305    dst += dst_stride;
1306
1307    inp11 = LD_UB(src);
1308    src += src_stride;
1309    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1310                                  inp8, inp9, inp10, inp11,
1311                                  const20, const6, const3);
1312    ST_UB(res0, dst);
1313    dst += dst_stride;
1314
1315    inp12 = LD_UB(src);
1316    src += src_stride;
1317    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1318                                  inp9, inp10, inp11, inp12,
1319                                  const20, const6, const3);
1320    ST_UB(res0, dst);
1321    dst += dst_stride;
1322
1323    inp13 = LD_UB(src);
1324    src += src_stride;
1325    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1326                                  inp10, inp11, inp12, inp13,
1327                                  const20, const6, const3);
1328    ST_UB(res0, dst);
1329    dst += dst_stride;
1330
1331    inp14 = LD_UB(src);
1332    src += src_stride;
1333    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1334                                  inp11, inp12, inp13, inp14,
1335                                  const20, const6, const3);
1336    ST_UB(res0, dst);
1337    dst += dst_stride;
1338
1339    inp15 = LD_UB(src);
1340    src += src_stride;
1341    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1342                                  inp12, inp13, inp14, inp15,
1343                                  const20, const6, const3);
1344    ST_UB(res0, dst);
1345    dst += dst_stride;
1346
1347    inp16 = LD_UB(src);
1348    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1349                                  inp13, inp14, inp15, inp16,
1350                                  const20, const6, const3);
1351    ST_UB(res0, dst);
1352    dst += dst_stride;
1353
1354    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1355                                  inp14, inp15, inp16, inp16,
1356                                  const20, const6, const3);
1357    ST_UB(res0, dst);
1358    dst += dst_stride;
1359
1360    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1361                                  inp15, inp16, inp16, inp15,
1362                                  const20, const6, const3);
1363    ST_UB(res0, dst);
1364    dst += dst_stride;
1365
1366    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1367                                  inp16, inp16, inp15, inp14,
1368                                  const20, const6, const3);
1369    ST_UB(res0, dst);
1370    dst += dst_stride;
1371}
1372
1373static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src,
1374                                           int32_t src_stride,
1375                                           uint8_t *dst,
1376                                           int32_t dst_stride)
1377{
1378    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1379    v16u8 tmp0, tmp1, res0, res1;
1380    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1381    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1382    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1383
1384    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1385    src += (4 * src_stride);
1386    LD_UB2(src, src_stride, inp4, inp5);
1387    src += (2 * src_stride);
1388    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1389                                        inp1, inp2, inp3, inp4,
1390                                        inp1, inp0, inp0, inp1,
1391                                        inp2, inp3, inp4, inp5,
1392                                        const20, const6, const3);
1393
1394    LD_UB2(src, src_stride, inp6, inp7);
1395    src += (2 * src_stride);
1396    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1397                                        inp3, inp4, inp5, inp6,
1398                                        inp3, inp2, inp1, inp0,
1399                                        inp4, inp5, inp6, inp7,
1400                                        const20, const6, const3);
1401    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1402    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1403    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1404    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1405
1406    inp8 = LD_UB(src);
1407    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1408                                        inp5, inp6, inp7, inp8,
1409                                        inp5, inp4, inp3, inp2,
1410                                        inp6, inp7, inp8, inp8,
1411                                        const20, const6, const3);
1412    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1413                                        inp7, inp8, inp8, inp7,
1414                                        inp7, inp6, inp5, inp4,
1415                                        inp8, inp8, inp7, inp6,
1416                                        const20, const6, const3);
1417    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1418    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1419    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1420    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1421}
1422
1423static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
1424                                             int32_t src_stride,
1425                                             uint8_t *dst,
1426                                             int32_t dst_stride)
1427{
1428    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1429    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1430    v16u8 res0;
1431    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1432    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1433    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1434
1435    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1436    src += (4 * src_stride);
1437    inp4 = LD_UB(src);
1438    src += src_stride;
1439    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1440                                  inp1, inp2, inp3, inp4,
1441                                  const20, const6, const3);
1442    res0 = __msa_aver_u_b(res0, inp1);
1443    ST_UB(res0, dst);
1444    dst += dst_stride;
1445
1446    inp5 = LD_UB(src);
1447    src += src_stride;
1448    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1449                                  inp2, inp3, inp4, inp5,
1450                                  const20, const6, const3);
1451    res0 = __msa_aver_u_b(res0, inp2);
1452    ST_UB(res0, dst);
1453    dst += dst_stride;
1454
1455    inp6 = LD_UB(src);
1456    src += src_stride;
1457    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1458                                  inp3, inp4, inp5, inp6,
1459                                  const20, const6, const3);
1460    res0 = __msa_aver_u_b(res0, inp3);
1461    ST_UB(res0, dst);
1462    dst += dst_stride;
1463
1464    inp7 = LD_UB(src);
1465    src += src_stride;
1466    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1467                                  inp4, inp5, inp6, inp7,
1468                                  const20, const6, const3);
1469    res0 = __msa_aver_u_b(res0, inp4);
1470    ST_UB(res0, dst);
1471    dst += dst_stride;
1472
1473    inp8 = LD_UB(src);
1474    src += src_stride;
1475    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1476                                  inp5, inp6, inp7, inp8,
1477                                  const20, const6, const3);
1478    res0 = __msa_aver_u_b(res0, inp5);
1479    ST_UB(res0, dst);
1480    dst += dst_stride;
1481
1482    inp9 = LD_UB(src);
1483    src += src_stride;
1484    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1485                                  inp6, inp7, inp8, inp9,
1486                                  const20, const6, const3);
1487    res0 = __msa_aver_u_b(res0, inp6);
1488    ST_UB(res0, dst);
1489    dst += dst_stride;
1490
1491    inp10 = LD_UB(src);
1492    src += src_stride;
1493    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1494                                  inp7, inp8, inp9, inp10,
1495                                  const20, const6, const3);
1496    res0 = __msa_aver_u_b(res0, inp7);
1497    ST_UB(res0, dst);
1498    dst += dst_stride;
1499
1500    inp11 = LD_UB(src);
1501    src += src_stride;
1502    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1503                                  inp8, inp9, inp10, inp11,
1504                                  const20, const6, const3);
1505    res0 = __msa_aver_u_b(res0, inp8);
1506    ST_UB(res0, dst);
1507    dst += dst_stride;
1508
1509    inp12 = LD_UB(src);
1510    src += src_stride;
1511    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1512                                  inp9, inp10, inp11, inp12,
1513                                  const20, const6, const3);
1514    res0 = __msa_aver_u_b(res0, inp9);
1515    ST_UB(res0, dst);
1516    dst += dst_stride;
1517
1518    inp13 = LD_UB(src);
1519    src += src_stride;
1520    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1521                                  inp10, inp11, inp12, inp13,
1522                                  const20, const6, const3);
1523    res0 = __msa_aver_u_b(res0, inp10);
1524    ST_UB(res0, dst);
1525    dst += dst_stride;
1526
1527    inp14 = LD_UB(src);
1528    src += src_stride;
1529    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1530                                  inp11, inp12, inp13, inp14,
1531                                  const20, const6, const3);
1532    res0 = __msa_aver_u_b(res0, inp11);
1533    ST_UB(res0, dst);
1534    dst += dst_stride;
1535
1536    inp15 = LD_UB(src);
1537    src += src_stride;
1538    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1539                                  inp12, inp13, inp14, inp15,
1540                                  const20, const6, const3);
1541    res0 = __msa_aver_u_b(res0, inp12);
1542    ST_UB(res0, dst);
1543    dst += dst_stride;
1544
1545    inp16 = LD_UB(src);
1546    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1547                                  inp13, inp14, inp15, inp16,
1548                                  const20, const6, const3);
1549    res0 = __msa_aver_u_b(res0, inp13);
1550    ST_UB(res0, dst);
1551    dst += dst_stride;
1552
1553    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1554                                  inp14, inp15, inp16, inp16,
1555                                  const20, const6, const3);
1556    res0 = __msa_aver_u_b(res0, inp14);
1557    ST_UB(res0, dst);
1558    dst += dst_stride;
1559
1560    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1561                                  inp15, inp16, inp16, inp15,
1562                                  const20, const6, const3);
1563    res0 = __msa_aver_u_b(res0, inp15);
1564    ST_UB(res0, dst);
1565    dst += dst_stride;
1566
1567    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1568                                  inp16, inp16, inp15, inp14,
1569                                  const20, const6, const3);
1570    res0 = __msa_aver_u_b(res0, inp16);
1571    ST_UB(res0, dst);
1572}
1573
1574static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src,
1575                                                  int32_t src_stride,
1576                                                  uint8_t *dst,
1577                                                  int32_t dst_stride)
1578{
1579    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1580    v16u8 tmp0, tmp1, res0, res1;
1581    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1582    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1583    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1584
1585    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1586    src += (4 * src_stride);
1587    LD_UB2(src, src_stride, inp4, inp5);
1588    src += (2 * src_stride);
1589    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1590                                                 inp1, inp2, inp3, inp4,
1591                                                 inp1, inp0, inp0, inp1,
1592                                                 inp2, inp3, inp4, inp5,
1593                                                 const20, const6, const3);
1594    LD_UB2(src, src_stride, inp6, inp7);
1595    src += (2 * src_stride);
1596    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1597                                                 inp3, inp4, inp5, inp6,
1598                                                 inp3, inp2, inp1, inp0,
1599                                                 inp4, inp5, inp6, inp7,
1600                                                 const20, const6, const3);
1601    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1602    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1603    res0 = __msa_ave_u_b(res0, tmp0);
1604    res1 = __msa_ave_u_b(res1, tmp1);
1605    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1606
1607    inp8 = LD_UB(src);
1608    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1609                                                 inp5, inp6, inp7, inp8,
1610                                                 inp5, inp4, inp3, inp2,
1611                                                 inp6, inp7, inp8, inp8,
1612                                                 const20, const6, const3);
1613    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1614                                                 inp7, inp8, inp8, inp7,
1615                                                 inp7, inp6, inp5, inp4,
1616                                                 inp8, inp8, inp7, inp6,
1617                                                 const20, const6, const3);
1618    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1619    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1620    res0 = __msa_ave_u_b(res0, tmp0);
1621    res1 = __msa_ave_u_b(res1, tmp1);
1622    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1623}
1624
1625static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
1626                                                    int32_t src_stride,
1627                                                    uint8_t *dst,
1628                                                    int32_t dst_stride)
1629{
1630    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1631    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1632    v16u8 res0;
1633    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1634    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1635    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1636
1637    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1638    src += (5 * src_stride);
1639    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1640                                           inp1, inp2, inp3, inp4,
1641                                           const20, const6, const3);
1642    res0 = __msa_ave_u_b(res0, inp0);
1643    ST_UB(res0, dst);
1644    dst += dst_stride;
1645
1646    inp5 = LD_UB(src);
1647    src += src_stride;
1648    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1649                                           inp2, inp3, inp4, inp5,
1650                                           const20, const6, const3);
1651    res0 = __msa_ave_u_b(res0, inp1);
1652    ST_UB(res0, dst);
1653    dst += dst_stride;
1654
1655    inp6 = LD_UB(src);
1656    src += src_stride;
1657    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1658                                           inp3, inp4, inp5, inp6,
1659                                           const20, const6, const3);
1660    res0 = __msa_ave_u_b(res0, inp2);
1661    ST_UB(res0, dst);
1662    dst += dst_stride;
1663
1664    inp7 = LD_UB(src);
1665    src += src_stride;
1666    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1667                                           inp4, inp5, inp6, inp7,
1668                                           const20, const6, const3);
1669    res0 = __msa_ave_u_b(res0, inp3);
1670    ST_UB(res0, dst);
1671    dst += dst_stride;
1672
1673    inp8 = LD_UB(src);
1674    src += src_stride;
1675    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1676                                           inp5, inp6, inp7, inp8,
1677                                           const20, const6, const3);
1678    res0 = __msa_ave_u_b(res0, inp4);
1679    ST_UB(res0, dst);
1680    dst += dst_stride;
1681
1682    inp9 = LD_UB(src);
1683    src += src_stride;
1684    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1685                                           inp6, inp7, inp8, inp9,
1686                                           const20, const6, const3);
1687    res0 = __msa_ave_u_b(res0, inp5);
1688    ST_UB(res0, dst);
1689    dst += dst_stride;
1690
1691    inp10 = LD_UB(src);
1692    src += src_stride;
1693    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1694                                           inp7, inp8, inp9, inp10,
1695                                           const20, const6, const3);
1696    res0 = __msa_ave_u_b(res0, inp6);
1697    ST_UB(res0, dst);
1698    dst += dst_stride;
1699
1700    inp11 = LD_UB(src);
1701    src += src_stride;
1702    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1703                                           inp8, inp9, inp10, inp11,
1704                                           const20, const6, const3);
1705    res0 = __msa_ave_u_b(res0, inp7);
1706    ST_UB(res0, dst);
1707    dst += dst_stride;
1708
1709    inp12 = LD_UB(src);
1710    src += src_stride;
1711    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1712                                           inp9, inp10, inp11, inp12,
1713                                           const20, const6, const3);
1714    res0 = __msa_ave_u_b(res0, inp8);
1715    ST_UB(res0, dst);
1716    dst += dst_stride;
1717
1718    inp13 = LD_UB(src);
1719    src += src_stride;
1720    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1721                                           inp10, inp11, inp12, inp13,
1722                                           const20, const6, const3);
1723    res0 = __msa_ave_u_b(res0, inp9);
1724    ST_UB(res0, dst);
1725    dst += dst_stride;
1726
1727    inp14 = LD_UB(src);
1728    src += src_stride;
1729    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1730                                           inp11, inp12, inp13, inp14,
1731                                           const20, const6, const3);
1732    res0 = __msa_ave_u_b(res0, inp10);
1733    ST_UB(res0, dst);
1734    dst += dst_stride;
1735
1736    inp15 = LD_UB(src);
1737    src += src_stride;
1738    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1739                                           inp12, inp13, inp14, inp15,
1740                                           const20, const6, const3);
1741    res0 = __msa_ave_u_b(res0, inp11);
1742    ST_UB(res0, dst);
1743    dst += dst_stride;
1744
1745    inp16 = LD_UB(src);
1746    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1747                                           inp13, inp14, inp15, inp16,
1748                                           const20, const6, const3);
1749    res0 = __msa_ave_u_b(res0, inp12);
1750    ST_UB(res0, dst);
1751    dst += dst_stride;
1752
1753    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1754                                           inp14, inp15, inp16, inp16,
1755                                           const20, const6, const3);
1756    res0 = __msa_ave_u_b(res0, inp13);
1757    ST_UB(res0, dst);
1758    dst += dst_stride;
1759
1760    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1761                                           inp15, inp16, inp16, inp15,
1762                                           const20, const6, const3);
1763    res0 = __msa_ave_u_b(res0, inp14);
1764    ST_UB(res0, dst);
1765    dst += dst_stride;
1766
1767    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1768                                           inp16, inp16, inp15, inp14,
1769                                           const20, const6, const3);
1770    res0 = __msa_ave_u_b(res0, inp15);
1771    ST_UB(res0, dst);
1772    dst += dst_stride;
1773}
1774
1775static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
1776                                        int32_t src_stride,
1777                                        uint8_t *dst,
1778                                        int32_t dst_stride)
1779{
1780    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1781    v16u8 res0, res1;
1782    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1783    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1784    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1785
1786    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1787    src += (4 * src_stride);
1788    LD_UB2(src, src_stride, inp4, inp5);
1789    src += (2 * src_stride);
1790    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1791                                                 inp1, inp2, inp3, inp4,
1792                                                 inp1, inp0, inp0, inp1,
1793                                                 inp2, inp3, inp4, inp5,
1794                                                 const20, const6, const3);
1795    LD_UB2(src, src_stride, inp6, inp7);
1796    src += (2 * src_stride);
1797    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1798                                                 inp3, inp4, inp5, inp6,
1799                                                 inp3, inp2, inp1, inp0,
1800                                                 inp4, inp5, inp6, inp7,
1801                                                 const20, const6, const3);
1802    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1803
1804    inp8 = LD_UB(src);
1805    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1806                                                 inp5, inp6, inp7, inp8,
1807                                                 inp5, inp4, inp3, inp2,
1808                                                 inp6, inp7, inp8, inp8,
1809                                                 const20, const6, const3);
1810    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1811                                                 inp7, inp8, inp8, inp7,
1812                                                 inp7, inp6, inp5, inp4,
1813                                                 inp8, inp8, inp7, inp6,
1814                                                 const20, const6, const3);
1815    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1816}
1817
1818static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
1819                                          int32_t src_stride,
1820                                          uint8_t *dst,
1821                                          int32_t dst_stride)
1822{
1823    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1824    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1825    v16u8 res0;
1826    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1827    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1828    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1829
1830    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1831    src += (5 * src_stride);
1832    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1833                                           inp1, inp2, inp3, inp4,
1834                                           const20, const6, const3);
1835    ST_UB(res0, dst);
1836    dst += dst_stride;
1837
1838    inp5 = LD_UB(src);
1839    src += src_stride;
1840    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1841                                           inp2, inp3, inp4, inp5,
1842                                           const20, const6, const3);
1843    ST_UB(res0, dst);
1844    dst += dst_stride;
1845
1846    inp6 = LD_UB(src);
1847    src += src_stride;
1848    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1849                                           inp3, inp4, inp5, inp6,
1850                                           const20, const6, const3);
1851    ST_UB(res0, dst);
1852    dst += dst_stride;
1853
1854    inp7 = LD_UB(src);
1855    src += src_stride;
1856    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1857                                           inp4, inp5, inp6, inp7,
1858                                           const20, const6, const3);
1859    ST_UB(res0, dst);
1860    dst += dst_stride;
1861
1862    inp8 = LD_UB(src);
1863    src += src_stride;
1864    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1865                                           inp5, inp6, inp7, inp8,
1866                                           const20, const6, const3);
1867    ST_UB(res0, dst);
1868    dst += dst_stride;
1869
1870    inp9 = LD_UB(src);
1871    src += src_stride;
1872    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1873                                           inp6, inp7, inp8, inp9,
1874                                           const20, const6, const3);
1875    ST_UB(res0, dst);
1876    dst += dst_stride;
1877
1878    inp10 = LD_UB(src);
1879    src += src_stride;
1880    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1881                                           inp7, inp8, inp9, inp10,
1882                                           const20, const6, const3);
1883    ST_UB(res0, dst);
1884    dst += dst_stride;
1885
1886    inp11 = LD_UB(src);
1887    src += src_stride;
1888    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1889                                           inp8, inp9, inp10, inp11,
1890                                           const20, const6, const3);
1891    ST_UB(res0, dst);
1892    dst += dst_stride;
1893
1894    inp12 = LD_UB(src);
1895    src += src_stride;
1896    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1897                                           inp9, inp10, inp11, inp12,
1898                                           const20, const6, const3);
1899    ST_UB(res0, dst);
1900    dst += dst_stride;
1901
1902    inp13 = LD_UB(src);
1903    src += src_stride;
1904    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1905                                           inp10, inp11, inp12, inp13,
1906                                           const20, const6, const3);
1907    ST_UB(res0, dst);
1908    dst += dst_stride;
1909
1910    inp14 = LD_UB(src);
1911    src += src_stride;
1912    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1913                                           inp11, inp12, inp13, inp14,
1914                                           const20, const6, const3);
1915    ST_UB(res0, dst);
1916    dst += dst_stride;
1917
1918    inp15 = LD_UB(src);
1919    src += src_stride;
1920    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1921                                           inp12, inp13, inp14, inp15,
1922                                           const20, const6, const3);
1923    ST_UB(res0, dst);
1924    dst += dst_stride;
1925
1926    inp16 = LD_UB(src);
1927    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1928                                           inp13, inp14, inp15, inp16,
1929                                           const20, const6, const3);
1930    ST_UB(res0, dst);
1931    dst += dst_stride;
1932
1933    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1934                                           inp14, inp15, inp16, inp16,
1935                                           const20, const6, const3);
1936    ST_UB(res0, dst);
1937    dst += dst_stride;
1938
1939    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1940                                           inp15, inp16, inp16, inp15,
1941                                           const20, const6, const3);
1942    ST_UB(res0, dst);
1943    dst += dst_stride;
1944
1945    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1946                                           inp16, inp16, inp15, inp14,
1947                                           const20, const6, const3);
1948    ST_UB(res0, dst);
1949}
1950
1951static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src,
1952                                                  int32_t src_stride,
1953                                                  uint8_t *dst,
1954                                                  int32_t dst_stride)
1955{
1956    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1957    v16u8 tmp0, tmp1, res0, res1;
1958    v16u8 const20 = (v16u8) __msa_ldi_b(20);
1959    v16u8 const6 = (v16u8) __msa_ldi_b(6);
1960    v16u8 const3 = (v16u8) __msa_ldi_b(3);
1961
1962    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1963    src += (4 * src_stride);
1964    LD_UB2(src, src_stride, inp4, inp5);
1965    src += (2 * src_stride);
1966    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1967                                                 inp1, inp2, inp3, inp4,
1968                                                 inp1, inp0, inp0, inp1,
1969                                                 inp2, inp3, inp4, inp5,
1970                                                 const20, const6, const3);
1971    LD_UB2(src, src_stride, inp6, inp7);
1972    src += (2 * src_stride);
1973    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1974                                                 inp3, inp4, inp5, inp6,
1975                                                 inp3, inp2, inp1, inp0,
1976                                                 inp4, inp5, inp6, inp7,
1977                                                 const20, const6, const3);
1978    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1979    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1980    res0 = __msa_ave_u_b(res0, tmp0);
1981    res1 = __msa_ave_u_b(res1, tmp1);
1982    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1983
1984    inp8 = LD_UB(src);
1985    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1986                                                 inp5, inp6, inp7, inp8,
1987                                                 inp5, inp4, inp3, inp2,
1988                                                 inp6, inp7, inp8, inp8,
1989                                                 const20, const6, const3);
1990    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1991                                                 inp7, inp8, inp8, inp7,
1992                                                 inp7, inp6, inp5, inp4,
1993                                                 inp8, inp8, inp7, inp6,
1994                                                 const20, const6, const3);
1995    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1996    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1997    res0 = __msa_ave_u_b(res0, tmp0);
1998    res1 = __msa_ave_u_b(res1, tmp1);
1999    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2000}
2001
2002static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
2003                                                    int32_t src_stride,
2004                                                    uint8_t *dst,
2005                                                    int32_t dst_stride)
2006{
2007    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2008    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2009    v16u8 res0;
2010    v16u8 const20 = (v16u8) __msa_ldi_b(20);
2011    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2012    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2013
2014    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2015    src += (5 * src_stride);
2016    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
2017                                           inp1, inp2, inp3, inp4,
2018                                           const20, const6, const3);
2019    res0 = __msa_ave_u_b(res0, inp1);
2020    ST_UB(res0, dst);
2021    dst += dst_stride;
2022
2023    inp5 = LD_UB(src);
2024    src += src_stride;
2025    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
2026                                           inp2, inp3, inp4, inp5,
2027                                           const20, const6, const3);
2028    res0 = __msa_ave_u_b(res0, inp2);
2029    ST_UB(res0, dst);
2030    dst += dst_stride;
2031
2032    inp6 = LD_UB(src);
2033    src += src_stride;
2034    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
2035                                           inp3, inp4, inp5, inp6,
2036                                           const20, const6, const3);
2037    res0 = __msa_ave_u_b(res0, inp3);
2038    ST_UB(res0, dst);
2039    dst += dst_stride;
2040
2041    inp7 = LD_UB(src);
2042    src += src_stride;
2043    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
2044                                           inp4, inp5, inp6, inp7,
2045                                           const20, const6, const3);
2046    res0 = __msa_ave_u_b(res0, inp4);
2047    ST_UB(res0, dst);
2048    dst += dst_stride;
2049
2050    inp8 = LD_UB(src);
2051    src += src_stride;
2052    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
2053                                           inp5, inp6, inp7, inp8,
2054                                           const20, const6, const3);
2055    res0 = __msa_ave_u_b(res0, inp5);
2056    ST_UB(res0, dst);
2057    dst += dst_stride;
2058
2059    inp9 = LD_UB(src);
2060    src += src_stride;
2061    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
2062                                           inp6, inp7, inp8, inp9,
2063                                           const20, const6, const3);
2064    res0 = __msa_ave_u_b(res0, inp6);
2065    ST_UB(res0, dst);
2066    dst += dst_stride;
2067
2068    inp10 = LD_UB(src);
2069    src += src_stride;
2070    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
2071                                           inp7, inp8, inp9, inp10,
2072                                           const20, const6, const3);
2073    res0 = __msa_ave_u_b(res0, inp7);
2074    ST_UB(res0, dst);
2075    dst += dst_stride;
2076
2077    inp11 = LD_UB(src);
2078    src += src_stride;
2079    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
2080                                           inp8, inp9, inp10, inp11,
2081                                           const20, const6, const3);
2082    res0 = __msa_ave_u_b(res0, inp8);
2083    ST_UB(res0, dst);
2084    dst += dst_stride;
2085
2086    inp12 = LD_UB(src);
2087    src += src_stride;
2088    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
2089                                           inp9, inp10, inp11, inp12,
2090                                           const20, const6, const3);
2091    res0 = __msa_ave_u_b(res0, inp9);
2092    ST_UB(res0, dst);
2093    dst += dst_stride;
2094
2095    inp13 = LD_UB(src);
2096    src += src_stride;
2097    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
2098                                           inp10, inp11, inp12, inp13,
2099                                           const20, const6, const3);
2100    res0 = __msa_ave_u_b(res0, inp10);
2101    ST_UB(res0, dst);
2102    dst += dst_stride;
2103
2104    inp14 = LD_UB(src);
2105    src += src_stride;
2106    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
2107                                           inp11, inp12, inp13, inp14,
2108                                           const20, const6, const3);
2109    res0 = __msa_ave_u_b(res0, inp11);
2110    ST_UB(res0, dst);
2111    dst += dst_stride;
2112
2113    inp15 = LD_UB(src);
2114    src += src_stride;
2115    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
2116                                           inp12, inp13, inp14, inp15,
2117                                           const20, const6, const3);
2118    res0 = __msa_ave_u_b(res0, inp12);
2119    ST_UB(res0, dst);
2120    dst += dst_stride;
2121
2122    inp16 = LD_UB(src);
2123    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
2124                                           inp13, inp14, inp15, inp16,
2125                                           const20, const6, const3);
2126    res0 = __msa_ave_u_b(res0, inp13);
2127    ST_UB(res0, dst);
2128    dst += dst_stride;
2129
2130    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
2131                                           inp14, inp15, inp16, inp16,
2132                                           const20, const6, const3);
2133    res0 = __msa_ave_u_b(res0, inp14);
2134    ST_UB(res0, dst);
2135    dst += dst_stride;
2136
2137    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
2138                                           inp15, inp16, inp16, inp15,
2139                                           const20, const6, const3);
2140    res0 = __msa_ave_u_b(res0, inp15);
2141    ST_UB(res0, dst);
2142    dst += dst_stride;
2143
2144    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
2145                                           inp16, inp16, inp15, inp14,
2146                                           const20, const6, const3);
2147    res0 = __msa_ave_u_b(res0, inp16);
2148    ST_UB(res0, dst);
2149}
2150
2151static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src,
2152                                                   int32_t src_stride,
2153                                                   uint8_t *dst,
2154                                                   int32_t dst_stride)
2155{
2156    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2157    v16u8 dst0, dst1, dst2, dst3;
2158    v16u8 tmp0, tmp1, res0, res1;
2159    v16u8 const20 = (v16u8) __msa_ldi_b(20);
2160    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2161    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2162
2163    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2164    src += (4 * src_stride);
2165    LD_UB2(src, src_stride, inp4, inp5);
2166    src += (2 * src_stride);
2167    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2168                                        inp1, inp2, inp3, inp4,
2169                                        inp1, inp0, inp0, inp1,
2170                                        inp2, inp3, inp4, inp5,
2171                                        const20, const6, const3);
2172
2173    LD_UB2(src, src_stride, inp6, inp7);
2174    src += (2 * src_stride);
2175    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2176                                        inp3, inp4, inp5, inp6,
2177                                        inp3, inp2, inp1, inp0,
2178                                        inp4, inp5, inp6, inp7,
2179                                        const20, const6, const3);
2180
2181    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2182    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2183    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
2184    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2185    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2186    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2187    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2188    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2189    dst += (4 * dst_stride);
2190
2191    inp8 = LD_UB(src);
2192    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2193                                        inp5, inp6, inp7, inp8,
2194                                        inp5, inp4, inp3, inp2,
2195                                        inp6, inp7, inp8, inp8,
2196                                        const20, const6, const3);
2197    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2198                                        inp7, inp8, inp8, inp7,
2199                                        inp7, inp6, inp5, inp4,
2200                                        inp8, inp8, inp7, inp6,
2201                                        const20, const6, const3);
2202
2203    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2204    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2205    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2206    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2207    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2208    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2209    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2210    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2211}
2212
2213static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
2214                                                     int32_t src_stride,
2215                                                     uint8_t *dst,
2216                                                     int32_t dst_stride)
2217{
2218    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2219    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2220    v16u8 res0, res1, dst0, dst1;
2221    v16u8 const20 = (v16u8) __msa_ldi_b(20);
2222    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2223    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2224
2225    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2226    src += (5 * src_stride);
2227    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2228                                  inp1, inp2, inp3, inp4,
2229                                  const20, const6, const3);
2230
2231    inp5 = LD_UB(src);
2232    src += src_stride;
2233    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2234                                  inp2, inp3, inp4, inp5,
2235                                  const20, const6, const3);
2236
2237    LD_UB2(dst, dst_stride, dst0, dst1);
2238    AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
2239    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2240    ST_UB2(res0, res1, dst, dst_stride);
2241    dst += (2 * dst_stride);
2242
2243    inp6 = LD_UB(src);
2244    src += src_stride;
2245    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2246                                  inp3, inp4, inp5, inp6,
2247                                  const20, const6, const3);
2248
2249    inp7 = LD_UB(src);
2250    src += src_stride;
2251    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2252                                  inp4, inp5, inp6, inp7,
2253                                  const20, const6, const3);
2254
2255    LD_UB2(dst, dst_stride, dst0, dst1);
2256    AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
2257    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2258    ST_UB2(res0, res1, dst, dst_stride);
2259    dst += (2 * dst_stride);
2260
2261    LD_UB2(src, src_stride, inp8, inp9);
2262    src += (2 * src_stride);
2263    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2264                                  inp5, inp6, inp7, inp8,
2265                                  const20, const6, const3);
2266    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2267                                  inp6, inp7, inp8, inp9,
2268                                  const20, const6, const3);
2269
2270    LD_UB2(dst, dst_stride, dst0, dst1);
2271    AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
2272    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2273    ST_UB2(res0, res1, dst, dst_stride);
2274    dst += (2 * dst_stride);
2275
2276    LD_UB2(src, src_stride, inp10, inp11);
2277    src += (2 * src_stride);
2278    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2279                                  inp7, inp8, inp9, inp10,
2280                                  const20, const6, const3);
2281    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2282                                  inp8, inp9, inp10, inp11,
2283                                  const20, const6, const3);
2284
2285    LD_UB2(dst, dst_stride, dst0, dst1);
2286    AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
2287    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2288    ST_UB2(res0, res1, dst, dst_stride);
2289    dst += (2 * dst_stride);
2290
2291    LD_UB2(src, src_stride, inp12, inp13);
2292    src += (2 * src_stride);
2293    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2294                                  inp9, inp10, inp11, inp12,
2295                                  const20, const6, const3);
2296    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2297                                  inp10, inp11, inp12, inp13,
2298                                  const20, const6, const3);
2299    LD_UB2(dst, dst_stride, dst0, dst1);
2300    AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
2301    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2302    ST_UB2(res0, res1, dst, dst_stride);
2303    dst += (2 * dst_stride);
2304
2305    LD_UB2(src, src_stride, inp14, inp15);
2306    src += (2 * src_stride);
2307    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2308                                  inp11, inp12, inp13, inp14,
2309                                  const20, const6, const3);
2310    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2311                                  inp12, inp13, inp14, inp15,
2312                                  const20, const6, const3);
2313
2314    LD_UB2(dst, dst_stride, dst0, dst1);
2315    AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
2316    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2317    ST_UB2(res0, res1, dst, dst_stride);
2318    dst += (2 * dst_stride);
2319
2320    inp16 = LD_UB(src);
2321    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2322                                  inp13, inp14, inp15, inp16,
2323                                  const20, const6, const3);
2324    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2325                                  inp14, inp15, inp16, inp16,
2326                                  const20, const6, const3);
2327    LD_UB2(dst, dst_stride, dst0, dst1);
2328    AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
2329    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2330    ST_UB2(res0, res1, dst, dst_stride);
2331    dst += (2 * dst_stride);
2332
2333    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2334                                  inp15, inp16, inp16, inp15,
2335                                  const20, const6, const3);
2336    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2337                                  inp16, inp16, inp15, inp14,
2338                                  const20, const6, const3);
2339    LD_UB2(dst, dst_stride, dst0, dst1);
2340    AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
2341    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2342    ST_UB2(res0, res1, dst, dst_stride);
2343}
2344
2345static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src,
2346                                         int32_t src_stride,
2347                                         uint8_t *dst,
2348                                         int32_t dst_stride)
2349{
2350    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2351    v16u8 dst0, dst1, dst2, dst3;
2352    v16u8 res0, res1;
2353    v16u8 const20 = (v16u8) __msa_ldi_b(20);
2354    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2355    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2356
2357    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2358    src += (4 * src_stride);
2359    LD_UB2(src, src_stride, inp4, inp5);
2360    src += (2 * src_stride);
2361    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2362                                        inp1, inp2, inp3, inp4,
2363                                        inp1, inp0, inp0, inp1,
2364                                        inp2, inp3, inp4, inp5,
2365                                        const20, const6, const3);
2366    LD_UB2(src, src_stride, inp6, inp7);
2367    src += (2 * src_stride);
2368    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2369                                        inp3, inp4, inp5, inp6,
2370                                        inp3, inp2, inp1, inp0,
2371                                        inp4, inp5, inp6, inp7,
2372                                        const20, const6, const3);
2373    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2374    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2375    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2376    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2377    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2378    dst += (4 * dst_stride);
2379
2380    inp8 = LD_UB(src);
2381    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2382                                        inp5, inp6, inp7, inp8,
2383                                        inp5, inp4, inp3, inp2,
2384                                        inp6, inp7, inp8, inp8,
2385                                        const20, const6, const3);
2386    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2387                                        inp7, inp8, inp8, inp7,
2388                                        inp7, inp6, inp5, inp4,
2389                                        inp8, inp8, inp7, inp6,
2390                                        const20, const6, const3);
2391    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2392    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2393    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2394    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2395    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2396}
2397
2398static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
2399                                           int32_t src_stride,
2400                                           uint8_t *dst,
2401                                           int32_t dst_stride)
2402{
2403    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2404    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2405    v16u8 res0, res1, dst0, dst1;
2406    v16u8 const20 = (v16u8) __msa_ldi_b(20);
2407    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2408    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2409
2410    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2411    src += (5 * src_stride);
2412    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2413                                  inp1, inp2, inp3, inp4,
2414                                  const20, const6, const3);
2415    inp5 = LD_UB(src);
2416    src += src_stride;
2417    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2418                                  inp2, inp3, inp4, inp5,
2419                                  const20, const6, const3);
2420    LD_UB2(dst, dst_stride, dst0, dst1);
2421    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2422    ST_UB2(res0, res1, dst, dst_stride);
2423    dst += (2 * dst_stride);
2424
2425    inp6 = LD_UB(src);
2426    src += src_stride;
2427    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2428                                  inp3, inp4, inp5, inp6,
2429                                  const20, const6, const3);
2430    inp7 = LD_UB(src);
2431    src += src_stride;
2432    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2433                                  inp4, inp5, inp6, inp7,
2434                                  const20, const6, const3);
2435    LD_UB2(dst, dst_stride, dst0, dst1);
2436    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2437    ST_UB2(res0, res1, dst, dst_stride);
2438    dst += (2 * dst_stride);
2439
2440    inp8 = LD_UB(src);
2441    src += src_stride;
2442    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2443                                  inp5, inp6, inp7, inp8,
2444                                  const20, const6, const3);
2445    inp9 = LD_UB(src);
2446    src += src_stride;
2447    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2448                                  inp6, inp7, inp8, inp9,
2449                                  const20, const6, const3);
2450    LD_UB2(dst, dst_stride, dst0, dst1);
2451    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2452    ST_UB2(res0, res1, dst, dst_stride);
2453    dst += (2 * dst_stride);
2454
2455    inp10 = LD_UB(src);
2456    src += src_stride;
2457    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2458                                  inp7, inp8, inp9, inp10,
2459                                  const20, const6, const3);
2460    inp11 = LD_UB(src);
2461    src += src_stride;
2462    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2463                                  inp8, inp9, inp10, inp11,
2464                                  const20, const6, const3);
2465    LD_UB2(dst, dst_stride, dst0, dst1);
2466    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2467    ST_UB2(res0, res1, dst, dst_stride);
2468    dst += (2 * dst_stride);
2469
2470    inp12 = LD_UB(src);
2471    src += src_stride;
2472    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2473                                  inp9, inp10, inp11, inp12,
2474                                  const20, const6, const3);
2475    inp13 = LD_UB(src);
2476    src += src_stride;
2477    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2478                                  inp10, inp11, inp12, inp13,
2479                                  const20, const6, const3);
2480    LD_UB2(dst, dst_stride, dst0, dst1);
2481    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2482    ST_UB2(res0, res1, dst, dst_stride);
2483    dst += (2 * dst_stride);
2484
2485    inp14 = LD_UB(src);
2486    src += src_stride;
2487    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2488                                  inp11, inp12, inp13, inp14,
2489                                  const20, const6, const3);
2490    inp15 = LD_UB(src);
2491    src += src_stride;
2492    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2493                                  inp12, inp13, inp14, inp15,
2494                                  const20, const6, const3);
2495    LD_UB2(dst, dst_stride, dst0, dst1);
2496    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2497    ST_UB2(res0, res1, dst, dst_stride);
2498    dst += (2 * dst_stride);
2499
2500    inp16 = LD_UB(src);
2501    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2502                                  inp13, inp14, inp15, inp16,
2503                                  const20, const6, const3);
2504    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2505                                  inp14, inp15, inp16, inp16,
2506                                  const20, const6, const3);
2507    LD_UB2(dst, dst_stride, dst0, dst1);
2508    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2509    ST_UB2(res0, res1, dst, dst_stride);
2510    dst += (2 * dst_stride);
2511
2512    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2513                                  inp15, inp16, inp16, inp15,
2514                                  const20, const6, const3);
2515    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2516                                  inp16, inp16, inp15, inp14,
2517                                  const20, const6, const3);
2518    LD_UB2(dst, dst_stride, dst0, dst1);
2519    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2520    ST_UB2(res0, res1, dst, dst_stride);
2521}
2522
2523static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src,
2524                                                   int32_t src_stride,
2525                                                   uint8_t *dst,
2526                                                   int32_t dst_stride)
2527{
2528    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2529    v16u8 dst0, dst1, dst2, dst3;
2530    v16u8 tmp0, tmp1, res0, res1;
2531    v16u8 const20 = (v16u8) __msa_ldi_b(20);
2532    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2533    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2534
2535    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2536    src += (4 * src_stride);
2537    LD_UB2(src, src_stride, inp4, inp5);
2538    src += (2 * src_stride);
2539    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2540                                        inp1, inp2, inp3, inp4,
2541                                        inp1, inp0, inp0, inp1,
2542                                        inp2, inp3, inp4, inp5,
2543                                        const20, const6, const3);
2544    LD_UB2(src, src_stride, inp6, inp7);
2545    src += (2 * src_stride);
2546    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2547                                        inp3, inp4, inp5, inp6,
2548                                        inp3, inp2, inp1, inp0,
2549                                        inp4, inp5, inp6, inp7,
2550                                        const20, const6, const3);
2551    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2552    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
2553    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
2554    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2555    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2556    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2557    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2558    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2559    dst += (4 * dst_stride);
2560
2561    inp8 = LD_UB(src);
2562    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2563                                        inp5, inp6, inp7, inp8,
2564                                        inp5, inp4, inp3, inp2,
2565                                        inp6, inp7, inp8, inp8,
2566                                        const20, const6, const3);
2567    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2568                                        inp7, inp8, inp8, inp7,
2569                                        inp7, inp6, inp5, inp4,
2570                                        inp8, inp8, inp7, inp6,
2571                                        const20, const6, const3);
2572    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2573    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2574    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2575    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2576    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2577    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2578    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2579    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2580}
2581
/*
 * Vertical qpel pass over a block of 16 output rows, one v16u8 vector per row.
 *
 * Seventeen source rows (inp0..inp16) are loaded. For output row k the result
 * of APPLY_VERT_QPEL_FILTER is averaged with source row k + 1 (the "src1"
 * variant) and then with the row already present in dst; the outcome is
 * written back to dst.
 *
 * At the top and bottom of the block the filter arguments repeat rows that
 * lie inside inp0..inp16 (e.g. "inp0, inp0, inp1" or "inp16, inp16, inp15")
 * instead of reading rows outside that range.
 *
 * src, src_stride: first source row and byte distance between source rows.
 * dst, dst_stride: destination block; it is read for averaging and written.
 */
static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0, res1, dst0, dst1;
    /* Constant operands handed to the filter macro. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Output rows 0 and 1 (rows above inp0 are replaced by in-block rows). */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    inp5 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    /* First average: filter result with the source row one line below. */
    AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
    /* Second average: with the pixels currently stored in dst. */
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2 and 3. */
    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    inp7 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 4 and 5. */
    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    inp9 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6 and 7. */
    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    inp11 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 8 and 9. */
    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    inp13 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 10 and 11. */
    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    inp15 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /*
     * Output rows 12 and 13. inp16 is the last source row loaded; from here
     * on the rows below it are replaced by in-block rows in the arguments.
     */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 14 and 15. */
    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
}
2714
2715static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
2716                                                   int32_t src_stride,
2717                                                   uint8_t *dst,
2718                                                   int32_t dst_stride,
2719                                                   int32_t height)
2720{
2721    uint8_t loop_count;
2722    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2723    v16u8 res;
2724    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2725    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2726    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2727    v8u16 const20 = (v8u16) __msa_ldi_h(20);
2728
2729    for (loop_count = (height >> 2); loop_count--;) {
2730        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2731        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2732        src += (4 * src_stride);
2733        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2734                                               const20, const6, const3);
2735        res = __msa_ave_u_b(inp0, res);
2736        ST_UB(res, dst);
2737        dst += dst_stride;
2738
2739        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2740                                               const20, const6, const3);
2741        res = __msa_ave_u_b(inp2, res);
2742        ST_UB(res, dst);
2743        dst += dst_stride;
2744
2745        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2746                                               const20, const6, const3);
2747        res = __msa_ave_u_b(inp4, res);
2748        ST_UB(res, dst);
2749        dst += dst_stride;
2750
2751        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2752                                               const20, const6, const3);
2753        res = __msa_ave_u_b(inp6, res);
2754        ST_UB(res, dst);
2755        dst += dst_stride;
2756    }
2757
2758    LD_UB2(src, 1, inp0, inp1);
2759    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2760                                           const20, const6, const3);
2761    res = __msa_ave_u_b(inp0, res);
2762    ST_UB(res, dst);
2763}
2764
/*
 * Two-pass 16x16 no-rounding qpel: the horizontal "src0" pass writes into a
 * stack scratch buffer, which the vertical "aver_src0" pass then reads to
 * produce dst.
 *
 * Called with height 16, the horizontal pass stores (16 >> 2) * 4 + 1 = 17
 * rows of one 16-byte vector each, hence the 17 * 16 byte scratch buffer.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    enum { QPEL_TMP_STRIDE = 16, QPEL_TMP_ROWS = 17 };
    uint8_t horiz_tmp[QPEL_TMP_ROWS * QPEL_TMP_STRIDE];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride,
                                           horiz_tmp, QPEL_TMP_STRIDE, 16);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(horiz_tmp, QPEL_TMP_STRIDE,
                                            dst, dst_stride);
}
2775
/*
 * 8x8 no-rounding qpel with horizontal and vertical passes done in registers.
 *
 * Horizontal stage: nine source rows are read (four pairs plus one single
 * row). Each pair goes through APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE and the
 * result is averaged (__msa_ave_u_b) with the two source rows packed into one
 * vector, giving horiz0..horiz8.
 *
 * Vertical stage: APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE yields two output
 * rows per call; each result is averaged with the pair of horizontal rows at
 * the same row positions and stored with ST_D2 (two 8-byte rows per store).
 *
 * At the block edges the vertical-filter arguments repeat rows from
 * horiz0..horiz8 rather than using rows outside that range.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-selection masks handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal rows 0 and 1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Pack the low 8 bytes of both source rows into one vector. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    /* horiz1: upper doubleword of horiz0 replicated (the second row). */
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Horizontal rows 2 and 3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Horizontal rows 4 and 5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical output rows 0 and 1. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Average with horizontal rows 0/1 (the vertical "src0" average). */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Horizontal rows 6 and 7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Horizontal row 8: a single row, filtered by the 1ROW macro. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Vertical output rows 2 and 3. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = __msa_ave_u_b(avg1, res1);
    /* Vertical output rows 4 and 5 are filtered before rows 2/3 are stored. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = __msa_ave_u_b(avg0, res0);
    /* Vertical output rows 6 and 7 are filtered before rows 4/5 are stored. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
2869
2870static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
2871                                              int32_t src_stride,
2872                                              uint8_t *dst,
2873                                              int32_t dst_stride,
2874                                              int32_t height)
2875{
2876    uint8_t loop_count;
2877    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2878    v16u8 res;
2879    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2880    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2881    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2882    v8u16 const20 = (v8u16) __msa_ldi_h(20);
2883
2884    for (loop_count = (height >> 2); loop_count--;) {
2885        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2886        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2887        src += (4 * src_stride);
2888        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2889                                               const20, const6, const3);
2890        ST_UB(res, dst);
2891        dst += dst_stride;
2892
2893        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2894                                               const20, const6, const3);
2895        ST_UB(res, dst);
2896        dst += dst_stride;
2897
2898        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2899                                               const20, const6, const3);
2900        ST_UB(res, dst);
2901        dst += dst_stride;
2902
2903        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2904                                               const20, const6, const3);
2905        ST_UB(res, dst);
2906        dst += dst_stride;
2907    }
2908
2909    LD_UB2(src, 1, inp0, inp1);
2910    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2911                                           const20, const6, const3);
2912    ST_UB(res, dst);
2913}
2914
/*
 * Two-pass 16x16 no-rounding qpel: the plain horizontal pass (no source
 * averaging) writes into a stack scratch buffer, which the vertical
 * "aver_src0" pass then reads to produce dst.
 *
 * Called with height 16, the horizontal pass stores (16 >> 2) * 4 + 1 = 17
 * rows of one 16-byte vector each, hence the 17 * 16 byte scratch buffer.
 */
static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    enum { QPEL_TMP_STRIDE = 16, QPEL_TMP_ROWS = 17 };
    uint8_t horiz_tmp[QPEL_TMP_ROWS * QPEL_TMP_STRIDE];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride,
                                      horiz_tmp, QPEL_TMP_STRIDE, 16);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(horiz_tmp, QPEL_TMP_STRIDE,
                                            dst, dst_stride);
}
2925
2926static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src,
2927                                                  int32_t src_stride,
2928                                                  uint8_t *dst,
2929                                                  int32_t dst_stride)
2930{
2931    v16u8 inp0, inp1, inp2, inp3;
2932    v16u8 res0, res1, avg0, avg1;
2933    v16u8 horiz0, horiz1, horiz2, horiz3;
2934    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2935    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2936    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2937    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2938    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2939    v16u8 const20 = (v16u8) __msa_ldi_b(20);
2940    v16u8 const6 = (v16u8) __msa_ldi_b(6);
2941    v16u8 const3 = (v16u8) __msa_ldi_b(3);
2942
2943    LD_UB2(src, src_stride, inp0, inp1);
2944    src += (2 * src_stride);
2945    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2946                                                    mask2, mask3, const20,
2947                                                    const6, const3);
2948    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2949
2950    LD_UB2(src, src_stride, inp2, inp3);
2951    src += (2 * src_stride);
2952    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2953                                                    mask2, mask3, const20,
2954                                                    const6, const3);
2955    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2956    LD_UB2(src, src_stride, inp0, inp1);
2957    src += (2 * src_stride);
2958    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2959                                                    mask2, mask3, const20,
2960                                                    const6, const3);
2961    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2962    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2963                                                 horiz1, horiz2, horiz3, horiz4,
2964                                                 horiz1, horiz0, horiz0, horiz1,
2965                                                 horiz2, horiz3, horiz4, horiz5,
2966                                                 const20, const6, const3);
2967    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2968    res0 = __msa_ave_u_b(avg0, res0);
2969    ST_D2(res0, 0, 1, dst, dst_stride);
2970    dst += (2 * dst_stride);
2971
2972    LD_UB2(src, src_stride, inp2, inp3);
2973    src += (2 * src_stride);
2974    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2975                                                    mask2, mask3, const20,
2976                                                    const6, const3);
2977    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2978    inp0 = LD_UB(src);
2979    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2980                                                         mask2, mask3, const20,
2981                                                         const6, const3);
2982    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2983                                                 horiz3, horiz4, horiz5, horiz6,
2984                                                 horiz3, horiz2, horiz1, horiz0,
2985                                                 horiz4, horiz5, horiz6, horiz7,
2986                                                 const20, const6, const3);
2987    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2988    res1 = __msa_ave_u_b(avg1, res1);
2989    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2990    res0 = __msa_ave_u_b(avg0, res0);
2991    ST_D2(res1, 0, 1, dst, dst_stride);
2992    dst += (2 * dst_stride);
2993
2994    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
2995                                                 horiz5, horiz6, horiz7, horiz8,
2996                                                 horiz5, horiz4, horiz3, horiz2,
2997                                                 horiz6, horiz7, horiz8, horiz8,
2998                                                 const20, const6, const3);
2999    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3000    res0 = __msa_ave_u_b(avg0, res0);
3001    ST_D2(res0, 0, 1, dst, dst_stride);
3002    dst += (2 * dst_stride);
3003
3004    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3005                                                 horiz7, horiz8, horiz8, horiz7,
3006                                                 horiz7, horiz6, horiz5, horiz4,
3007                                                 horiz8, horiz8, horiz7, horiz6,
3008                                                 const20, const6, const3);
3009    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3010    res1 = __msa_ave_u_b(avg1, res1);
3011    ST_D2(res1, 0, 1, dst, dst_stride);
3012}
3013
3014static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
3015                                                   int32_t src_stride,
3016                                                   uint8_t *dst,
3017                                                   int32_t dst_stride,
3018                                                   int32_t height)
3019{
3020    uint8_t loop_count;
3021    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3022    v16u8 res;
3023    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3024    v16u8 const6 = (v16u8) __msa_ldi_b(6);
3025    v16u8 const3 = (v16u8) __msa_ldi_b(3);
3026    v8u16 const20 = (v8u16) __msa_ldi_h(20);
3027
3028    for (loop_count = (height >> 2); loop_count--;) {
3029        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3030        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3031        src += (4 * src_stride);
3032        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3033                                               const20, const6, const3);
3034        res = __msa_ave_u_b(res, inp1);
3035        ST_UB(res, dst);
3036        dst += dst_stride;
3037
3038        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
3039                                               const20, const6, const3);
3040        res = __msa_ave_u_b(res, inp3);
3041        ST_UB(res, dst);
3042        dst += dst_stride;
3043
3044        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
3045                                               const20, const6, const3);
3046        res = __msa_ave_u_b(res, inp5);
3047        ST_UB(res, dst);
3048        dst += dst_stride;
3049
3050        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
3051                                               const20, const6, const3);
3052        res = __msa_ave_u_b(res, inp7);
3053        ST_UB(res, dst);
3054        dst += dst_stride;
3055    }
3056
3057    LD_UB2(src, 1, inp0, inp1);
3058    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3059                                           const20, const6, const3);
3060    res = __msa_ave_u_b(inp1, res);
3061    ST_UB(res, dst);
3062}
3063
/*
 * Two-pass 16x16 no-rounding qpel: the horizontal "src1" pass writes into a
 * stack scratch buffer, which the vertical "aver_src0" pass then reads to
 * produce dst.
 *
 * Called with height 16, the horizontal pass stores (16 >> 2) * 4 + 1 = 17
 * rows of one 16-byte vector each, hence the 17 * 16 byte scratch buffer.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    enum { QPEL_TMP_STRIDE = 16, QPEL_TMP_ROWS = 17 };
    uint8_t horiz_tmp[QPEL_TMP_ROWS * QPEL_TMP_STRIDE];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride,
                                           horiz_tmp, QPEL_TMP_STRIDE, 16);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(horiz_tmp, QPEL_TMP_STRIDE,
                                            dst, dst_stride);
}
3074
/*
 * 8x8 no-rounding qpel with horizontal and vertical passes done in registers;
 * horizontal "src1" / vertical "src0" averaging variant.
 *
 * Horizontal stage: nine source rows are read (four pairs plus one single
 * row). Each pair goes through APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE; the
 * source rows are then slid by one byte, packed two per vector, and averaged
 * (__msa_ave_u_b) with the filter output, giving horiz0..horiz8.
 *
 * Vertical stage: APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE yields two output
 * rows per call; each result is averaged with the pair of horizontal rows at
 * the same row positions and stored with ST_D2 (two 8-byte rows per store).
 *
 * At the block edges the vertical-filter arguments repeat rows from
 * horiz0..horiz8 rather than using rows outside that range.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-selection masks handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal rows 0 and 1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /*
     * Slide both rows by one byte so the average below uses the pixel one
     * position to the right (NOTE(review): relies on SLDI_B2_UB wrapping
     * __msa_sldi_b with this operand order - confirm in generic_macros_msa.h).
     */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Pack the low 8 bytes of both slid rows into one vector. */
    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_ave_u_b(inp0, res0);
    /* horiz1: upper doubleword of horiz0 replicated (the second row). */
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Horizontal rows 2 and 3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Horizontal rows 4 and 5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical output rows 0 and 1. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Average with horizontal rows 0/1 (the vertical "src0" average). */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Horizontal rows 6 and 7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Horizontal row 8: a single row, filtered by the 1ROW macro. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    /* Same one-byte slide as above, applied to the single row. */
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Vertical output rows 2 and 3. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Vertical output rows 4 and 5. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Vertical output rows 6 and 7. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3177
/*
 * Two-pass 16x16 no-rounding qpel: the horizontal "src0" pass writes into a
 * stack scratch buffer, which the plain vertical pass
 * (vert_mc_qpel_no_rnd_16x16_msa) then reads to produce dst.
 *
 * Called with height 16, the horizontal pass stores (16 >> 2) * 4 + 1 = 17
 * rows of one 16-byte vector each, hence the 17 * 16 byte scratch buffer.
 */
static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    enum { QPEL_TMP_STRIDE = 16, QPEL_TMP_ROWS = 17 };
    uint8_t horiz_tmp[QPEL_TMP_ROWS * QPEL_TMP_STRIDE];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride,
                                           horiz_tmp, QPEL_TMP_STRIDE, 16);
    vert_mc_qpel_no_rnd_16x16_msa(horiz_tmp, QPEL_TMP_STRIDE,
                                  dst, dst_stride);
}
3188
/*
 * 8x8 no-round H+V quarter-pel filter, "aver_h_src0" flavour.
 *
 * Horizontal stage: every loaded row goes through the no-round horizontal
 * filter macro and the result is averaged with the unshifted source bytes
 * using __msa_ave_u_b (the non-rounding MSA byte average).
 * Vertical stage: the no-round vertical filter macro is applied to those
 * intermediate rows and its output is stored without further averaging.
 *
 * src, src_stride: source; four LD_UB2 pairs plus one LD_UB are read, with
 *                  src advanced by two strides after each pair.
 * dst, dst_stride: destination; written with ST_D2, ST_D4 and ST_D2
 *                  (2 + 4 + 2 doublewords, presumably one 8-byte row each).
 */
static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* horizN: intermediate row N produced by the horizontal stage. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle patterns handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Coefficients passed to both the horizontal and vertical macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* First row pair -> horiz0 / horiz1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Pack the low doublewords of inp0 and inp1 into one vector; presumably
     * this matches the two-row layout of the filter result. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    /* Replicate the upper doubleword so the odd row has its own vector. */
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Second row pair -> horiz2 / horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Third row pair -> horiz4 / horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for the first stored pair. The repeated horiz0/horiz1
     * arguments look like top-edge mirroring - confirm against the macro. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);

    /* Fetch the fourth row pair, then store the first vertical result. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Fourth row pair -> horiz6 / horiz7. */
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Ninth and last load: a single row, filtered on its own -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Vertical filter for the second and third stored pairs. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    /* Both doublewords of res1, then both of res0, in one store call. */
    ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Last pair; the repeated horiz8/horiz7 arguments look like bottom-edge
     * mirroring - confirm against the macro. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3272
/*
 * 16x16 no-round H+V quarter-pel with no source averaging: the plain
 * horizontal helper fills a stack scratch buffer, then the plain no-round
 * vertical helper filters that buffer into dst.
 */
static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
                                        int32_t src_stride,
                                        uint8_t *dst,
                                        int32_t dst_stride)
{
    /* Scratch rows are 16 bytes apart. */
    const int32_t scratch_stride = 16;
    /* 272 bytes = 17 lines of 16; the 17th line is presumably the extra row
     * consumed by the vertical pass - confirm against the helpers. */
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride,
                                      scratch, scratch_stride, 16);
    vert_mc_qpel_no_rnd_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
3283
/*
 * 8x8 no-round H+V quarter-pel filter with no source averaging.
 *
 * Horizontal stage: every loaded row goes through the no-round horizontal
 * filter macro; the macro result is used directly as the intermediate row.
 * Vertical stage: the no-round vertical filter macro is applied to those
 * intermediate rows and its output is stored as is.
 *
 * src, src_stride: source; four LD_UB2 pairs plus one LD_UB are read, with
 *                  src advanced by two strides after each pair.
 * dst, dst_stride: destination; written with ST_D2, ST_D2 and ST_D4
 *                  (2 + 2 + 4 doublewords, presumably one 8-byte row each).
 */
static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* horizN: intermediate row N produced by the horizontal stage. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle patterns handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Coefficients passed to both the horizontal and vertical macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* First row pair -> horiz0; horiz1 is its upper doubleword replicated. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Second row pair -> horiz2 / horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Third row pair -> horiz4 / horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for the first stored pair. The repeated horiz0/horiz1
     * arguments look like top-edge mirroring - confirm against the macro. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Fetch the fourth row pair, then store the first vertical result. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Fourth row pair -> horiz6 / horiz7. */
    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Ninth and last load: a single row, filtered on its own -> horiz8. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                         mask2, mask3, const20,
                                                         const6, const3);
    /* Vertical filter for the second and third stored pairs. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;


    /* Last pair; the repeated horiz8/horiz7 arguments look like bottom-edge
     * mirroring - confirm against the macro. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    /* Both doublewords of res0, then both of res1, in one store call. */
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3358
/*
 * 16x16 no-round H+V quarter-pel, "aver_h_src1" flavour: the src1 horizontal
 * helper fills a stack scratch buffer, then the plain no-round vertical
 * helper filters that buffer into dst.
 */
static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    /* Scratch rows are 16 bytes apart. */
    const int32_t scratch_stride = 16;
    /* 272 bytes = 17 lines of 16; the 17th line is presumably the extra row
     * consumed by the vertical pass - confirm against the helpers. */
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride,
                                           scratch, scratch_stride, 16);
    vert_mc_qpel_no_rnd_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
3369
/*
 * 8x8 no-round H+V quarter-pel filter, "aver_h_src1" flavour.
 *
 * Horizontal stage: every loaded row goes through the no-round horizontal
 * filter macro and the result is averaged, using __msa_ave_u_b (the
 * non-rounding MSA byte average), with the source row slid by one byte.
 * Vertical stage: the no-round vertical filter macro is applied to those
 * intermediate rows and its output is stored without further averaging.
 *
 * src, src_stride: source; four LD_UB2 pairs plus one LD_UB are read, with
 *                  src advanced by two strides after each pair.
 * dst, dst_stride: destination; written with ST_D2, ST_D2 and ST_D4
 *                  (2 + 2 + 4 doublewords, presumably one 8-byte row each).
 */
static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* horizN: intermediate row N produced by the horizontal stage. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle patterns handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Coefficients passed to both the horizontal and vertical macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* First row pair -> horiz0 / horiz1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Slide both rows by one byte (presumably so lane i holds source byte
     * i + 1 - confirm against the SLDI_B2_UB definition). */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Put doubleword 0 of inp1 into doubleword 1 of inp0: both slid rows in
     * one vector. */
    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_ave_u_b(inp0, res0);
    /* Replicate the upper doubleword so the odd row has its own vector. */
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Second row pair -> horiz2 / horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Third row pair -> horiz4 / horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for the first stored pair. The repeated horiz0/horiz1
     * arguments look like top-edge mirroring - confirm against the macro. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Fetch the fourth row pair, then store the first vertical result. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Fourth row pair -> horiz6 / horiz7. */
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Ninth and last load: a single row, filtered on its own -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    /* Same one-byte slide as above, done with the intrinsic directly. */
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Vertical filter for the second and third stored pairs. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Last pair; the repeated horiz8/horiz7 arguments look like bottom-edge
     * mirroring - confirm against the macro. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    /* Both doublewords of res0, then both of res1, in one store call. */
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3461
/*
 * 16x16 no-round H+V quarter-pel, "aver_hv_src01" flavour: the src0
 * horizontal helper fills a stack scratch buffer, then the src1-averaging
 * no-round vertical helper filters that buffer into dst.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    /* Scratch rows are 16 bytes apart. */
    const int32_t scratch_stride = 16;
    /* 272 bytes = 17 lines of 16; the 17th line is presumably the extra row
     * consumed by the vertical pass - confirm against the helpers. */
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride,
                                           scratch, scratch_stride, 16);
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(scratch, scratch_stride,
                                            dst, dst_stride);
}
3472
/*
 * 8x8 no-round H+V quarter-pel filter, "aver_hv_src01" flavour.
 *
 * Horizontal stage: every loaded row goes through the no-round horizontal
 * filter macro and the result is averaged with the unshifted source bytes
 * using __msa_ave_u_b (the non-rounding MSA byte average).
 * Vertical stage: the no-round vertical filter macro is applied to those
 * intermediate rows, and each result pair is then averaged with the
 * intermediate rows one position further down (horiz1/2, 3/4, 5/6, 7/8).
 *
 * src, src_stride: source; four LD_UB2 pairs plus one LD_UB are read, with
 *                  src advanced by two strides after each pair.
 * dst, dst_stride: destination; written with four ST_D2 calls
 *                  (8 doublewords, presumably one 8-byte row each).
 */
static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* horizN: intermediate row N produced by the horizontal stage. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle patterns handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Coefficients passed to both the horizontal and vertical macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* First row pair -> horiz0 / horiz1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Pack the low doublewords of inp0 and inp1 into one vector; presumably
     * this matches the two-row layout of the filter result. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    /* Replicate the upper doubleword so the odd row has its own vector. */
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Second row pair -> horiz2 / horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Third row pair -> horiz4 / horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for the first stored pair. The repeated horiz0/horiz1
     * arguments look like top-edge mirroring - confirm against the macro. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Average with intermediate rows 1 and 2 (low doublewords packed). */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Fourth row pair -> horiz6 / horiz7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Ninth and last load: a single row, filtered on its own -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Second stored pair, averaged with intermediate rows 3 and 4. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_ave_u_b(avg1, res1);
    /* Third pair is filtered here; its averaging follows the store below. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Third stored pair, averaged with intermediate rows 5 and 6. */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_ave_u_b(avg0, res0);

    /* Last pair; the repeated horiz8/horiz7 arguments look like bottom-edge
     * mirroring - confirm against the macro. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Fourth stored pair, averaged with intermediate rows 7 and 8. */
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3567
/*
 * 16x16 no-round H+V quarter-pel, "aver_v_src1" flavour: the plain
 * horizontal helper fills a stack scratch buffer, then the src1-averaging
 * no-round vertical helper filters that buffer into dst.
 */
static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    /* Scratch rows are 16 bytes apart. */
    const int32_t scratch_stride = 16;
    /* 272 bytes = 17 lines of 16; the 17th line is presumably the extra row
     * consumed by the vertical pass - confirm against the helpers. */
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride,
                                      scratch, scratch_stride, 16);
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(scratch, scratch_stride,
                                            dst, dst_stride);
}
3578
/*
 * 8x8 no-round H+V quarter-pel filter, "aver_v_src1" flavour.
 *
 * Horizontal stage: every loaded row goes through the no-round horizontal
 * filter macro; the macro result is used directly as the intermediate row.
 * Vertical stage: the no-round vertical filter macro is applied to those
 * intermediate rows, and each result pair is then averaged, using
 * __msa_ave_u_b (the non-rounding MSA byte average), with the intermediate
 * rows one position further down (horiz1/2, 3/4, 5/6, 7/8).
 *
 * src, src_stride: source; four LD_UB2 pairs plus one LD_UB are read, with
 *                  src advanced by two strides after each pair.
 * dst, dst_stride: destination; written with ST_D2, ST_D2 and ST_D4
 *                  (2 + 2 + 4 doublewords, presumably one 8-byte row each).
 */
static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* horizN: intermediate row N produced by the horizontal stage. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle patterns handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Coefficients passed to both the horizontal and vertical macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* First row pair -> horiz0; horiz1 is its upper doubleword replicated. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Second row pair -> horiz2 / horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Third row pair -> horiz4 / horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for the first stored pair. The repeated horiz0/horiz1
     * arguments look like top-edge mirroring - confirm against the macro. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Average with intermediate rows 1 and 2 (low doublewords packed). */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_ave_u_b(avg0, res0);
    /* Fetch the fourth row pair, then store the first result. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Fourth row pair -> horiz6 / horiz7. */
    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Second stored pair, averaged with intermediate rows 3 and 4. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_ave_u_b(avg1, res1);
    /* Ninth and last load: a single row, filtered on its own -> horiz8. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                         mask2, mask3, const20,
                                                         const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Third stored pair, averaged with intermediate rows 5 and 6. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_ave_u_b(avg0, res0);
    /* Last pair, averaged with intermediate rows 7 and 8; the repeated
     * horiz8/horiz7 arguments look like bottom-edge mirroring - confirm
     * against the macro. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_ave_u_b(avg1, res1);
    /* Both doublewords of res0, then both of res1, in one store call. */
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3660
/*
 * 16x16 no-round H+V quarter-pel, "aver_hv_src11" flavour: the src1
 * horizontal helper fills a stack scratch buffer, then the src1-averaging
 * no-round vertical helper filters that buffer into dst.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    /* Scratch rows are 16 bytes apart. */
    const int32_t scratch_stride = 16;
    /* 272 bytes = 17 lines of 16; the 17th line is presumably the extra row
     * consumed by the vertical pass - confirm against the helpers. */
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride,
                                           scratch, scratch_stride, 16);
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(scratch, scratch_stride,
                                            dst, dst_stride);
}
3671
/*
 * 8x8 no-round H+V quarter-pel filter, "aver_hv_src11" flavour.
 *
 * Horizontal stage: every loaded row goes through the no-round horizontal
 * filter macro and the result is averaged, using __msa_ave_u_b (the
 * non-rounding MSA byte average), with the source row slid by one byte.
 * Vertical stage: the no-round vertical filter macro is applied to those
 * intermediate rows, and each result pair is then averaged with the
 * intermediate rows one position further down (horiz1/2, 3/4, 5/6, 7/8).
 *
 * src, src_stride: source; four LD_UB2 pairs plus one LD_UB are read, with
 *                  src advanced by two strides after each pair.
 * dst, dst_stride: destination; written with ST_D2, ST_D2 and ST_D4
 *                  (2 + 2 + 4 doublewords, presumably one 8-byte row each).
 */
static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* horizN: intermediate row N produced by the horizontal stage. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle patterns handed to the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Coefficients passed to both the horizontal and vertical macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* First row pair -> horiz0 / horiz1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Slide both rows by one byte (presumably so lane i holds source byte
     * i + 1 - confirm against the SLDI_B2_UB definition). */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Put doubleword 0 of inp1 into doubleword 1 of inp0: both slid rows in
     * one vector. */
    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_ave_u_b(inp0, res0);
    /* Replicate the upper doubleword so the odd row has its own vector. */
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Second row pair -> horiz2 / horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Third row pair -> horiz4 / horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);

    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for the first stored pair. The repeated horiz0/horiz1
     * arguments look like top-edge mirroring - confirm against the macro. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Average with intermediate rows 1 and 2 (low doublewords packed). */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Fourth row pair -> horiz6 / horiz7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Second stored pair, averaged with intermediate rows 3 and 4. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Ninth and last load: a single row, filtered on its own -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    /* Same one-byte slide as above, done with the intrinsic directly. */
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Vertical filter for the third and fourth stored pairs; the repeated
     * horiz8/horiz7 arguments look like bottom-edge mirroring - confirm
     * against the macro. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    /* Average with intermediate rows 5/6 and 7/8 respectively. */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_ave_u_b(avg0, res0);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_ave_u_b(avg1, res1);
    /* Both doublewords of res0, then both of res1, in one store call. */
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3771
3772static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
3773                                                 int32_t src_stride,
3774                                                 uint8_t *dst,
3775                                                 int32_t dst_stride,
3776                                                 int32_t height)
3777{
3778    uint8_t loop_count;
3779    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3780    v16u8 res;
3781    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3782    v16u8 const6 = (v16u8) __msa_ldi_b(6);
3783    v16u8 const3 = (v16u8) __msa_ldi_b(3);
3784    v8u16 const20 = (v8u16) __msa_ldi_h(20);
3785
3786    for (loop_count = (height >> 2); loop_count--;) {
3787        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3788        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3789        src += (4 * src_stride);
3790        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3791                                      const20, const6, const3);
3792        res = __msa_aver_u_b(inp0, res);
3793        ST_UB(res, dst);
3794        dst += dst_stride;
3795
3796        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3797                                      const20, const6, const3);
3798        res = __msa_aver_u_b(inp2, res);
3799        ST_UB(res, dst);
3800        dst += dst_stride;
3801
3802        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3803                                      const20, const6, const3);
3804        res = __msa_aver_u_b(inp4, res);
3805        ST_UB(res, dst);
3806        dst += dst_stride;
3807
3808        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3809                                      const20, const6, const3);
3810        res = __msa_aver_u_b(inp6, res);
3811        ST_UB(res, dst);
3812        dst += dst_stride;
3813    }
3814
3815    LD_UB2(src, 1, inp0, inp1);
3816    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3817    res = __msa_aver_u_b(inp0, res);
3818    ST_UB(res, dst);
3819}
3820
/*
 * 16x16 hv qpel: horizontal pass averaged with the unshifted source, followed
 * by vert_mc_qpel_aver_src0_16x16_msa() on the intermediate rows.
 */
static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* 17 rows of 16 bytes: the horizontal pass writes one row more than the
     * height it is given. */
    uint8_t hpass[17 * 16];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, hpass, 16, 16);
    vert_mc_qpel_aver_src0_16x16_msa(hpass, 16, dst, dst_stride);
}
3831
3832static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src,
3833                                             int32_t src_stride,
3834                                             uint8_t *dst,
3835                                             int32_t dst_stride)
3836{
3837    v16u8 inp0, inp1, inp2, inp3;
3838    v16u8 res0, res1, avg0, avg1;
3839    v16u8 horiz0, horiz1, horiz2, horiz3;
3840    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3841    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3842    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3843    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3844    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3845    v16u8 const20 = (v16u8) __msa_ldi_b(20);
3846    v16u8 const6 = (v16u8) __msa_ldi_b(6);
3847    v16u8 const3 = (v16u8) __msa_ldi_b(3);
3848
3849    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
3850    src += (4 * src_stride);
3851    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3852                                         const20, const6, const3);
3853    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3854                                         const20, const6, const3);
3855    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3856    horiz0 = __msa_aver_u_b(inp0, res0);
3857    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3858    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3859    horiz2 = __msa_aver_u_b(inp2, res1);
3860    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3861    LD_UB2(src, src_stride, inp0, inp1);
3862    src += (2 * src_stride);
3863    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3864                                         const20, const6, const3);
3865    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3866    horiz4 = __msa_aver_u_b(inp0, res0);
3867    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3868    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3869                                        horiz1, horiz2, horiz3, horiz4,
3870                                        horiz1, horiz0, horiz0, horiz1,
3871                                        horiz2, horiz3, horiz4, horiz5,
3872                                        const20, const6, const3);
3873    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3874    res0 = __msa_aver_u_b(avg0, res0);
3875    ST_D2(res0, 0, 1, dst, dst_stride);
3876    dst += (2 * dst_stride);
3877
3878    LD_UB2(src, src_stride, inp2, inp3);
3879    src += (2 * src_stride);
3880    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3881                                         const20, const6, const3);
3882    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3883    horiz6 = __msa_aver_u_b(inp2, res1);
3884    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3885    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3886                                        horiz3, horiz4, horiz5, horiz6,
3887                                        horiz3, horiz2, horiz1, horiz0,
3888                                        horiz4, horiz5, horiz6, horiz7,
3889                                        const20, const6, const3);
3890    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3891    res1 = __msa_aver_u_b(avg1, res1);
3892
3893    inp0 = LD_UB(src);
3894    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
3895                                              const20, const6, const3);
3896    horiz8 = __msa_aver_u_b(inp0, res0);
3897    ST_D2(res1, 0, 1, dst, dst_stride);
3898    dst += 2 * dst_stride;
3899
3900    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3901                                        horiz5, horiz6, horiz7, horiz8,
3902                                        horiz5, horiz4, horiz3, horiz2,
3903                                        horiz6, horiz7, horiz8, horiz8,
3904                                        const20, const6, const3);
3905    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3906    res0 = __msa_aver_u_b(avg0, res0);
3907    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3908                                        horiz7, horiz8, horiz8, horiz7,
3909                                        horiz7, horiz6, horiz5, horiz4,
3910                                        horiz8, horiz8, horiz7, horiz6,
3911                                        const20, const6, const3);
3912    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3913    res1 = __msa_aver_u_b(avg1, res1);
3914    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3915}
3916
3917static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
3918                                            int32_t src_stride,
3919                                            uint8_t *dst,
3920                                            int32_t dst_stride,
3921                                            int32_t height)
3922{
3923    uint8_t loop_count;
3924    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3925    v16u8 res;
3926    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3927    v16u8 const6 = (v16u8) __msa_ldi_b(6);
3928    v16u8 const3 = (v16u8) __msa_ldi_b(3);
3929    v8u16 const20 = (v8u16) __msa_ldi_h(20);
3930
3931    for (loop_count = (height >> 2); loop_count--;) {
3932        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3933        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3934        src += (4 * src_stride);
3935        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3936                                      const20, const6, const3);
3937        ST_UB(res, dst);
3938        dst += dst_stride;
3939
3940        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3941                                      const20, const6, const3);
3942        ST_UB(res, dst);
3943        dst += dst_stride;
3944
3945        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3946                                      const20, const6, const3);
3947        ST_UB(res, dst);
3948        dst += dst_stride;
3949
3950        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3951                                      const20, const6, const3);
3952        ST_UB(res, dst);
3953        dst += dst_stride;
3954    }
3955
3956    LD_UB2(src, 1, inp0, inp1);
3957    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3958    ST_UB(res, dst);
3959}
3960
/*
 * 16x16 hv qpel: plain horizontal pass, followed by
 * vert_mc_qpel_aver_src0_16x16_msa() on the intermediate rows.
 */
static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    /* 17 rows of 16 bytes: the horizontal pass writes one row more than the
     * height it is given. */
    uint8_t hpass[17 * 16];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, hpass, 16, 16);
    vert_mc_qpel_aver_src0_16x16_msa(hpass, 16, dst, dst_stride);
}
3971
3972static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src,
3973                                           int32_t src_stride,
3974                                           uint8_t *dst,
3975                                           int32_t dst_stride)
3976{
3977    v16u8 inp0, inp1, inp2, inp3;
3978    v16u8 res0, res1, avg0, avg1;
3979    v16u8 horiz0, horiz1, horiz2, horiz3;
3980    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3981    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3982    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3983    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3984    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3985    v16u8 const20 = (v16u8) __msa_ldi_b(20);
3986    v16u8 const6 = (v16u8) __msa_ldi_b(6);
3987    v16u8 const3 = (v16u8) __msa_ldi_b(3);
3988
3989    LD_UB2(src, src_stride, inp0, inp1);
3990    src += (2 * src_stride);
3991    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
3992                                           mask0, mask1, mask2, mask3,
3993                                           const20, const6, const3);
3994    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3995    LD_UB2(src, src_stride, inp2, inp3);
3996    src += (2 * src_stride);
3997    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
3998                                           mask0, mask1, mask2, mask3,
3999                                           const20, const6, const3);
4000    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4001    LD_UB2(src, src_stride, inp0, inp1);
4002    src += (2 * src_stride);
4003    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4004                                           mask0, mask1, mask2, mask3,
4005                                           const20, const6, const3);
4006    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4007    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4008                                        horiz1, horiz2, horiz3, horiz4,
4009                                        horiz1, horiz0, horiz0, horiz1,
4010                                        horiz2, horiz3, horiz4, horiz5,
4011                                        const20, const6, const3);
4012    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4013    res0 = __msa_aver_u_b(avg0, res0);
4014    ST_D2(res0, 0, 1, dst, dst_stride);
4015    dst += (2 * dst_stride);
4016
4017    LD_UB2(src, src_stride, inp2, inp3);
4018    src += (2 * src_stride);
4019    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4020                                           mask0, mask1, mask2, mask3,
4021                                           const20, const6, const3);
4022    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4023    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4024                                        horiz3, horiz4, horiz5, horiz6,
4025                                        horiz3, horiz2, horiz1, horiz0,
4026                                        horiz4, horiz5, horiz6, horiz7,
4027                                        const20, const6, const3);
4028    inp0 = LD_UB(src);
4029    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4030                                                mask0, mask1, mask2, mask3,
4031                                                const20, const6, const3);
4032    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4033    res1 = __msa_aver_u_b(avg1, res1);
4034    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4035                                        horiz5, horiz6, horiz7, horiz8,
4036                                        horiz5, horiz4, horiz3, horiz2,
4037                                        horiz6, horiz7, horiz8, horiz8,
4038                                        const20, const6, const3);
4039    ST_D2(res1, 0, 1, dst, dst_stride);
4040    dst += 2 * dst_stride;
4041
4042    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4043    res0 = __msa_aver_u_b(avg0, res0);
4044    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4045                                        horiz7, horiz8, horiz8, horiz7,
4046                                        horiz7, horiz6, horiz5, horiz4,
4047                                        horiz8, horiz8, horiz7, horiz6,
4048                                        const20, const6, const3);
4049    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4050    res1 = __msa_aver_u_b(avg1, res1);
4051    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4052}
4053
4054static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
4055                                                 int32_t src_stride,
4056                                                 uint8_t *dst,
4057                                                 int32_t dst_stride,
4058                                                 int32_t height)
4059{
4060    uint8_t loop_count;
4061    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
4062    v16u8 res;
4063    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
4064    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4065    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4066    v8u16 const20 = (v8u16) __msa_ldi_h(20);
4067
4068    for (loop_count = (height >> 2); loop_count--;) {
4069        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
4070        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
4071        src += (4 * src_stride);
4072        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
4073                                      const20, const6, const3);
4074        res = __msa_aver_u_b(res, inp1);
4075        ST_UB(res, dst);
4076        dst += dst_stride;
4077
4078        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
4079                                      const20, const6, const3);
4080        res = __msa_aver_u_b(res, inp3);
4081        ST_UB(res, dst);
4082        dst += dst_stride;
4083
4084        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
4085                                      const20, const6, const3);
4086        res = __msa_aver_u_b(res, inp5);
4087        ST_UB(res, dst);
4088        dst += dst_stride;
4089
4090        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
4091                                      const20, const6, const3);
4092        res = __msa_aver_u_b(res, inp7);
4093        ST_UB(res, dst);
4094        dst += dst_stride;
4095    }
4096
4097    LD_UB2(src, 1, inp0, inp1);
4098    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
4099    res = __msa_aver_u_b(inp1, res);
4100    ST_UB(res, dst);
4101}
4102
/*
 * 16x16 hv qpel: horizontal pass averaged with the source at src + 1,
 * followed by vert_mc_qpel_aver_src0_16x16_msa() on the intermediate rows.
 */
static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* 17 rows of 16 bytes: the horizontal pass writes one row more than the
     * height it is given. */
    uint8_t hpass[17 * 16];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, hpass, 16, 16);
    vert_mc_qpel_aver_src0_16x16_msa(hpass, 16, dst, dst_stride);
}
4113
4114static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src,
4115                                             int32_t src_stride,
4116                                             uint8_t *dst,
4117                                             int32_t dst_stride)
4118{
4119    v16u8 inp0, inp1, inp2, inp3;
4120    v16u8 res0, res1, avg0, avg1;
4121    v16u8 horiz0, horiz1, horiz2, horiz3;
4122    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4123    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4124    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4125    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4126    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4127    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4128    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4129    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4130
4131    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4132    src += (4 * src_stride);
4133    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4134                                         const20, const6, const3);
4135    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4136                                         const20, const6, const3);
4137    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4138
4139    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4140    horiz0 = __msa_aver_u_b(inp0, res0);
4141    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4142    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4143
4144    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4145    horiz2 = __msa_aver_u_b(inp2, res1);
4146    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4147    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4148    src += (4 * src_stride);
4149    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4150                                         const20, const6, const3);
4151    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4152                                         const20, const6, const3);
4153    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4154
4155    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4156    horiz4 = __msa_aver_u_b(inp0, res0);
4157    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4158    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4159
4160    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4161    horiz6 = __msa_aver_u_b(inp2, res1);
4162    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4163    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4164                                        horiz1, horiz2, horiz3, horiz4,
4165                                        horiz1, horiz0, horiz0, horiz1,
4166                                        horiz2, horiz3, horiz4, horiz5,
4167                                        const20, const6, const3);
4168    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4169    res0 = __msa_aver_u_b(avg0, res0);
4170    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4171                                        horiz3, horiz4, horiz5, horiz6,
4172                                        horiz3, horiz2, horiz1, horiz0,
4173                                        horiz4, horiz5, horiz6, horiz7,
4174                                        const20, const6, const3);
4175    ST_D2(res0, 0, 1, dst, dst_stride);
4176    dst += 2 * dst_stride;
4177
4178    inp0 = LD_UB(src);
4179    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4180                                              const20, const6, const3);
4181    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4182    res1 = __msa_aver_u_b(avg1, res1);
4183    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4184    horiz8 = __msa_aver_u_b(inp0, res0);
4185    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4186                                        horiz5, horiz6, horiz7, horiz8,
4187                                        horiz5, horiz4, horiz3, horiz2,
4188                                        horiz6, horiz7, horiz8, horiz8,
4189                                        const20, const6, const3);
4190    ST_D2(res1, 0, 1, dst, dst_stride);
4191    dst += 2 * dst_stride;
4192
4193    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4194    res0 = __msa_aver_u_b(avg0, res0);
4195    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4196                                        horiz7, horiz8, horiz8, horiz7,
4197                                        horiz7, horiz6, horiz5, horiz4,
4198                                        horiz8, horiz8, horiz7, horiz6,
4199                                        const20, const6, const3);
4200    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4201    res1 = __msa_aver_u_b(avg1, res1);
4202    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4203}
4204
/*
 * 16x16 hv qpel: horizontal pass averaged with the unshifted source, followed
 * by the plain vertical pass vert_mc_qpel_16x16_msa().
 */
static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    /* 17 rows of 16 bytes: the horizontal pass writes one row more than the
     * height it is given. */
    uint8_t hpass[17 * 16];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, hpass, 16, 16);
    vert_mc_qpel_16x16_msa(hpass, 16, dst, dst_stride);
}
4215
4216static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src,
4217                                           int32_t src_stride,
4218                                           uint8_t *dst,
4219                                           int32_t dst_stride)
4220{
4221    v16u8 inp0, inp1, inp2, inp3;
4222    v16u8 res0, res1;
4223    v16u8 horiz0, horiz1, horiz2, horiz3;
4224    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4225    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4226    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4227    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4228    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4229    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4230    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4231    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4232
4233    LD_UB2(src, src_stride, inp0, inp1);
4234    src += (2 * src_stride);
4235    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4236                                         const20, const6, const3);
4237    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4238    horiz0 = __msa_aver_u_b(inp0, res0);
4239    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4240
4241    LD_UB2(src, src_stride, inp2, inp3);
4242    src += (2 * src_stride);
4243    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4244                                         const20, const6, const3);
4245    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4246    horiz2 = __msa_aver_u_b(inp2, res1);
4247    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4248    LD_UB2(src, src_stride, inp0, inp1);
4249    src += (2 * src_stride);
4250    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4251                                         const20, const6, const3);
4252    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4253    horiz4 = __msa_aver_u_b(inp0, res0);
4254    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4255    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4256                                        horiz1, horiz2, horiz3, horiz4,
4257                                        horiz1, horiz0, horiz0, horiz1,
4258                                        horiz2, horiz3, horiz4, horiz5,
4259                                        const20, const6, const3);
4260    ST_D2(res0, 0, 1, dst, dst_stride);
4261    dst += (2 * dst_stride);
4262
4263    LD_UB2(src, src_stride, inp2, inp3);
4264    src += (2 * src_stride);
4265    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4266                                         const20, const6, const3);
4267    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4268    horiz6 = __msa_aver_u_b(inp2, res1);
4269    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4270    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4271                                        horiz3, horiz4, horiz5, horiz6,
4272                                        horiz3, horiz2, horiz1, horiz0,
4273                                        horiz4, horiz5, horiz6, horiz7,
4274                                        const20, const6, const3);
4275    inp0 = LD_UB(src);
4276    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4277                                              const20, const6, const3);
4278    horiz8 = __msa_aver_u_b(inp0, res0);
4279    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4280                                        horiz5, horiz6, horiz7, horiz8,
4281                                        horiz5, horiz4, horiz3, horiz2,
4282                                        horiz6, horiz7, horiz8, horiz8,
4283                                        const20, const6, const3);
4284    ST_D2(res1, 0, 1, dst, dst_stride);
4285    dst += 2 * dst_stride;
4286
4287    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4288                                        horiz7, horiz8, horiz8, horiz7,
4289                                        horiz7, horiz6, horiz5, horiz4,
4290                                        horiz8, horiz8, horiz7, horiz6,
4291                                        const20, const6, const3);
4292    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4293}
4294
/*
 * 16x16 hv qpel without any averaging: plain horizontal pass followed by the
 * plain vertical pass vert_mc_qpel_16x16_msa().
 */
static void hv_mc_qpel_16x16_msa(const uint8_t *src,
                                 int32_t src_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride)
{
    /* 17 rows of 16 bytes: the horizontal pass writes one row more than the
     * height it is given. */
    uint8_t hpass[17 * 16];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, hpass, 16, 16);
    vert_mc_qpel_16x16_msa(hpass, 16, dst, dst_stride);
}
4305
4306static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
4307                               uint8_t *dst, int32_t dst_stride)
4308{
4309    v16u8 inp0, inp1, inp2, inp3;
4310    v16u8 res0, res1;
4311    v16u8 horiz0, horiz1, horiz2, horiz3;
4312    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4313    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4314    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4315    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4316    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4317    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4318    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4319    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4320
4321    LD_UB2(src, src_stride, inp0, inp1);
4322    src += (2 * src_stride);
4323    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4324                                           mask0, mask1, mask2, mask3,
4325                                           const20, const6, const3);
4326    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4327    LD_UB2(src, src_stride, inp2, inp3);
4328    src += (2 * src_stride);
4329    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4330                                           mask0, mask1, mask2, mask3,
4331                                           const20, const6, const3);
4332    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4333    LD_UB2(src, src_stride, inp0, inp1);
4334    src += (2 * src_stride);
4335    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4336                                           mask0, mask1, mask2, mask3,
4337                                           const20, const6, const3);
4338    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4339    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4340                                        horiz1, horiz2, horiz3, horiz4,
4341                                        horiz1, horiz0, horiz0, horiz1,
4342                                        horiz2, horiz3, horiz4, horiz5,
4343                                        const20, const6, const3);
4344    ST_D2(res0, 0, 1, dst, dst_stride);
4345    dst += (2 * dst_stride);
4346
4347    LD_UB2(src, src_stride, inp2, inp3);
4348    src += (2 * src_stride);
4349    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4350                                           mask0, mask1, mask2, mask3,
4351                                           const20, const6, const3);
4352    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4353    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4354                                        horiz3, horiz4, horiz5, horiz6,
4355                                        horiz3, horiz2, horiz1, horiz0,
4356                                        horiz4, horiz5, horiz6, horiz7,
4357                                        const20, const6, const3);
4358    inp0 = LD_UB(src);
4359    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4360                                                mask0, mask1, mask2, mask3,
4361                                                const20, const6, const3);
4362    ST_D2(res1, 0, 1, dst, dst_stride);
4363    dst += 2 * dst_stride;
4364
4365    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4366                                        horiz5, horiz6, horiz7, horiz8,
4367                                        horiz5, horiz4, horiz3, horiz2,
4368                                        horiz6, horiz7, horiz8, horiz8,
4369                                        const20, const6, const3);
4370    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4371                                        horiz7, horiz8, horiz8, horiz7,
4372                                        horiz7, horiz6, horiz5, horiz4,
4373                                        horiz8, horiz8, horiz7, horiz6,
4374                                        const20, const6, const3);
4375    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4376}
4377
/*
 * 16x16 two-pass path: hv_mc_qpel_aver_horiz_src1_16x16_msa() fills a
 * temporary stack buffer from src, then vert_mc_qpel_16x16_msa() reads that
 * buffer and writes the result to dst.
 */
static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    /* 272 bytes == 17 * 16; presumably 16 + 1 rows of 16 bytes each --
     * confirm against the horizontal helper. */
    uint8_t tmp_rows[17 * 16];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp_rows, 16, 16);
    vert_mc_qpel_16x16_msa(tmp_rows, 16, dst, dst_stride);
}
4388
4389static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src,
4390                                           int32_t src_stride,
4391                                           uint8_t *dst,
4392                                           int32_t dst_stride)
4393{
4394    v16u8 inp0, inp1, inp2, inp3;
4395    v16u8 res0, res1;
4396    v16u8 horiz0, horiz1, horiz2, horiz3;
4397    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4398    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4399    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4400    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4401    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4402    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4403    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4404    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4405
4406    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4407    src += (4 * src_stride);
4408
4409    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4410                                         const20, const6, const3);
4411    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4412                                         const20, const6, const3);
4413    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4414
4415    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4416    horiz0 = __msa_aver_u_b(inp0, res0);
4417    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4418    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4419
4420    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4421    horiz2 = __msa_aver_u_b(inp2, res1);
4422    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4423    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4424    src += (4 * src_stride);
4425    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4426                                         const20, const6, const3);
4427    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4428                                         const20, const6, const3);
4429    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4430
4431    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4432    horiz4 = __msa_aver_u_b(inp0, res0);
4433    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4434    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4435
4436    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4437    horiz6 = __msa_aver_u_b(inp2, res1);
4438    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4439    inp0 = LD_UB(src);
4440    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4441                                              const20, const6, const3);
4442    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4443    horiz8 = __msa_aver_u_b(inp0, res0);
4444    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4445                                        horiz1, horiz2, horiz3, horiz4,
4446                                        horiz1, horiz0, horiz0, horiz1,
4447                                        horiz2, horiz3, horiz4, horiz5,
4448                                        const20, const6, const3);
4449    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4450                                        horiz3, horiz4, horiz5, horiz6,
4451                                        horiz3, horiz2, horiz1, horiz0,
4452                                        horiz4, horiz5, horiz6, horiz7,
4453                                        const20, const6, const3);
4454    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4455    dst += (4 * dst_stride);
4456
4457    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4458                                        horiz5, horiz6, horiz7, horiz8,
4459                                        horiz5, horiz4, horiz3, horiz2,
4460                                        horiz6, horiz7, horiz8, horiz8,
4461                                        const20, const6, const3);
4462    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4463                                        horiz7, horiz8, horiz8, horiz7,
4464                                        horiz7, horiz6, horiz5, horiz4,
4465                                        horiz8, horiz8, horiz7, horiz6,
4466                                        const20, const6, const3);
4467    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4468}
4469
/*
 * 16x16 two-pass path: hv_mc_qpel_aver_horiz_src0_16x16_msa() fills a
 * temporary stack buffer from src, then vert_mc_qpel_aver_src1_16x16_msa()
 * reads that buffer and writes the result to dst.
 */
static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* 272 bytes == 17 * 16; presumably 16 + 1 rows of 16 bytes each --
     * confirm against the horizontal helper. */
    uint8_t tmp_rows[17 * 16];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp_rows, 16, 16);
    vert_mc_qpel_aver_src1_16x16_msa(tmp_rows, 16, dst, dst_stride);
}
4480
4481static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src,
4482                                             int32_t src_stride,
4483                                             uint8_t *dst,
4484                                             int32_t dst_stride)
4485{
4486    v16u8 inp0, inp1, inp2, inp3;
4487    v16u8 res0, res1, avg0, avg1;
4488    v16u8 horiz0, horiz1, horiz2, horiz3;
4489    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4490    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4491    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4492    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4493    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4494    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4495    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4496    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4497
4498    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4499    src += (4 * src_stride);
4500
4501    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4502                                         const20, const6, const3);
4503    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4504                                         const20, const6, const3);
4505    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4506    horiz0 = __msa_aver_u_b(inp0, res0);
4507    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4508    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4509    horiz2 = __msa_aver_u_b(inp2, res1);
4510    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4511    LD_UB2(src, src_stride, inp0, inp1);
4512    src += (2 * src_stride);
4513
4514    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4515                                         const20, const6, const3);
4516    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4517    horiz4 = __msa_aver_u_b(inp0, res0);
4518    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4519    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4520                                        horiz1, horiz2, horiz3, horiz4,
4521                                        horiz1, horiz0, horiz0, horiz1,
4522                                        horiz2, horiz3, horiz4, horiz5,
4523                                        const20, const6, const3);
4524    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4525    res0 = __msa_aver_u_b(avg0, res0);
4526    ST_D2(res0, 0, 1, dst, dst_stride);
4527    dst += (2 * dst_stride);
4528
4529    LD_UB2(src, src_stride, inp2, inp3);
4530    src += (2 * src_stride);
4531    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4532                                         const20, const6, const3);
4533    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4534    horiz6 = __msa_aver_u_b(inp2, res1);
4535    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4536    inp0 = LD_UB(src);
4537    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4538                                              const20, const6, const3);
4539    horiz8 = __msa_aver_u_b(inp0, res0);
4540    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4541                                        horiz3, horiz4, horiz5, horiz6,
4542                                        horiz3, horiz2, horiz1, horiz0,
4543                                        horiz4, horiz5, horiz6, horiz7,
4544                                        const20, const6, const3);
4545    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4546    res1 = __msa_aver_u_b(avg1, res1);
4547    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4548                                        horiz5, horiz6, horiz7, horiz8,
4549                                        horiz5, horiz4, horiz3, horiz2,
4550                                        horiz6, horiz7, horiz8, horiz8,
4551                                        const20, const6, const3);
4552    ST_D2(res1, 0, 1, dst, dst_stride);
4553    dst += 2 * dst_stride;
4554
4555    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4556    res0 = __msa_aver_u_b(avg0, res0);
4557    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4558                                        horiz7, horiz8, horiz8, horiz7,
4559                                        horiz7, horiz6, horiz5, horiz4,
4560                                        horiz8, horiz8, horiz7, horiz6,
4561                                        const20, const6, const3);
4562    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4563    res1 = __msa_aver_u_b(avg1, res1);
4564    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4565}
4566
/*
 * 16x16 two-pass path: hv_mc_qpel_aver_horiz_16x16_msa() fills a temporary
 * stack buffer from src, then vert_mc_qpel_aver_src1_16x16_msa() reads that
 * buffer and writes the result to dst.
 */
static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    /* 272 bytes == 17 * 16; presumably 16 + 1 rows of 16 bytes each --
     * confirm against the horizontal helper. */
    uint8_t tmp_rows[17 * 16];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_rows, 16, 16);
    vert_mc_qpel_aver_src1_16x16_msa(tmp_rows, 16, dst, dst_stride);
}
4577
4578static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
4579                                           int32_t src_stride,
4580                                           uint8_t *dst,
4581                                           int32_t dst_stride)
4582{
4583    v16u8 inp0, inp1, inp2, inp3;
4584    v16u8 res0, res1, avg0, avg1;
4585    v16u8 horiz0, horiz1, horiz2, horiz3;
4586    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4587    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4588    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4589    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4590    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4591    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4592    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4593    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4594
4595    LD_UB2(src, src_stride, inp0, inp1);
4596    src += (2 * src_stride);
4597    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4598                                           mask0, mask1, mask2, mask3,
4599                                           const20, const6, const3);
4600    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4601    LD_UB2(src, src_stride, inp2, inp3);
4602    src += (2 * src_stride);
4603    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4604                                           mask0, mask1, mask2, mask3,
4605                                           const20, const6, const3);
4606    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4607    LD_UB2(src, src_stride, inp0, inp1);
4608    src += (2 * src_stride);
4609    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4610                                           mask0, mask1, mask2, mask3,
4611                                           const20, const6, const3);
4612    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4613    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4614    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4615                                        horiz1, horiz2, horiz3, horiz4,
4616                                        horiz1, horiz0, horiz0, horiz1,
4617                                        horiz2, horiz3, horiz4, horiz5,
4618                                        const20, const6, const3);
4619    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4620    res0 = __msa_aver_u_b(avg0, res0);
4621    ST_D2(res0, 0, 1, dst, dst_stride);
4622    dst += (2 * dst_stride);
4623
4624    LD_UB2(src, src_stride, inp2, inp3);
4625    src += (2 * src_stride);
4626    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4627                                           mask0, mask1, mask2, mask3,
4628                                           const20, const6, const3);
4629    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4630    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4631                                        horiz3, horiz4, horiz5, horiz6,
4632                                        horiz3, horiz2, horiz1, horiz0,
4633                                        horiz4, horiz5, horiz6, horiz7,
4634                                        const20, const6, const3);
4635    inp0 = LD_UB(src);
4636    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4637                                                mask0, mask1, mask2, mask3,
4638                                                const20, const6, const3);
4639    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4640    res1 = __msa_aver_u_b(avg1, res1);
4641    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4642                                        horiz5, horiz6, horiz7, horiz8,
4643                                        horiz5, horiz4, horiz3, horiz2,
4644                                        horiz6, horiz7, horiz8, horiz8,
4645                                        const20, const6, const3);
4646    ST_D2(res1, 0, 1, dst, dst_stride);
4647    dst += 2 * dst_stride;
4648    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4649    res0 = __msa_aver_u_b(avg0, res0);
4650
4651    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4652                                        horiz7, horiz8, horiz8, horiz7,
4653                                        horiz7, horiz6, horiz5, horiz4,
4654                                        horiz8, horiz8, horiz7, horiz6,
4655                                        const20, const6, const3);
4656    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4657    res1 = __msa_aver_u_b(avg1, res1);
4658    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4659}
4660
/*
 * 16x16 two-pass path: hv_mc_qpel_aver_horiz_src1_16x16_msa() fills a
 * temporary stack buffer from src, then vert_mc_qpel_aver_src1_16x16_msa()
 * reads that buffer and writes the result to dst.
 */
static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* 272 bytes == 17 * 16; presumably 16 + 1 rows of 16 bytes each --
     * confirm against the horizontal helper. */
    uint8_t tmp_rows[17 * 16];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp_rows, 16, 16);
    vert_mc_qpel_aver_src1_16x16_msa(tmp_rows, 16, dst, dst_stride);
}
4671
4672static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src,
4673                                             int32_t src_stride,
4674                                             uint8_t *dst, int32_t dst_stride)
4675{
4676    v16u8 inp0, inp1, inp2, inp3;
4677    v16u8 res0, res1, avg0, avg1;
4678    v16u8 horiz0, horiz1, horiz2, horiz3;
4679    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4680    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4681    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4682    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4683    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4684    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4685    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4686    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4687
4688    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4689    src += (4 * src_stride);
4690    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4691                                         mask0, mask1, mask2, mask3,
4692                                         const20, const6, const3);
4693    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4694
4695    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4696    horiz0 = __msa_aver_u_b(inp0, res0);
4697    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4698    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4699                                         const20, const6, const3);
4700    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4701
4702    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4703    horiz2 = __msa_aver_u_b(inp2, res1);
4704    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4705    LD_UB2(src, src_stride, inp0, inp1);
4706    src += (2 * src_stride);
4707    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4708                                         const20, const6, const3);
4709    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4710
4711    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4712    horiz4 = __msa_aver_u_b(inp0, res0);
4713    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4714    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4715                                        horiz1, horiz2, horiz3, horiz4,
4716                                        horiz1, horiz0, horiz0, horiz1,
4717                                        horiz2, horiz3, horiz4, horiz5,
4718                                        const20, const6, const3);
4719    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
4720    res0 = __msa_aver_u_b(avg0, res0);
4721    LD_UB2(src, src_stride, inp2, inp3);
4722    src += (2 * src_stride);
4723    ST_D2(res0, 0, 1, dst, dst_stride);
4724    dst += 2 * dst_stride;
4725
4726    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4727                                         const20, const6, const3);
4728    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4729
4730    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4731    horiz6 = __msa_aver_u_b(inp2, res1);
4732    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4733    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4734                                        horiz3, horiz4, horiz5, horiz6,
4735                                        horiz3, horiz2, horiz1, horiz0,
4736                                        horiz4, horiz5, horiz6, horiz7,
4737                                        const20, const6, const3);
4738    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
4739    res1 = __msa_aver_u_b(avg1, res1);
4740    inp0 = LD_UB(src);
4741    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4742                                              const20, const6, const3);
4743    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4744    horiz8 = __msa_aver_u_b(inp0, res0);
4745    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4746                                        horiz5, horiz6, horiz7, horiz8,
4747                                        horiz5, horiz4, horiz3, horiz2,
4748                                        horiz6, horiz7, horiz8, horiz8,
4749                                        const20, const6, const3);
4750    ST_D2(res1, 0, 1, dst, dst_stride);
4751    dst += 2 * dst_stride;
4752
4753    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
4754    res0 = __msa_aver_u_b(avg0, res0);
4755    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4756                                        horiz7, horiz8, horiz8, horiz7,
4757                                        horiz7, horiz6, horiz5, horiz4,
4758                                        horiz8, horiz8, horiz7, horiz6,
4759                                        const20, const6, const3);
4760    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
4761    res1 = __msa_aver_u_b(avg1, res1);
4762    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4763}
4764
/*
 * 16x16 two-pass path: hv_mc_qpel_aver_horiz_src0_16x16_msa() fills a
 * temporary stack buffer from src, then
 * vert_mc_qpel_avg_dst_aver_src0_16x16_msa() reads that buffer and updates
 * dst.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* 272 bytes == 17 * 16; presumably 16 + 1 rows of 16 bytes each --
     * confirm against the horizontal helper. */
    uint8_t tmp_rows[17 * 16];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp_rows, 16, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(tmp_rows, 16, dst, dst_stride);
}
4775
4776static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src,
4777                                                     int32_t src_stride,
4778                                                     uint8_t *dst,
4779                                                     int32_t dst_stride)
4780{
4781    v16u8 inp0, inp1, inp2, inp3;
4782    v16u8 res0, res1, avg0, avg1;
4783    v16u8 horiz0, horiz1, horiz2, horiz3;
4784    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4785    v16u8 dst0, dst1;
4786    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4787    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4788    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4789    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4790    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4791    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4792    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4793
4794    LD_UB2(src, src_stride, inp0, inp1);
4795    src += (2 * src_stride);
4796    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4797                                         const20, const6, const3);
4798    LD_UB2(src, src_stride, inp2, inp3);
4799    src += (2 * src_stride);
4800    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4801    horiz0 = __msa_aver_u_b(inp0, res0);
4802    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4803    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4804                                         const20, const6, const3);
4805    LD_UB2(src, src_stride, inp0, inp1);
4806    src += (2 * src_stride);
4807    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4808    horiz2 = __msa_aver_u_b(inp2, res1);
4809    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4810    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4811                                         const20, const6, const3);
4812    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4813    horiz4 = __msa_aver_u_b(inp0, res0);
4814    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4815    LD_UB2(dst, dst_stride, dst0, dst1);
4816    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4817    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4818                                        horiz1, horiz2, horiz3, horiz4,
4819                                        horiz1, horiz0, horiz0, horiz1,
4820                                        horiz2, horiz3, horiz4, horiz5,
4821                                        const20, const6, const3);
4822    res0 = __msa_aver_u_b(avg0, res0);
4823    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4824    res0 = __msa_aver_u_b(avg0, res0);
4825    ST_D2(res0, 0, 1, dst, dst_stride);
4826    dst += (2 * dst_stride);
4827
4828    LD_UB2(src, src_stride, inp2, inp3);
4829    src += (2 * src_stride);
4830    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4831                                         const20, const6, const3);
4832    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4833    horiz6 = __msa_aver_u_b(inp2, res1);
4834    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4835    LD_UB2(dst, dst_stride, dst0, dst1);
4836    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4837    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4838                                        horiz3, horiz4, horiz5, horiz6,
4839                                        horiz3, horiz2, horiz1, horiz0,
4840                                        horiz4, horiz5, horiz6, horiz7,
4841                                        const20, const6, const3);
4842    res1 = __msa_aver_u_b(avg1, res1);
4843    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4844    res1 = __msa_aver_u_b(avg1, res1);
4845    ST_D2(res1, 0, 1, dst, dst_stride);
4846    dst += (2 * dst_stride);
4847
4848    inp0 = LD_UB(src);
4849    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4850                                              const20, const6, const3);
4851    horiz8 = __msa_aver_u_b(inp0, res0);
4852    LD_UB2(dst, dst_stride, dst0, dst1);
4853    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4854    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4855                                        horiz5, horiz6, horiz7, horiz8,
4856                                        horiz5, horiz4, horiz3, horiz2,
4857                                        horiz6, horiz7, horiz8, horiz8,
4858                                        const20, const6, const3);
4859    res0 = __msa_aver_u_b(avg0, res0);
4860    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4861    res0 = __msa_aver_u_b(avg0, res0);
4862    ST_D2(res0, 0, 1, dst, dst_stride);
4863    dst += (2 * dst_stride);
4864
4865    LD_UB2(dst, dst_stride, dst0, dst1);
4866    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4867    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4868                                        horiz7, horiz8, horiz8, horiz7,
4869                                        horiz7, horiz6, horiz5, horiz4,
4870                                        horiz8, horiz8, horiz7, horiz6,
4871                                        const20, const6, const3);
4872    res1 = __msa_aver_u_b(avg1, res1);
4873    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4874    res1 = __msa_aver_u_b(avg1, res1);
4875    ST_D2(res1, 0, 1, dst, dst_stride);
4876}
4877
/*
 * 16x16 two-pass path: hv_mc_qpel_aver_horiz_16x16_msa() fills a temporary
 * stack buffer from src, then vert_mc_qpel_avg_dst_aver_src0_16x16_msa()
 * reads that buffer and updates dst.
 */
static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* 272 bytes == 17 * 16; presumably 16 + 1 rows of 16 bytes each --
     * confirm against the horizontal helper. */
    uint8_t tmp_rows[17 * 16];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_rows, 16, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(tmp_rows, 16, dst, dst_stride);
}
4888
4889static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src,
4890                                                   int32_t src_stride,
4891                                                   uint8_t *dst,
4892                                                   int32_t dst_stride)
4893{
4894    v16u8 inp0, inp1, inp2, inp3;
4895    v16u8 res0, res1, avg0, avg1;
4896    v16u8 horiz0, horiz1, horiz2, horiz3;
4897    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4898    v16u8 dst0, dst1;
4899    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4900    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4901    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4902    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4903    v16u8 const20 = (v16u8) __msa_ldi_b(20);
4904    v16u8 const6 = (v16u8) __msa_ldi_b(6);
4905    v16u8 const3 = (v16u8) __msa_ldi_b(3);
4906
4907    LD_UB2(src, src_stride, inp0, inp1);
4908    src += (2 * src_stride);
4909    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4910                                           mask0, mask1, mask2, mask3,
4911                                           const20, const6, const3);
4912    LD_UB2(src, src_stride, inp2, inp3);
4913    src += (2 * src_stride);
4914    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4915    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4916                                           mask0, mask1, mask2, mask3,
4917                                           const20, const6, const3);
4918    LD_UB2(src, src_stride, inp0, inp1);
4919    src += (2 * src_stride);
4920    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4921    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4922                                           mask0, mask1, mask2, mask3,
4923                                           const20, const6, const3);
4924    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4925    LD_UB2(dst, dst_stride, dst0, dst1);
4926    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4927    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4928                                        horiz1, horiz2, horiz3, horiz4,
4929                                        horiz1, horiz0, horiz0, horiz1,
4930                                        horiz2, horiz3, horiz4, horiz5,
4931                                        const20, const6, const3);
4932    res0 = __msa_aver_u_b(avg0, res0);
4933    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4934    res0 = __msa_aver_u_b(avg0, res0);
4935    ST_D2(res0, 0, 1, dst, dst_stride);
4936    dst += (2 * dst_stride);
4937
4938    LD_UB2(src, src_stride, inp2, inp3);
4939    src += (2 * src_stride);
4940    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4941                                           mask0, mask1, mask2, mask3,
4942                                           const20, const6, const3);
4943    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4944    LD_UB2(dst, dst_stride, dst0, dst1);
4945    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4946    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4947                                        horiz3, horiz4, horiz5, horiz6,
4948                                        horiz3, horiz2, horiz1, horiz0,
4949                                        horiz4, horiz5, horiz6, horiz7,
4950                                        const20, const6, const3);
4951    res1 = __msa_aver_u_b(avg1, res1);
4952    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4953    res1 = __msa_aver_u_b(avg1, res1);
4954    ST_D2(res1, 0, 1, dst, dst_stride);
4955    dst += (2 * dst_stride);
4956
4957    inp0 = LD_UB(src);
4958    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4959                                                mask0, mask1, mask2, mask3,
4960                                                const20, const6, const3);
4961    LD_UB2(dst, dst_stride, dst0, dst1);
4962    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4963    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4964                                        horiz5, horiz6, horiz7, horiz8,
4965                                        horiz5, horiz4, horiz3, horiz2,
4966                                        horiz6, horiz7, horiz8, horiz8,
4967                                        const20, const6, const3);
4968    res0 = __msa_aver_u_b(avg0, res0);
4969    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4970    res0 = __msa_aver_u_b(avg0, res0);
4971    ST_D2(res0, 0, 1, dst, dst_stride);
4972    dst += (2 * dst_stride);
4973
4974    LD_UB2(dst, dst_stride, dst0, dst1);
4975    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4976    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4977                                        horiz7, horiz8, horiz8, horiz7,
4978                                        horiz7, horiz6, horiz5, horiz4,
4979                                        horiz8, horiz8, horiz7, horiz6,
4980                                        const20, const6, const3);
4981    res1 = __msa_aver_u_b(avg1, res1);
4982    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4983    res1 = __msa_aver_u_b(avg1, res1);
4984    ST_D2(res1, 0, 1, dst, dst_stride);
4985}
4986
/*
 * 16x16 entry point built from two helper calls:
 *   1. hv_mc_qpel_aver_horiz_src1_16x16_msa() processes src into a local
 *      scratch buffer (stride 16, height argument 16);
 *   2. vert_mc_qpel_avg_dst_aver_src0_16x16_msa() consumes that buffer and
 *      writes to dst using dst_stride.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* 272 bytes = 17 * 16; presumably 17 rows at the 16-byte stride used
     * below (the helper that fills it is not visible here). */
    uint8_t horiz_out[17 * 16];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(horiz_out, 16, dst, dst_stride);
}
4997
4998static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src,
4999                                                     int32_t src_stride,
5000                                                     uint8_t *dst,
5001                                                     int32_t dst_stride)
5002{
5003    v16u8 inp0, inp1, inp2, inp3;
5004    v16u8 res0, res1, avg0, avg1;
5005    v16u8 horiz0, horiz1, horiz2, horiz3;
5006    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5007    v16u8 dst0, dst1;
5008    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5009    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5010    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5011    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5012    v16u8 const20 = (v16u8) __msa_ldi_b(20);
5013    v16u8 const6 = (v16u8) __msa_ldi_b(6);
5014    v16u8 const3 = (v16u8) __msa_ldi_b(3);
5015
5016    LD_UB2(src, src_stride, inp0, inp1);
5017    src += (2 * src_stride);
5018    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5019                                         const20, const6, const3);
5020
5021    LD_UB2(src, src_stride, inp2, inp3);
5022    src += (2 * src_stride);
5023    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5024
5025    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5026    horiz0 = __msa_aver_u_b(inp0, res0);
5027    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5028    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5029                                         const20, const6, const3);
5030    LD_UB2(src, src_stride, inp0, inp1);
5031    src += (2 * src_stride);
5032    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5033
5034    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5035    horiz2 = __msa_aver_u_b(inp2, res1);
5036    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5037    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5038                                         const20, const6, const3);
5039
5040    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5041
5042    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5043    horiz4 = __msa_aver_u_b(inp0, res0);
5044    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5045    LD_UB2(dst, dst_stride, dst0, dst1);
5046    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
5047    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5048                                        horiz1, horiz2, horiz3, horiz4,
5049                                        horiz1, horiz0, horiz0, horiz1,
5050                                        horiz2, horiz3, horiz4, horiz5,
5051                                        const20, const6, const3);
5052    res0 = __msa_aver_u_b(avg0, res0);
5053    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5054    res0 = __msa_aver_u_b(avg0, res0);
5055    ST_D2(res0, 0, 1, dst, dst_stride);
5056    dst += (2 * dst_stride);
5057
5058    LD_UB2(src, src_stride, inp2, inp3);
5059    src += (2 * src_stride);
5060    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5061                                         const20, const6, const3);
5062
5063    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5064
5065    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5066    horiz6 = __msa_aver_u_b(inp2, res1);
5067    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5068    LD_UB2(dst, dst_stride, dst0, dst1);
5069    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
5070    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5071                                        horiz3, horiz4, horiz5, horiz6,
5072                                        horiz3, horiz2, horiz1, horiz0,
5073                                        horiz4, horiz5, horiz6, horiz7,
5074                                        const20, const6, const3);
5075    res1 = __msa_aver_u_b(avg1, res1);
5076    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5077    res1 = __msa_aver_u_b(avg1, res1);
5078    ST_D2(res1, 0, 1, dst, dst_stride);
5079    dst += (2 * dst_stride);
5080
5081    inp0 = LD_UB(src);
5082    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5083                                              const20, const6, const3);
5084    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5085    horiz8 = __msa_aver_u_b(inp0, res0);
5086    LD_UB2(dst, dst_stride, dst0, dst1);
5087    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5088    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5089                                        horiz5, horiz6, horiz7, horiz8,
5090                                        horiz5, horiz4, horiz3, horiz2,
5091                                        horiz6, horiz7, horiz8, horiz8,
5092                                        const20, const6, const3);
5093    res0 = __msa_aver_u_b(avg0, res0);
5094    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5095    res0 = __msa_aver_u_b(avg0, res0);
5096    ST_D2(res0, 0, 1, dst, dst_stride);
5097    dst += (2 * dst_stride);
5098
5099    LD_UB2(dst, dst_stride, dst0, dst1);
5100    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5101    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5102                                        horiz7, horiz8, horiz8, horiz7,
5103                                        horiz7, horiz6, horiz5, horiz4,
5104                                        horiz8, horiz8, horiz7, horiz6,
5105                                        const20, const6, const3);
5106    res1 = __msa_aver_u_b(avg1, res1);
5107    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5108    res1 = __msa_aver_u_b(avg1, res1);
5109    ST_D2(res1, 0, 1, dst, dst_stride);
5110}
5111
/*
 * 16x16 entry point built from two helper calls:
 *   1. hv_mc_qpel_aver_horiz_src0_16x16_msa() processes src into a local
 *      scratch buffer (stride 16, height argument 16);
 *   2. vert_mc_qpel_avg_dst_16x16_msa() consumes that buffer and writes to
 *      dst using dst_stride.
 */
static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* 272 bytes = 17 * 16; presumably 17 rows at the 16-byte stride used
     * below (the helper that fills it is not visible here). */
    uint8_t horiz_out[17 * 16];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5122
5123static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src,
5124                                                   int32_t src_stride,
5125                                                   uint8_t *dst,
5126                                                   int32_t dst_stride)
5127{
5128    v16u8 inp0, inp1, inp2, inp3;
5129    v16u8 res0, res1, avg0, avg1;
5130    v16u8 horiz0, horiz1, horiz2, horiz3;
5131    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5132    v16u8 dst0, dst1;
5133    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5134    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5135    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5136    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5137    v16u8 const20 = (v16u8) __msa_ldi_b(20);
5138    v16u8 const6 = (v16u8) __msa_ldi_b(6);
5139    v16u8 const3 = (v16u8) __msa_ldi_b(3);
5140
5141    LD_UB2(src, src_stride, inp0, inp1);
5142    src += (2 * src_stride);
5143    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5144                                         const20, const6, const3);
5145    LD_UB2(src, src_stride, inp2, inp3);
5146    src += (2 * src_stride);
5147    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5148    horiz0 = __msa_aver_u_b(inp0, res0);
5149    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5150    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5151                                         const20, const6, const3);
5152    LD_UB2(src, src_stride, inp0, inp1);
5153    src += (2 * src_stride);
5154    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5155    horiz2 = __msa_aver_u_b(inp2, res1);
5156    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5157    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5158                                         const20, const6, const3);
5159    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5160    horiz4 = __msa_aver_u_b(inp0, res0);
5161    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5162    LD_UB2(dst, dst_stride, dst0, dst1);
5163    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5164                                        horiz1, horiz2, horiz3, horiz4,
5165                                        horiz1, horiz0, horiz0, horiz1,
5166                                        horiz2, horiz3, horiz4, horiz5,
5167                                        const20, const6, const3);
5168    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5169    res0 = __msa_aver_u_b(avg0, res0);
5170    ST_D2(res0, 0, 1, dst, dst_stride);
5171    dst += (2 * dst_stride);
5172
5173    LD_UB2(src, src_stride, inp2, inp3);
5174    src += (2 * src_stride);
5175    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5176                                         const20, const6, const3);
5177    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5178    horiz6 = __msa_aver_u_b(inp2, res1);
5179    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5180    LD_UB2(dst, dst_stride, dst0, dst1);
5181    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5182                                        horiz3, horiz4, horiz5, horiz6,
5183                                        horiz3, horiz2, horiz1, horiz0,
5184                                        horiz4, horiz5, horiz6, horiz7,
5185                                        const20, const6, const3);
5186    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5187    res1 = __msa_aver_u_b(avg1, res1);
5188    ST_D2(res1, 0, 1, dst, dst_stride);
5189    dst += (2 * dst_stride);
5190
5191    inp0 = LD_UB(src);
5192    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5193                                              const20, const6, const3);
5194    horiz8 = __msa_aver_u_b(inp0, res0);
5195    LD_UB2(dst, dst_stride, dst0, dst1);
5196    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5197                                        horiz5, horiz6, horiz7, horiz8,
5198                                        horiz5, horiz4, horiz3, horiz2,
5199                                        horiz6, horiz7, horiz8, horiz8,
5200                                        const20, const6, const3);
5201    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5202    res0 = __msa_aver_u_b(avg0, res0);
5203    ST_D2(res0, 0, 1, dst, dst_stride);
5204    dst += (2 * dst_stride);
5205
5206    LD_UB2(dst, dst_stride, dst0, dst1);
5207    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5208                                        horiz7, horiz8, horiz8, horiz7,
5209                                        horiz7, horiz6, horiz5, horiz4,
5210                                        horiz8, horiz8, horiz7, horiz6,
5211                                        const20, const6, const3);
5212    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5213    res1 = __msa_aver_u_b(avg1, res1);
5214    ST_D2(res1, 0, 1, dst, dst_stride);
5215}
5216
/*
 * 16x16 entry point built from two helper calls:
 *   1. hv_mc_qpel_aver_horiz_16x16_msa() processes src into a local scratch
 *      buffer (stride 16, height argument 16);
 *   2. vert_mc_qpel_avg_dst_16x16_msa() consumes that buffer and writes to
 *      dst using dst_stride.
 */
static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    /* 272 bytes = 17 * 16; presumably 17 rows at the 16-byte stride used
     * below (the helper that fills it is not visible here). */
    uint8_t horiz_out[17 * 16];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5226
5227static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
5228                                       uint8_t *dst, int32_t dst_stride)
5229{
5230    v16u8 inp0, inp1, inp2, inp3;
5231    v16u8 res0, res1, avg0, avg1;
5232    v16u8 horiz0, horiz1, horiz2, horiz3;
5233    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5234    v16u8 dst0, dst1;
5235    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5236    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5237    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5238    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5239    v16u8 const20 = (v16u8) __msa_ldi_b(20);
5240    v16u8 const6 = (v16u8) __msa_ldi_b(6);
5241    v16u8 const3 = (v16u8) __msa_ldi_b(3);
5242
5243    LD_UB2(src, src_stride, inp0, inp1);
5244    src += (2 * src_stride);
5245    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5246                                           mask0, mask1, mask2, mask3,
5247                                           const20, const6, const3);
5248    LD_UB2(src, src_stride, inp2, inp3);
5249    src += (2 * src_stride);
5250    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5251    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5252                                           mask0, mask1, mask2, mask3,
5253                                           const20, const6, const3);
5254    LD_UB2(src, src_stride, inp0, inp1);
5255    src += (2 * src_stride);
5256    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5257    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5258                                           mask0, mask1, mask2, mask3,
5259                                           const20, const6, const3);
5260    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5261    LD_UB2(src, src_stride, inp2, inp3);
5262    src += (2 * src_stride);
5263    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5264                                           mask0, mask1, mask2, mask3,
5265                                           const20, const6, const3);
5266    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5267    inp0 = LD_UB(src);
5268    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5269                                                mask0, mask1, mask2, mask3,
5270                                                const20, const6, const3);
5271    LD_UB2(dst, dst_stride, dst0, dst1);
5272    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5273                                        horiz1, horiz2, horiz3, horiz4,
5274                                        horiz1, horiz0, horiz0, horiz1,
5275                                        horiz2, horiz3, horiz4, horiz5,
5276                                        const20, const6, const3);
5277    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5278    res0 = __msa_aver_u_b(avg0, res0);
5279    ST_D2(res0, 0, 1, dst, dst_stride);
5280    dst += (2 * dst_stride);
5281
5282    LD_UB2(dst, dst_stride, dst0, dst1);
5283    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5284                                        horiz3, horiz4, horiz5, horiz6,
5285                                        horiz3, horiz2, horiz1, horiz0,
5286                                        horiz4, horiz5, horiz6, horiz7,
5287                                        const20, const6, const3);
5288    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5289    res1 = __msa_aver_u_b(avg1, res1);
5290    ST_D2(res1, 0, 1, dst, dst_stride);
5291    dst += (2 * dst_stride);
5292
5293    LD_UB2(dst, dst_stride, dst0, dst1);
5294    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5295                                        horiz5, horiz6, horiz7, horiz8,
5296                                        horiz5, horiz4, horiz3, horiz2,
5297                                        horiz6, horiz7, horiz8, horiz8,
5298                                        const20, const6, const3);
5299    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5300    res0 = __msa_aver_u_b(avg0, res0);
5301    ST_D2(res0, 0, 1, dst, dst_stride);
5302    dst += (2 * dst_stride);
5303
5304    LD_UB2(dst, dst_stride, dst0, dst1);
5305    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5306                                        horiz7, horiz8, horiz8, horiz7,
5307                                        horiz7, horiz6, horiz5, horiz4,
5308                                        horiz8, horiz8, horiz7, horiz6,
5309                                        const20, const6, const3);
5310    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5311    res1 = __msa_aver_u_b(avg1, res1);
5312    ST_D2(res1, 0, 1, dst, dst_stride);
5313}
5314
/*
 * 16x16 entry point built from two helper calls:
 *   1. hv_mc_qpel_aver_horiz_src1_16x16_msa() processes src into a local
 *      scratch buffer (stride 16, height argument 16);
 *   2. vert_mc_qpel_avg_dst_16x16_msa() consumes that buffer and writes to
 *      dst using dst_stride.
 */
static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* 272 bytes = 17 * 16; presumably 17 rows at the 16-byte stride used
     * below (the helper that fills it is not visible here). */
    uint8_t horiz_out[17 * 16];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5325
5326static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src,
5327                                                   int32_t src_stride,
5328                                                   uint8_t *dst,
5329                                                   int32_t dst_stride)
5330{
5331    v16u8 inp0, inp1, inp2, inp3;
5332    v16u8 res0, res1, avg0, avg1;
5333    v16u8 horiz0, horiz1, horiz2, horiz3;
5334    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5335    v16u8 dst0, dst1;
5336    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5337    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5338    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5339    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5340    v16u8 const20 = (v16u8) __msa_ldi_b(20);
5341    v16u8 const6 = (v16u8) __msa_ldi_b(6);
5342    v16u8 const3 = (v16u8) __msa_ldi_b(3);
5343
5344    LD_UB2(src, src_stride, inp0, inp1);
5345    src += (2 * src_stride);
5346    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5347                                         const20, const6, const3);
5348    LD_UB2(src, src_stride, inp2, inp3);
5349    src += (2 * src_stride);
5350    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5351
5352    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5353    horiz0 = __msa_aver_u_b(inp0, res0);
5354    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5355    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5356                                         const20, const6, const3);
5357    LD_UB2(src, src_stride, inp0, inp1);
5358    src += (2 * src_stride);
5359    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5360
5361    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5362    horiz2 = __msa_aver_u_b(inp2, res1);
5363    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5364    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5365                                         const20, const6, const3);
5366
5367    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5368
5369    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5370    horiz4 = __msa_aver_u_b(inp0, res0);
5371    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5372    LD_UB2(dst, dst_stride, dst0, dst1);
5373    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5374                                        horiz1, horiz2, horiz3, horiz4,
5375                                        horiz1, horiz0, horiz0, horiz1,
5376                                        horiz2, horiz3, horiz4, horiz5,
5377                                        const20, const6, const3);
5378    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5379    res0 = __msa_aver_u_b(avg0, res0);
5380    ST_D2(res0, 0, 1, dst, dst_stride);
5381    dst += (2 * dst_stride);
5382
5383    LD_UB2(src, src_stride, inp2, inp3);
5384    src += (2 * src_stride);
5385    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5386                                         const20, const6, const3);
5387
5388    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5389
5390    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5391    horiz6 = __msa_aver_u_b(inp2, res1);
5392    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5393    LD_UB2(dst, dst_stride, dst0, dst1);
5394    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5395                                        horiz3, horiz4, horiz5, horiz6,
5396                                        horiz3, horiz2, horiz1, horiz0,
5397                                        horiz4, horiz5, horiz6, horiz7,
5398                                        const20, const6, const3);
5399    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5400    res1 = __msa_aver_u_b(avg1, res1);
5401    ST_D2(res1, 0, 1, dst, dst_stride);
5402    dst += (2 * dst_stride);
5403
5404    inp0 = LD_UB(src);
5405    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5406                                              const20, const6, const3);
5407    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5408    horiz8 = __msa_aver_u_b(inp0, res0);
5409    LD_UB2(dst, dst_stride, dst0, dst1);
5410    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5411                                        horiz5, horiz6, horiz7, horiz8,
5412                                        horiz5, horiz4, horiz3, horiz2,
5413                                        horiz6, horiz7, horiz8, horiz8,
5414                                        const20, const6, const3);
5415    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5416    res0 = __msa_aver_u_b(avg0, res0);
5417    ST_D2(res0, 0, 1, dst, dst_stride);
5418    dst += (2 * dst_stride);
5419
5420    LD_UB2(dst, dst_stride, dst0, dst1);
5421    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5422                                        horiz7, horiz8, horiz8, horiz7,
5423                                        horiz7, horiz6, horiz5, horiz4,
5424                                        horiz8, horiz8, horiz7, horiz6,
5425                                        const20, const6, const3);
5426    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5427    res1 = __msa_aver_u_b(avg1, res1);
5428    ST_D2(res1, 0, 1, dst, dst_stride);
5429}
5430
/*
 * 16x16 entry point built from two helper calls:
 *   1. hv_mc_qpel_aver_horiz_src0_16x16_msa() processes src into a local
 *      scratch buffer (stride 16, height argument 16);
 *   2. vert_mc_qpel_avg_dst_aver_src1_16x16_msa() consumes that buffer and
 *      writes to dst using dst_stride.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* 272 bytes = 17 * 16; presumably 17 rows at the 16-byte stride used
     * below (the helper that fills it is not visible here). */
    uint8_t horiz_out[17 * 16];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5441
5442static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src,
5443                                                     int32_t src_stride,
5444                                                     uint8_t *dst,
5445                                                     int32_t dst_stride)
5446{
5447    v16u8 inp0, inp1, inp2, inp3;
5448    v16u8 res0, res1, avg0, avg1;
5449    v16u8 horiz0, horiz1, horiz2, horiz3;
5450    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5451    v16u8 dst0, dst1;
5452    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5453    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5454    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5455    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5456    v16u8 const20 = (v16u8) __msa_ldi_b(20);
5457    v16u8 const6 = (v16u8) __msa_ldi_b(6);
5458    v16u8 const3 = (v16u8) __msa_ldi_b(3);
5459
5460    LD_UB2(src, src_stride, inp0, inp1);
5461    src += (2 * src_stride);
5462
5463    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5464                                         const20, const6, const3);
5465    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5466    horiz0 = __msa_aver_u_b(inp0, res0);
5467    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5468    LD_UB2(src, src_stride, inp2, inp3);
5469    src += (2 * src_stride);
5470    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5471                                         const20, const6, const3);
5472    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5473    horiz2 = __msa_aver_u_b(inp2, res1);
5474    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5475    LD_UB2(dst, dst_stride, dst0, dst1);
5476    LD_UB2(src, src_stride, inp0, inp1);
5477    src += (2 * src_stride);
5478    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5479                                         const20, const6, const3);
5480    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5481    horiz4 = __msa_aver_u_b(inp0, res0);
5482    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5483    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5484                                        horiz1, horiz2, horiz3, horiz4,
5485                                        horiz1, horiz0, horiz0, horiz1,
5486                                        horiz2, horiz3, horiz4, horiz5,
5487                                        const20, const6, const3);
5488    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5489    res0 = __msa_aver_u_b(avg0, res0);
5490    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5491    res0 = __msa_aver_u_b(avg0, res0);
5492    ST_D2(res0, 0, 1, dst, dst_stride);
5493    dst += (2 * dst_stride);
5494
5495    LD_UB2(dst, dst_stride, dst0, dst1);
5496    LD_UB2(src, src_stride, inp2, inp3);
5497    src += (2 * src_stride);
5498    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5499                                         const20, const6, const3);
5500    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5501    horiz6 = __msa_aver_u_b(inp2, res1);
5502    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5503    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5504                                        horiz3, horiz4, horiz5, horiz6,
5505                                        horiz3, horiz2, horiz1, horiz0,
5506                                        horiz4, horiz5, horiz6, horiz7,
5507                                        const20, const6, const3);
5508    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5509    res1 = __msa_aver_u_b(avg1, res1);
5510    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5511    res1 = __msa_aver_u_b(avg1, res1);
5512    ST_D2(res1, 0, 1, dst, dst_stride);
5513    dst += (2 * dst_stride);
5514
5515    inp0 = LD_UB(src);
5516    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5517                                              const20, const6, const3);
5518    horiz8 = __msa_aver_u_b(inp0, res0);
5519    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5520                                        horiz5, horiz6, horiz7, horiz8,
5521                                        horiz5, horiz4, horiz3, horiz2,
5522                                        horiz6, horiz7, horiz8, horiz8,
5523                                        const20, const6, const3);
5524    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5525                                        horiz7, horiz8, horiz8, horiz7,
5526                                        horiz7, horiz6, horiz5, horiz4,
5527                                        horiz8, horiz8, horiz7, horiz6,
5528                                        const20, const6, const3);
5529    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5530    res0 = __msa_aver_u_b(avg0, res0);
5531    LD_UB2(dst, dst_stride, dst0, dst1);
5532    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5533    res0 = __msa_aver_u_b(avg0, res0);
5534    ST_D2(res0, 0, 1, dst, dst_stride);
5535    dst += (2 * dst_stride);
5536
5537    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5538    res1 = __msa_aver_u_b(avg1, res1);
5539    LD_UB2(dst, dst_stride, dst0, dst1);
5540    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5541    res1 = __msa_aver_u_b(avg1, res1);
5542    ST_D2(res1, 0, 1, dst, dst_stride);
5543}
5544
/*
 * 16x16 entry point built from two helper calls:
 *   1. hv_mc_qpel_aver_horiz_16x16_msa() processes src into a local scratch
 *      buffer (stride 16, height argument 16);
 *   2. vert_mc_qpel_avg_dst_aver_src1_16x16_msa() consumes that buffer and
 *      writes to dst using dst_stride.
 */
static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* 272 bytes = 17 * 16; presumably 17 rows at the 16-byte stride used
     * below (the helper that fills it is not visible here). */
    uint8_t horiz_out[17 * 16];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5555
/*
 * 8x8 block: every source row goes through the horizontal qpel filter macro,
 * the vertical qpel filter macro is applied across those rows, and each
 * output row pair is then averaged twice - first with the horizontally
 * filtered rows one line further down (horiz1..horiz8), then with the pixels
 * already present in dst.
 *
 * src, src_stride: source rows; nine rows are read (four LD_UB2 pairs plus a
 *                  final LD_UB).
 * dst, dst_stride: destination; each row pair is loaded for the final average
 *                  and then overwritten through ST_D2 (four pairs in total).
 */
static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Shuffle patterns and splatted 20/6/3 byte constants handed to the
     * filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0-1: one macro call covers both rows; splati_d(.., 1)
     * replicates the upper 64-bit lane of the result into horiz1 (presumably
     * the second row's filtered pixels - confirm against the macro). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Source rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* dst rows 0-1 are fetched here for the final average further down. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    /* Source rows 4-5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for output rows 0-1. horiz0/horiz1 occur more than once
     * in the operand list (presumably edge handling at the top of the
     * block). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    /* Pack the low 64-bit lanes of horiz1 and horiz2 into one vector and
     * average it with the vertical result. */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_aver_u_b(avg0, res0);
    /* Same packing for the two dst rows, second average, then store. */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2-3: needs source rows 6-7 filtered first. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    /* Averaging partners for this pair are horiz3 and horiz4. */
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Ninth source row, filtered with the single-row macro variant. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    /* Both remaining row pairs are filtered vertically before either is
     * stored. horiz8/horiz7/horiz6 repeat in the operand lists (presumably
     * edge handling at the bottom of the block). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
                                        horiz6, horiz7, horiz8, horiz5, horiz4,
                                        horiz3, horiz2, horiz6, horiz7, horiz8,
                                        horiz8, const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
                                        horiz8, horiz8, horiz7, horiz7, horiz6,
                                        horiz5, horiz4, horiz8, horiz8, horiz7,
                                        horiz6, const20, const6, const3);
    /* Output rows 4-5: partners horiz5/horiz6, then dst. */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_aver_u_b(avg0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7: partners horiz7/horiz8, then dst. */
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_aver_u_b(avg1, res1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5651
/*
 * 16x16 two-stage path: hv_mc_qpel_aver_horiz_src1_16x16_msa() is run on the
 * source block and writes into a local 272-byte scratch array (stride 16,
 * trailing argument 16); vert_mc_qpel_avg_dst_aver_src1_16x16_msa() then
 * takes that scratch array with stride 16 as input and dst/dst_stride as
 * output.
 * NOTE(review): 272 == 17 * 16, so the scratch presumably holds one row more
 * than the 16 output rows - confirm against the helpers.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    const int32_t scratch_stride = 16;
    uint8_t scratch[272];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride,
                                         scratch, scratch_stride, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(scratch, scratch_stride,
                                             dst, dst_stride);
}
5662
/*
 * 8x8 block with averaging in both stages.
 *
 * Horizontal stage: each source row is run through the horizontal qpel
 * filter macro and the result is averaged with the same source row slid by
 * one byte, giving horiz0..horiz8.
 * Vertical stage: the vertical qpel filter macro runs across horiz0..horiz8;
 * each output row pair is averaged with the horizN rows one line further down
 * (horiz1..horiz8) and finally with the pixels already present in dst.
 *
 * src, src_stride: source rows; nine rows are read (four LD_UB2 pairs plus a
 *                  final LD_UB).
 * dst, dst_stride: destination; each row pair is loaded for the final average
 *                  and then overwritten through ST_D2 (four pairs in total).
 */
static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Shuffle patterns and splatted 20/6/3 byte constants handed to the
     * filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0-1: filter first, because inp0/inp1 are overwritten by the
     * one-byte slide below. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    /* Slide both rows by one byte in place (TODO confirm direction against
     * the SLDI_B2_UB definition). */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Pack the low 64-bit lanes of the two slid rows and average with the
     * filter output; splati_d(.., 1) replicates the upper lane into horiz1. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Source rows 2-3, same sequence. */
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Source rows 4-5, same sequence. */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0-1: averaging partners are horiz1/horiz2, then dst.
     * horiz0/horiz1 occur more than once in the vertical operand list
     * (presumably edge handling at the top of the block). */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
                                        horiz2, horiz3, horiz4, horiz1, horiz0,
                                        horiz0, horiz1, horiz2, horiz3, horiz4,
                                        horiz5, const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source rows 6-7 through the horizontal stage. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Output rows 2-3: partners horiz3/horiz4, then dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
                                        horiz4, horiz5, horiz6, horiz3, horiz2,
                                        horiz1, horiz0, horiz4, horiz5, horiz6,
                                        horiz7, const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Ninth source row: single-row filter variant, then average with the row
     * slid by one byte. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Output rows 4-5: partners horiz5/horiz6, then dst. horiz8 is repeated
     * in the operand list (presumably bottom-edge handling). */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
                                        horiz6, horiz7, horiz8, horiz5, horiz4,
                                        horiz3, horiz2, horiz6, horiz7, horiz8,
                                        horiz8, const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7: partners horiz7/horiz8, then dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
                                        horiz8, horiz8, horiz7, horiz7, horiz6,
                                        horiz5, horiz4, horiz8, horiz8, horiz7,
                                        horiz6, const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5769
/*
 * Copy eight rows of eight bytes from src to dst.
 * Rows are moved in pairs: both 64-bit loads of a pair are issued before
 * either store, and four pairs are processed.
 */
static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride)
{
    int32_t pairs_left = 4;

    while (pairs_left-- > 0) {
        uint64_t first_row, second_row;

        /* fetch two consecutive source rows */
        first_row = LD(src);
        src += src_stride;
        second_row = LD(src);
        src += src_stride;

        /* write them to two consecutive destination rows */
        SD(first_row, dst);
        dst += dst_stride;
        SD(second_row, dst);
        dst += dst_stride;
    }
}
5788
5789static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
5790                           uint8_t *dst, int32_t dst_stride)
5791{
5792    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5793    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
5794
5795    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5796    src += (8 * src_stride);
5797    LD_UB8(src, src_stride,
5798           src8, src9, src10, src11, src12, src13, src14, src15);
5799
5800    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
5801    dst += (8 * dst_stride);
5802    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
5803           dst, dst_stride);
5804}
5805
5806static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
5807                           uint8_t *dst, int32_t dst_stride,
5808                           int32_t height)
5809{
5810    int32_t cnt;
5811    uint64_t out0, out1, out2, out3;
5812    v16u8 src0, src1, src2, src3;
5813    v16u8 dst0, dst1, dst2, dst3;
5814
5815    for (cnt = (height / 4); cnt--;) {
5816        LD_UB4(src, src_stride, src0, src1, src2, src3);
5817        src += (4 * src_stride);
5818        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
5819
5820        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5821                    dst0, dst1, dst2, dst3);
5822
5823        out0 = __msa_copy_u_d((v2i64) dst0, 0);
5824        out1 = __msa_copy_u_d((v2i64) dst1, 0);
5825        out2 = __msa_copy_u_d((v2i64) dst2, 0);
5826        out3 = __msa_copy_u_d((v2i64) dst3, 0);
5827        SD4(out0, out1, out2, out3, dst, dst_stride);
5828        dst += (4 * dst_stride);
5829    }
5830}
5831
5832static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
5833                            uint8_t *dst, int32_t dst_stride,
5834                            int32_t height)
5835{
5836    int32_t cnt;
5837    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5838    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5839
5840    for (cnt = (height / 8); cnt--;) {
5841        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5842        src += (8 * src_stride);
5843        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
5844
5845        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5846                    dst0, dst1, dst2, dst3);
5847        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
5848                    dst4, dst5, dst6, dst7);
5849        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
5850        dst += (8 * dst_stride);
5851    }
5852}
5853
/*
 * Exported wrapper around the file-local copy_16x16_msa(); the single
 * `stride` is passed as both src_stride and dst_stride.
 */
void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_16x16_msa(src, stride, dest, stride);
}
5858
/*
 * Exported wrapper around the file-local copy_8x8_msa(); the single
 * `stride` is passed as both src_stride and dst_stride.
 */
void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_8x8_msa(src, stride, dest, stride);
}
5863
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
}
5870
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
}
5877
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
                                 ptrdiff_t stride)
{
    horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
}
5883
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
}
5889
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
}
5896
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5903
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
}
5910
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
}
5917
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
                                        const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
}
5923
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
}
5929
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
}
5936
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5943
/*
 * Exported wrapper around the file-local avg_width8_msa(); the single
 * `stride` is passed as both src_stride and dst_stride, with height 8.
 */
void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width8_msa(src, stride, dest, stride, 8);
}
5948
/*
 * Exported wrapper around the file-local avg_width16_msa(); the single
 * `stride` is passed as both src_stride and dst_stride, with height 16.
 */
void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width16_msa(src, stride, dest, stride, 16);
}
5953
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
}
5960
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
}
5967
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
}
5973
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
                                          const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
}
5979
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8).
 */
void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
}
5986
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16).
 */
void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5993
5994
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
}
6000
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
}
6006
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
                             ptrdiff_t stride)
{
    vert_mc_qpel_8x8_msa(src, stride, dest, stride);
}
6012
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
                               ptrdiff_t stride)
{
    vert_mc_qpel_16x16_msa(src, stride, dest, stride);
}
6018
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
}
6024
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
}
6030
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
}
6037
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
}
6044
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
                                    const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
}
6050
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
                                      const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
}
6056
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
}
6063
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
}
6070
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
}
6077
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
}
6084
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
                                     const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
}
6090
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
}
6096
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
}
6103
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride).
 */
void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
}
6110
6111/* HV cases */
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}
6118
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}
6124
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
}
6130
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
}
6136
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}
6143
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}
6149
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
}
6155
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride);
}
6161
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
                             ptrdiff_t stride)
{
    hv_mc_qpel_16x16_msa(src, stride, dest, stride);
}
6167
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
                           ptrdiff_t stride)
{
    hv_mc_qpel_8x8_msa(src, stride, dest, stride);
}
6173
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride);
}
6179
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride);
}
6185
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}
6192
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}
6198
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride);
}
6204
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride);
}
6210
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}
6217
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}
6223
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}
6230
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}
6237
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride);
}
6244
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
}
6251
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}
6258
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}
6265
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
}
6272
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
}
6279
/*
 * Exported entry point taking (dest, src, stride); hands off to
 * hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride).
 */
void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
                                     const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
}
6285
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
                                   const uint8_t *src, ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_8x8_msa(src, src_stride, dest, dst_stride);
}
6291
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, src_stride,
                                             dest, dst_stride);
}
6298
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, src_stride,
                                           dest, dst_stride);
}
6305
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, src_stride,
                                               dest, dst_stride);
}
6312
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, src_stride,
                                             dest, dst_stride);
}
6319
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, src_stride,
                                             dest, dst_stride);
}
6326
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, src_stride,
                                           dest, dst_stride);
}
6333
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, src_stride,
                                               dest, dst_stride);
}
6340
/*
 * Exported wrapper around hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, src_stride,
                                             dest, dst_stride);
}
6347
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, src_stride,
                                              dest, dst_stride);
}
6354
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, src_stride,
                                            dest, dst_stride);
}
6361
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, src_stride,
                                            dest, dst_stride);
}
6368
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, src_stride,
                                          dest, dst_stride);
}
6375
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, src_stride,
                                              dest, dst_stride);
}
6382
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, src_stride,
                                            dest, dst_stride);
}
6389
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, src_stride,
                                            dest, dst_stride);
}
6396
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, src_stride,
                                          dest, dst_stride);
}
6403
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
                                    const uint8_t *src, ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_16x16_msa(src, src_stride, dest, dst_stride);
}
6409
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_8x8_msa(src, src_stride, dest, dst_stride);
}
6415
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, src_stride,
                                            dest, dst_stride);
}
6422
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, src_stride,
                                          dest, dst_stride);
}
6429
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, src_stride,
                                              dest, dst_stride);
}
6436
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, src_stride,
                                            dest, dst_stride);
}
6443
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, src_stride,
                                            dest, dst_stride);
}
6450
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, src_stride,
                                          dest, dst_stride);
}
6457
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, src_stride,
                                              dest, dst_stride);
}
6464
/*
 * Exported wrapper around hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa().
 * The (dest, src, stride) arguments are reordered to src-first, and the
 * one stride is supplied twice: once after src and once after dest
 * (presumably the helper's source and destination strides).
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    const ptrdiff_t src_stride = stride;
    const ptrdiff_t dst_stride = stride;

    hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, src_stride,
                                            dest, dst_stride);
}
6471